merge with 5.3

sql/sql_insert.cc: CREATE ... IF NOT EXISTS may do nothing, but it is still not a failure. Don't forget to my_ok it. sql/sql_table.cc: small cleanup
author: Sergei Golubchik <sergii@pisem.net> 2011-10-19 21:45:18 +0200
committer: Sergei Golubchik <sergii@pisem.net> 2011-10-19 21:45:18 +0200
commit: 76f0b94bb0b2994d639353530c5b251d0f1a204b (patch)
tree: 9ed50628aac34f89a37637bab2fc4915b86b5eb4 /sql
parent: 4e46d8e5bff140f2549841167dc4b65a3c0a645d (diff)
parent: 5dc1a2231f55bacc9aaf0e24816f3d9c2ee1f21d (diff)
download: mariadb-git-76f0b94bb0b2994d639353530c5b251d0f1a204b.tar.gz
169 files changed, 35286 insertions, 14853 deletions
diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt
index 73f7eef4a0a..90202d8aa7b 100644
--- a/sql/CMakeLists.txt
+++ b/sql/CMakeLists.txt
@@ -36,7 +36,6 @@ IF(SSL_DEFINES)
  ADD_DEFINITIONS(${SSL_DEFINES})
 ENDIF()
 
-
 SET (SQL_SOURCE
               ../sql-common/client.c derror.cc des_key_file.cc
                discover.cc ../libmysql/errmsg.c field.cc  field_conv.cc 
@@ -51,10 +50,6 @@ SET (SQL_SOURCE
                message.h mf_iocache.cc my_decimal.cc ../sql-common/my_time.c
                mysqld.cc net_serv.cc  keycaches.cc
                ../sql-common/client_plugin.c
-               create_options.cc
-               multi_range_read.cc opt_index_cond_pushdown.cc
-               opt_subselect.cc opt_table_elimination.cc
-               sql_expression_cache.cc sql_join_cache.cc
                opt_range.cc opt_range.h opt_sum.cc 
                ../sql-common/pack.c parse_file.cc password.c procedure.cc 
                protocol.cc records.cc repl_failsafe.cc rpl_filter.cc set_var.cc 
@@ -66,7 +61,7 @@ SET (SQL_SOURCE
                sql_list.cc sql_load.cc sql_manager.cc sql_parse.cc
                sql_partition.cc sql_plugin.cc sql_prepare.cc sql_rename.cc 
                debug_sync.cc debug_sync.h
-               sql_repl.cc sql_select.cc sql_show.cc sql_state.c sql_string.cc 
+               sql_repl.cc sql_select.cc sql_show.cc sql_state.c sql_string.cc
                sql_table.cc sql_test.cc sql_trigger.cc sql_udf.cc sql_union.cc
                sql_update.cc sql_view.cc strfunc.cc table.cc thr_malloc.cc 
                sql_time.cc tztime.cc uniques.cc unireg.cc item_xmlfunc.cc 
@@ -80,6 +75,13 @@ SET (SQL_SOURCE
                sql_signal.cc rpl_handler.cc mdl.cc sql_admin.cc
                transaction.cc sys_vars.cc sql_truncate.cc datadict.cc
                sql_reload.cc
+
+               # added in MariaDB:
+               sql_lifo_buffer.h sql_join_cache.h sql_join_cache.cc
+               create_options.cc multi_range_read.cc
+               opt_index_cond_pushdown.cc opt_subselect.cc
+               opt_table_elimination.cc sql_expression_cache.cc
+
                ${GEN_SOURCES}
                ${MYSYS_LIBWRAP_SOURCE})
 
diff --git a/sql/create_options.cc b/sql/create_options.cc
index e80b213cc81..8e46bb583a5 100644
--- a/sql/create_options.cc
+++ b/sql/create_options.cc
@@ -186,7 +186,8 @@ static bool set_one_value(ha_create_table_option *opt,
           *val= num;
           DBUG_RETURN(0);
         }
-        if (*end) *end++;
+        if (*end)
+          end++;
         start= end;
         num++;
       }
@@ -256,7 +257,7 @@ static const size_t ha_option_type_sizeof[]=
   @retval FALSE OK
 */
 
-my_bool parse_option_list(THD* thd, void **option_struct,
+my_bool parse_option_list(THD* thd, void *option_struct_arg,
                           engine_option_value *option_list,
                           ha_create_table_option *rules,
                           my_bool suppress_warning,
@@ -265,6 +266,7 @@ my_bool parse_option_list(THD* thd, void **option_struct,
   ha_create_table_option *opt;
   size_t option_struct_size= 0;
   engine_option_value *val= option_list;
+  void **option_struct= (void**)option_struct_arg;
   DBUG_ENTER("parse_option_list");
   DBUG_PRINT("enter",
              ("struct: 0x%lx list: 0x%lx rules: 0x%lx suppres %u root 0x%lx",
diff --git a/sql/create_options.h b/sql/create_options.h
index 1bd6ecd81e6..ae918f6cea1 100644
--- a/sql/create_options.h
+++ b/sql/create_options.h
@@ -71,7 +71,7 @@ class Create_field;
 
 my_bool parse_engine_table_options(THD *thd, handlerton *ht,
                                    TABLE_SHARE *share);
-my_bool parse_option_list(THD* thd, void **option_struct,
+my_bool parse_option_list(THD* thd, void *option_struct,
                           engine_option_value *option_list,
                           ha_create_table_option *rules,
                           my_bool suppress_warning,
diff --git a/sql/debug_sync.cc b/sql/debug_sync.cc
index 7f69ae54037..7e5db3b499d 100644
--- a/sql/debug_sync.cc
+++ b/sql/debug_sync.cc
@@ -1691,7 +1691,8 @@ static void debug_sync_execute(THD *thd, st_debug_sync_action *action)
 
   if (action->execute)
   {
-    const char *UNINIT_VAR(old_proc_info);
+    const char  *old_proc_info;
+    LINT_INIT(old_proc_info);
 
     action->execute--;
 
@@ -1927,4 +1928,7 @@ bool debug_sync_set_action(THD *thd, const char *action_str, size_t len)
 }
 
 
+#else /* defined(ENABLED_DEBUG_SYNC) */
+/* prevent linker/lib warning about file without public symbols */
+int debug_sync_dummy; 
 #endif /* defined(ENABLED_DEBUG_SYNC) */
diff --git a/sql/derror.cc b/sql/derror.cc
index 2b4cc13073e..23319ac0c99 100644
--- a/sql/derror.cc
+++ b/sql/derror.cc
@@ -1,4 +1,5 @@
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2011, Oracle and/or its affiliates.
+   Copyright (C) 2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -28,6 +29,7 @@
 #include "derror.h"                             // read_texts
 #include "sql_class.h"                          // THD
 
+static bool check_error_mesg(const char *file_name, const char **errmsg);
 static void init_myfunc_errs(void);
 
 
@@ -44,9 +46,12 @@ C_MODE_END
   Read messages from errorfile.
 
   This function can be called multiple times to reload the messages.
-  If it fails to load the messages, it will fail softly by initializing
-  the errmesg pointer to an array of empty strings or by keeping the
-  old array if it exists.
+
+  If it fails to load the messages:
+   - If we already have error messages loaded, keep the old ones and
+     return FALSE (ok).
+   - Otherwise, initialize the errmesg pointer to an array of empty
+     strings and return TRUE (error).
 
   @retval
     FALSE       OK
@@ -56,26 +61,45 @@ C_MODE_END
 
 bool init_errmessage(void)
 {
-  const char **errmsgs, **ptr;
+  const char **errmsgs, **ptr, **org_errmsgs;
+  bool error= FALSE;
   DBUG_ENTER("init_errmessage");
 
   /*
     Get a pointer to the old error messages pointer array.
     read_texts() tries to free it.
   */
-  errmsgs= my_error_unregister(ER_ERROR_FIRST, ER_ERROR_LAST);
+  org_errmsgs= my_error_unregister(ER_ERROR_FIRST, ER_ERROR_LAST);
 
   /* Read messages from file. */
   if (read_texts(ERRMSG_FILE, my_default_lc_messages->errmsgs->language,
                  &errmsgs, ER_ERROR_LAST - ER_ERROR_FIRST + 1) &&
       !errmsgs)
   {
-    if (!(errmsgs= (const char**) my_malloc((ER_ERROR_LAST-ER_ERROR_FIRST+1)*
-                                            sizeof(char*), MYF(0))))
-      DBUG_RETURN(TRUE);
-    for (ptr= errmsgs; ptr < errmsgs + ER_ERROR_LAST - ER_ERROR_FIRST; ptr++)
-	  *ptr= "";
+    free(errmsgs);
+    
+    if (org_errmsgs)
+    {
+      /* Use old error messages */
+      errmsgs= org_errmsgs;
+    }
+    else
+    {
+      /*
+        No error messages.  Create a temporary empty error message so
+        that we don't get a crash if some code wrongly tries to access
+        a non existing error message.
+      */
+      if (!(errmsgs= (const char**) my_malloc((ER_ERROR_LAST-ER_ERROR_FIRST+1)*
+                                              sizeof(char*), MYF(0))))
+        DBUG_RETURN(TRUE);
+      for (ptr= errmsgs; ptr <= errmsgs + ER_ERROR_LAST - ER_ERROR_FIRST; ptr++)
+        *ptr= "";                      /* fill all LAST-FIRST+1 slots, incl. last */
+      error= TRUE;
+    }
   }
+  else
+    free(org_errmsgs);                        // Free old language
 
   /* Register messages for use with my_error(). */
   if (my_error_register(get_server_errmsgs, ER_ERROR_FIRST, ER_ERROR_LAST))
@@ -86,7 +110,29 @@ bool init_errmessage(void)
 
   DEFAULT_ERRMSGS= errmsgs;             /* Init global variable */
   init_myfunc_errs();			/* Init myfunc messages */
-  DBUG_RETURN(FALSE);
+  DBUG_RETURN(error);
+}
+
+
+/**
+   Check the error messages array contains all relevant error messages
+*/
+
+static bool check_error_mesg(const char *file_name, const char **errmsg)
+{
+  /*
+    The last MySQL error message can't be an empty string; If it is,
+    it means that the error file doesn't contain all MySQL messages
+    and is probably from an older version of MySQL / MariaDB.
+  */
+  if (errmsg[ER_LAST_MYSQL_ERROR_MESSAGE -1 - ER_ERROR_FIRST][0] == 0)
+  {
+    sql_print_error("Error message file '%s' is probably from and older "
+                    "version of MariaDB / MYSQL as it doesn't contain all "
+                    "error messages", file_name);
+    return 1;
+  }
+  return 0;
 }
 
 
@@ -107,8 +153,11 @@ bool read_texts(const char *file_name, const char *language,
   char lang_path[FN_REFLEN];
   uchar *buff;
   uchar head[32],*pos;
+  const char *errmsg;
   DBUG_ENTER("read_texts");
 
+  *point= 0;
+
   LINT_INIT(buff);
   funktpos=0;
   convert_dirname(lang_path, language, NullS);
@@ -136,6 +185,7 @@ bool read_texts(const char *file_name, const char *language,
   funktpos=1;
   if (mysql_file_read(file, (uchar*) head, 32, MYF(MY_NABP)))
     goto err;
+  funktpos=2;
   if (head[0] != (uchar) 254 || head[1] != (uchar) 254 ||
       head[2] != 2 || head[3] != 1)
     goto err; /* purecov: inspected */
@@ -147,20 +197,16 @@ bool read_texts(const char *file_name, const char *language,
   if (count < error_messages)
   {
     sql_print_error("\
-Error message file '%s' had only %d error messages,\n\
-but it should contain at least %d error messages.\n\
-Check that the above file is the right version for this program!",
+Error message file '%s' had only %d error messages, but it should contain at least %d error messages.\nCheck that the above file is the right version for this program!",
 		    name,count,error_messages);
     (void) mysql_file_close(file, MYF(MY_WME));
     DBUG_RETURN(1);
   }
 
-  /* Free old language */
-  my_free(*point);
   if (!(*point= (const char**)
 	my_malloc((size_t) (length+count*sizeof(char*)),MYF(0))))
   {
-    funktpos=2;					/* purecov: inspected */
+    funktpos=3;					/* purecov: inspected */
     goto err;					/* purecov: inspected */
   }
   buff= (uchar*) (*point + count);
@@ -180,12 +226,25 @@ Check that the above file is the right version for this program!",
     point[i]= *point +uint2korr(head+10+i+i);
   }
   (void) mysql_file_close(file, MYF(0));
-  DBUG_RETURN(0);
+
+  i= check_error_mesg(file_name, *point);
+  DBUG_RETURN(i);
 
 err:
-  sql_print_error((funktpos == 2) ? "Not enough memory for messagefile '%s'" :
-                  ((funktpos == 1) ? "Can't read from messagefile '%s'" :
-                   "Can't find messagefile '%s'"), name);
+  switch (funktpos) {                           /* funktpos records how far we got */
+  case 3:                                       /* my_malloc() of the message array failed */
+    errmsg= "Not enough memory for messagefile '%s'";
+    break;
+  case 2:                                       /* failure after the 32-byte header was read */
+    errmsg= "Incompatible header in messagefile '%s'. Probably from another version of MariaDB";
+    break;                                      /* must not fall through and be overwritten by case 1 */
+  case 1:                                       /* reading the header failed */
+    errmsg= "Can't read from messagefile '%s'";
+    break;
+  default:                                      /* funktpos 0; last label, so no break needed */
+    errmsg= "Can't find messagefile '%s'";
+  }
+  sql_print_error(errmsg, name);
   if (file != FERR)
     (void) mysql_file_close(file, MYF(MY_WME));
   DBUG_RETURN(1);
diff --git a/sql/discover.cc b/sql/discover.cc
index f50f7deed99..b129747503e 100644
--- a/sql/discover.cc
+++ b/sql/discover.cc
@@ -70,7 +70,7 @@ int readfrm(const char *name, uchar **frmdata, size_t *len)
   error= 2;
   if (mysql_file_fstat(file, &state, MYF(0)))
     goto err;
-  read_len= state.st_size;  
+  read_len= (size_t)state.st_size;  
 
   // Read whole frm file
   error= 3;
diff --git a/sql/event_data_objects.cc b/sql/event_data_objects.cc
index 6d00d6fd74a..fc9ab35ac8a 100644
--- a/sql/event_data_objects.cc
+++ b/sql/event_data_objects.cc
@@ -475,7 +475,7 @@ Event_queue_element::load_from_row(THD *thd, TABLE *table)
     DBUG_RETURN(TRUE);
 
   starts_null= table->field[ET_FIELD_STARTS]->is_null();
-  my_bool not_used= FALSE;
+  uint not_used;
   if (!starts_null)
   {
     table->field[ET_FIELD_STARTS]->get_date(&time, TIME_NO_ZERO_DATE);
@@ -656,7 +656,7 @@ add_interval(MYSQL_TIME *ltime, const Time_zone *time_zone,
   if (date_add_interval(ltime, scale, interval))
     return 0;
 
-  my_bool not_used;
+  uint not_used;
   return time_zone->TIME_to_gmt_sec(ltime, &not_used);
 }
 
@@ -936,7 +936,7 @@ Event_queue_element::compute_next_execution_time()
     goto ret;
   }
 
-  time_now= (my_time_t) current_thd->query_start();
+  time_now= current_thd->query_start();
 
   DBUG_PRINT("info",("NOW: [%lu]", (ulong) time_now));
 
@@ -1136,7 +1136,7 @@ err:
 void
 Event_queue_element::mark_last_executed(THD *thd)
 {
-  last_executed= (my_time_t) thd->query_start();
+  last_executed= thd->query_start();
 
   execution_count++;
 }
@@ -1157,7 +1157,7 @@ append_datetime(String *buf, Time_zone *time_zone, my_time_t secs,
   */
   MYSQL_TIME time;
   time_zone->gmt_sec_to_TIME(&time, secs);
-  buf->append(dtime_buff, my_datetime_to_str(&time, dtime_buff));
+  buf->append(dtime_buff, my_datetime_to_str(&time, dtime_buff, 0));
   buf->append(STRING_WITH_LEN("'"));
 }
 
diff --git a/sql/event_db_repository.cc b/sql/event_db_repository.cc
index 83a9fdd083d..53f9727f489 100644
--- a/sql/event_db_repository.cc
+++ b/sql/event_db_repository.cc
@@ -221,7 +221,7 @@ mysql_event_fill_row(THD *thd,
       Safety: this can only happen if someone started the server
       and then altered mysql.event.
     */
-    my_error(ER_COL_COUNT_DOESNT_MATCH_CORRUPTED, MYF(0), table->alias,
+    my_error(ER_COL_COUNT_DOESNT_MATCH_CORRUPTED, MYF(0), table->alias.c_ptr(),
              (int) ET_FIELD_COUNT, table->s->fields);
     DBUG_RETURN(TRUE);
   }
@@ -248,6 +248,9 @@ mysql_event_fill_row(THD *thd,
     rs|= fields[ET_FIELD_STATUS]->store((longlong)et->status, TRUE);
   rs|= fields[ET_FIELD_ORIGINATOR]->store((longlong)et->originator, TRUE);
 
+  if (!is_update)
+    rs|= fields[ET_FIELD_CREATED]->set_time();
+
   /*
     Change the SQL_MODE only if body was present in an ALTER EVENT and of course
     always during CREATE EVENT.
@@ -294,7 +297,7 @@ mysql_event_fill_row(THD *thd,
       my_tz_OFFSET0->gmt_sec_to_TIME(&time, et->starts);
 
       fields[ET_FIELD_STARTS]->set_notnull();
-      fields[ET_FIELD_STARTS]->store_time(&time, MYSQL_TIMESTAMP_DATETIME);
+      fields[ET_FIELD_STARTS]->store_time(&time);
     }
 
     if (!et->ends_null)
@@ -303,7 +306,7 @@ mysql_event_fill_row(THD *thd,
       my_tz_OFFSET0->gmt_sec_to_TIME(&time, et->ends);
 
       fields[ET_FIELD_ENDS]->set_notnull();
-      fields[ET_FIELD_ENDS]->store_time(&time, MYSQL_TIMESTAMP_DATETIME);
+      fields[ET_FIELD_ENDS]->store_time(&time);
     }
   }
   else if (et->execute_at)
@@ -322,8 +325,7 @@ mysql_event_fill_row(THD *thd,
     my_tz_OFFSET0->gmt_sec_to_TIME(&time, et->execute_at);
 
     fields[ET_FIELD_EXECUTE_AT]->set_notnull();
-    fields[ET_FIELD_EXECUTE_AT]->
-                        store_time(&time, MYSQL_TIMESTAMP_DATETIME);
+    fields[ET_FIELD_EXECUTE_AT]->store_time(&time);
   }
   else
   {
@@ -334,7 +336,7 @@ mysql_event_fill_row(THD *thd,
     */
   }
 
-  ((Field_timestamp *)fields[ET_FIELD_MODIFIED])->set_time();
+  rs|= fields[ET_FIELD_MODIFIED]->set_time();
 
   if (et->comment.str)
   {
@@ -717,8 +719,6 @@ Event_db_repository::create_event(THD *thd, Event_parse_data *parse_data,
     goto end;
   }
 
-  ((Field_timestamp *)table->field[ET_FIELD_CREATED])->set_time();
-
   /*
     mysql_event_fill_row() calls my_error() in case of error so no need to
     handle it here
@@ -951,7 +951,11 @@ Event_db_repository::find_named_event(LEX_STRING db, LEX_STRING name,
     same fields.
   */
   if (db.length > table->field[ET_FIELD_DB]->field_length ||
-      name.length > table->field[ET_FIELD_NAME]->field_length)
+      name.length > table->field[ET_FIELD_NAME]->field_length ||
+      table->s->keys == 0 ||
+      table->key_info[0].key_parts != 2 ||
+      table->key_info[0].key_part[0].fieldnr != ET_FIELD_DB+1 ||
+      table->key_info[0].key_part[1].fieldnr != ET_FIELD_NAME+1)
     DBUG_RETURN(TRUE);
 
   table->field[ET_FIELD_DB]->store(db.str, db.length, &my_charset_bin);
@@ -1135,7 +1139,7 @@ update_timing_fields_for_event(THD *thd,
 
   my_tz_OFFSET0->gmt_sec_to_TIME(&time, last_executed);
   fields[ET_FIELD_LAST_EXECUTED]->set_notnull();
-  fields[ET_FIELD_LAST_EXECUTED]->store_time(&time, MYSQL_TIMESTAMP_DATETIME);
+  fields[ET_FIELD_LAST_EXECUTED]->store_time(&time);
 
   fields[ET_FIELD_STATUS]->set_notnull();
   fields[ET_FIELD_STATUS]->store(status, TRUE);
diff --git a/sql/event_parse_data.cc b/sql/event_parse_data.cc
index d8c9847d0a0..8bffe47f30f 100644
--- a/sql/event_parse_data.cc
+++ b/sql/event_parse_data.cc
@@ -112,7 +112,7 @@ Event_parse_data::init_name(THD *thd, sp_name *spn)
 void
 Event_parse_data::check_if_in_the_past(THD *thd, my_time_t ltime_utc)
 {
-  if (ltime_utc >= (my_time_t) thd->query_start())
+  if (ltime_utc >= thd->query_start())
     return;
 
   /*
@@ -200,7 +200,7 @@ Event_parse_data::check_dates(THD *thd, int previous_on_completion)
 int
 Event_parse_data::init_execute_at(THD *thd)
 {
-  my_bool not_used;
+  uint not_used;
   MYSQL_TIME ltime;
   my_time_t ltime_utc;
 
@@ -217,7 +217,7 @@ Event_parse_data::init_execute_at(THD *thd)
                       (starts_null && ends_null)));
   DBUG_ASSERT(starts_null && ends_null);
 
-  if ((not_used= item_execute_at->get_date(&ltime, TIME_NO_ZERO_DATE)))
+  if (item_execute_at->get_date(&ltime, TIME_NO_ZERO_DATE))
     goto wrong_value;
 
   ltime_utc= TIME_to_timestamp(thd,&ltime,&not_used);
@@ -255,7 +255,6 @@ wrong_value:
 int
 Event_parse_data::init_interval(THD *thd)
 {
-  String value;
   INTERVAL interval_tmp;
 
   DBUG_ENTER("Event_parse_data::init_interval");
@@ -277,8 +276,7 @@ Event_parse_data::init_interval(THD *thd)
   if (item_expression->fix_fields(thd, &item_expression))
     goto wrong_value;
 
-  value.alloc(MAX_DATETIME_FULL_WIDTH*MY_CHARSET_BIN_MB_MAXLEN);
-  if (get_interval_value(item_expression, interval, &value, &interval_tmp))
+  if (get_interval_value(item_expression, interval, &interval_tmp))
     goto wrong_value;
 
   expression= 0;
@@ -370,7 +368,7 @@ wrong_value:
 int
 Event_parse_data::init_starts(THD *thd)
 {
-  my_bool not_used;
+  uint not_used;
   MYSQL_TIME ltime;
   my_time_t ltime_utc;
 
@@ -381,7 +379,7 @@ Event_parse_data::init_starts(THD *thd)
   if (item_starts->fix_fields(thd, &item_starts))
     goto wrong_value;
 
-  if ((not_used= item_starts->get_date(&ltime, TIME_NO_ZERO_DATE)))
+  if (item_starts->get_date(&ltime, TIME_NO_ZERO_DATE))
     goto wrong_value;
 
   ltime_utc= TIME_to_timestamp(thd, &ltime, &not_used);
@@ -424,7 +422,7 @@ wrong_value:
 int
 Event_parse_data::init_ends(THD *thd)
 {
-  my_bool not_used;
+  uint not_used;
   MYSQL_TIME ltime;
   my_time_t ltime_utc;
 
@@ -436,7 +434,7 @@ Event_parse_data::init_ends(THD *thd)
     goto error_bad_params;
 
   DBUG_PRINT("info", ("convert to TIME"));
-  if ((not_used= item_ends->get_date(&ltime, TIME_NO_ZERO_DATE)))
+  if (item_ends->get_date(&ltime, TIME_NO_ZERO_DATE))
     goto error_bad_params;
 
   ltime_utc= TIME_to_timestamp(thd, &ltime, &not_used);
diff --git a/sql/event_queue.cc b/sql/event_queue.cc
index 781f7fc4c4a..c92c3a835ba 100644
--- a/sql/event_queue.cc
+++ b/sql/event_queue.cc
@@ -531,9 +531,10 @@ Event_queue::empty_queue()
 */
 
 void
-Event_queue::dbug_dump_queue(time_t now)
+Event_queue::dbug_dump_queue(my_time_t when)
 {
 #ifndef DBUG_OFF
+  my_time_t now= when;
   Event_queue_element *et;
   uint i;
   DBUG_ENTER("Event_queue::dbug_dump_queue");
@@ -623,14 +624,12 @@ Event_queue::get_top_for_execution_if_time(THD *thd,
         Not yet time for top event, wait on condition with
         time or until signaled. Release LOCK_queue while waiting.
       */
-      struct timespec top_time;
-      set_timespec(top_time, next_activation_at - thd->query_start());
+      struct timespec top_time= { next_activation_at, 0 };
 
       /* Release any held audit resources before waiting */
       mysql_audit_release(thd);
 
       cond_wait(thd, &top_time, queue_wait_msg, SCHED_FUNC, __LINE__);
-
       continue;
     }
 
@@ -773,6 +772,7 @@ Event_queue::cond_wait(THD *thd, struct timespec *abstime, const char* msg,
 
   if (!thd->killed)
   {
+    DBUG_PRINT("info", ("pthread_cond_%swait", abstime ? "timed" : ""));
     if (!abstime)
       mysql_cond_wait(&COND_queue_state, &LOCK_event_queue);
     else
diff --git a/sql/event_queue.h b/sql/event_queue.h
index 93af03ba901..5e489ddaa37 100644
--- a/sql/event_queue.h
+++ b/sql/event_queue.h
@@ -107,7 +107,7 @@ private:
 
 
   void
-  dbug_dump_queue(time_t now);
+  dbug_dump_queue(my_time_t now);
 
   /* LOCK_event_queue is the mutex which protects the access to the queue. */
   mysql_mutex_t LOCK_event_queue;
diff --git a/sql/event_scheduler.cc b/sql/event_scheduler.cc
index bf16ddcb05a..ab6f0cdccaa 100644
--- a/sql/event_scheduler.cc
+++ b/sql/event_scheduler.cc
@@ -104,7 +104,7 @@ Event_worker_thread::print_warnings(THD *thd, Event_job_data *et)
                    err->get_message_octet_length(), system_charset_info);
     DBUG_ASSERT(err->get_level() < 3);
     (sql_print_message_handlers[err->get_level()])("%*s", err_msg.length(),
-                                                   err_msg.c_ptr());
+                                                   err_msg.c_ptr_safe());
   }
   DBUG_VOID_RETURN;
 }
@@ -665,7 +665,14 @@ Event_scheduler::stop()
     /* thd could be 0x0, when shutting down */
     sql_print_information("Event Scheduler: "
                           "Waiting for the scheduler thread to reply");
-    COND_STATE_WAIT(thd, NULL, "Waiting scheduler to stop");
+
+    /*
+      Wait only 2 seconds, as there is a small chance the thread missed the
+      above awake() call and we may have to do it again
+    */
+    struct timespec top_time;
+    set_timespec(top_time, 2);
+    COND_STATE_WAIT(thd, &top_time, "Waiting scheduler to stop");
   } while (state == STOPPING);
   DBUG_PRINT("info", ("Scheduler thread has cleaned up. Set state to INIT"));
   sql_print_information("Event Scheduler: Stopped");
diff --git a/sql/events.cc b/sql/events.cc
index 789d4414e75..631092f68e7 100644
--- a/sql/events.cc
+++ b/sql/events.cc
@@ -391,7 +391,7 @@ Events::create_event(THD *thd, Event_parse_data *parse_data,
           If the definer is not set or set to CURRENT_USER, the value of CURRENT_USER
           will be written into the binary log as the definer for the SQL thread.
         */
-        ret= write_bin_log(thd, TRUE, log_query.c_ptr(), log_query.length());
+        ret= write_bin_log(thd, TRUE, log_query.ptr(), log_query.length());
     }
   }
   /* Restore the state of binlog format */
@@ -666,7 +666,7 @@ send_show_create_event(THD *thd, Event_timed *et, Protocol *protocol)
   protocol->store(et->name.str, et->name.length, system_charset_info);
   protocol->store(sql_mode.str, sql_mode.length, system_charset_info);
   protocol->store(tz_name->ptr(), tz_name->length(), system_charset_info);
-  protocol->store(show_str.c_ptr(), show_str.length(),
+  protocol->store(show_str.ptr(), show_str.length(),
                   et->creation_ctx->get_client_cs());
   protocol->store(et->creation_ctx->get_client_cs()->csname,
                   strlen(et->creation_ctx->get_client_cs()->csname),
diff --git a/sql/examples/CMakeLists.txt b/sql/examples/CMakeLists.txt
index abe4de402bf..c4ea4c25679 100644
--- a/sql/examples/CMakeLists.txt
+++ b/sql/examples/CMakeLists.txt
@@ -13,9 +13,6 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 
-SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DSAFE_MUTEX")
-SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -DSAFE_MUTEX")
-
 INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/sql
                     ${CMAKE_SOURCE_DIR}/extra/yassl/include
                     ${CMAKE_SOURCE_DIR}/regex)
diff --git a/sql/field.cc b/sql/field.cc
index 61786557531..bf4cb3e4ff9 100644
--- a/sql/field.cc
+++ b/sql/field.cc
@@ -1,5 +1,5 @@
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
-
+/* Copyright (c) 2000, 2011, Oracle and/or its affiliates.
+   Copyright (c) 2009-2011 Monty Program Ab
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; version 2 of the License.
@@ -52,6 +52,17 @@ template class List<Create_field>;
 template class List_iterator<Create_field>;
 #endif
 
+static const char *zero_timestamp="0000-00-00 00:00:00.000000";
+
+/* number of bytes to store second_part part of the TIMESTAMP(N) */
+static uint sec_part_bytes[MAX_DATETIME_PRECISION+1]= { 0, 1, 1, 2, 2, 3, 3 };
+
+/* number of bytes to store DATETIME(N) */
+static uint datetime_hires_bytes[MAX_DATETIME_PRECISION+1]= { 5, 6, 6, 7, 7, 7, 8 };
+
+/* number of bytes to store TIME(N) */
+static uint time_hires_bytes[MAX_DATETIME_PRECISION+1]= { 3, 4, 4, 5, 5, 5, 6 };
+
 uchar Field_null::null[1]={1};
 const char field_separator=',';
 
@@ -76,7 +87,7 @@ const char field_separator=',';
 #define FIELDTYPE_TEAR_FROM (MYSQL_TYPE_BIT + 1)
 #define FIELDTYPE_TEAR_TO   (MYSQL_TYPE_NEWDECIMAL - 1)
 #define FIELDTYPE_NUM (FIELDTYPE_TEAR_FROM + (255 - FIELDTYPE_TEAR_TO))
-inline int field_type2index (enum_field_types field_type)
+static inline int field_type2index (enum_field_types field_type)
 {
   return (field_type < FIELDTYPE_TEAR_FROM ?
           field_type :
@@ -1431,13 +1442,6 @@ int Field::store(const char *to, uint length, CHARSET_INFO *cs,
    should be overridden. The other functions are just convenience
    functions and hence should not be overridden.
 
-   The value of <code>low_byte_first</code> is dependent on how the
-   packed data is going to be used: for local use, e.g., temporary
-   store on disk or in memory, use the native format since that is
-   faster. For data that is going to be transfered to other machines
-   (e.g., when writing data to the binary log), data should always be
-   stored in little-endian format.
-
    @note The default method for packing fields just copy the raw bytes
    of the record into the destination, but never more than
    <code>max_length</code> characters.
@@ -1455,15 +1459,9 @@ int Field::store(const char *to, uint length, CHARSET_INFO *cs,
    is 1000. This information is sometimes needed to decide how to pack
    the data.
 
-   @param low_byte_first
-   @c TRUE if integers should be stored little-endian, @c FALSE if
-   native format should be used. Note that for little-endian machines,
-   the value of this flag is a moot point since the native format is
-   little-endian.
 */
 uchar *
-Field::pack(uchar *to, const uchar *from, uint max_length,
-            bool low_byte_first __attribute__((unused)))
+Field::pack(uchar *to, const uchar *from, uint max_length)
 {
   uint32 length= pack_length();
   set_if_smaller(length, max_length);
@@ -1494,16 +1492,10 @@ Field::pack(uchar *to, const uchar *from, uint max_length,
    @param   param_data Real type and original pack length of the field
                        data
 
-   @param low_byte_first
-   If this flag is @c true, all composite entities (e.g., lengths)
-   should be unpacked in little-endian format; otherwise, the entities
-   are unpacked in native order.
-
    @return  New pointer into memory based on from + length of the data
 */
 const uchar *
-Field::unpack(uchar* to, const uchar *from, uint param_data,
-              bool low_byte_first __attribute__((unused)))
+Field::unpack(uchar* to, const uchar *from, uint param_data)
 {
   uint length=pack_length();
   int from_type= 0;
@@ -1564,9 +1556,9 @@ void Field::make_field(Send_field *field)
   }
   else
     field->org_table_name= field->db_name= "";
-  if (orig_table && orig_table->alias)
+  if (orig_table && orig_table->alias.ptr())
   {
-    field->table_name= orig_table->alias;
+    field->table_name= orig_table->alias.ptr();
     field->org_col_name= field_name;
   }
   else
@@ -1606,17 +1598,19 @@ longlong Field::convert_decimal2longlong(const my_decimal *val,
       i= 0;
       *err= 1;
     }
-    else if (warn_if_overflow(my_decimal2int(E_DEC_ERROR &
-                                           ~E_DEC_OVERFLOW & ~E_DEC_TRUNCATED,
-                                           val, TRUE, &i)))
+    else if (warn_if_overflow(my_decimal2int((E_DEC_ERROR &
+                                              ~E_DEC_OVERFLOW &
+                                              ~E_DEC_TRUNCATED),
+                                             val, TRUE, &i)))
     {
       i= ~(longlong) 0;
       *err= 1;
     }
   }
-  else if (warn_if_overflow(my_decimal2int(E_DEC_ERROR &
-                                         ~E_DEC_OVERFLOW & ~E_DEC_TRUNCATED,
-                                         val, FALSE, &i)))
+  else if (warn_if_overflow(my_decimal2int((E_DEC_ERROR &
+                                            ~E_DEC_OVERFLOW &
+                                            ~E_DEC_TRUNCATED),
+                                           val, FALSE, &i)))
   {
     i= (val->sign() ? LONGLONG_MIN : LONGLONG_MAX);
     *err= 1;
@@ -1776,16 +1770,6 @@ bool Field::get_date(MYSQL_TIME *ltime,uint fuzzydate)
   return 0;
 }
 
-bool Field::get_time(MYSQL_TIME *ltime)
-{
-  char buff[40];
-  String tmp(buff,sizeof(buff),&my_charset_bin),*res;
-  if (!(res=val_str(&tmp)) ||
-      str_to_time_with_warn(res->charset(), res->ptr(), res->length(), ltime))
-    return 1;
-  return 0;
-}
-
 /**
   This is called when storing a date in a string.
 
@@ -1793,11 +1777,11 @@ bool Field::get_time(MYSQL_TIME *ltime)
     Needs to be changed if/when we want to support different time formats.
 */
 
-int Field::store_time(MYSQL_TIME *ltime, timestamp_type type_arg)
+int Field::store_time_dec(MYSQL_TIME *ltime, uint dec)
 {
   ASSERT_COLUMN_MARKED_FOR_WRITE_OR_COMPUTED;
   char buff[MAX_DATE_STRING_REP_LENGTH];
-  uint length= (uint) my_TIME_to_str(ltime, buff);
+  uint length= (uint) my_TIME_to_str(ltime, buff, dec);
   /* Avoid conversion when field character set is ASCII compatible */
   return store(buff, length, (charset()->state & MY_CS_NONASCII) ?
                               &my_charset_latin1 : charset());
@@ -2766,10 +2750,10 @@ int Field_new_decimal::store_decimal(const my_decimal *decimal_value)
 }
 
 
-int Field_new_decimal::store_time(MYSQL_TIME *ltime, timestamp_type t_type)
+int Field_new_decimal::store_time_dec(MYSQL_TIME *ltime, uint dec)
 {
-    my_decimal decimal_value;
-    return store_value(date2my_decimal(ltime, &decimal_value));
+  my_decimal decimal_value;
+  return store_value(date2my_decimal(ltime, &decimal_value));
 }
 
 
@@ -2920,13 +2904,10 @@ uint Field_new_decimal::is_equal(Create_field *new_field)
    @return  New pointer into memory based on from + length of the data
 */
 const uchar *
-Field_new_decimal::unpack(uchar* to,
-                          const uchar *from,
-                          uint param_data,
-                          bool low_byte_first)
+Field_new_decimal::unpack(uchar* to, const uchar *from, uint param_data)
 {
   if (param_data == 0)
-    return Field::unpack(to, from, param_data, low_byte_first);
+    return Field::unpack(to, from, param_data);
 
   uint from_precision= (param_data & 0xff00) >> 8U;
   uint from_decimal= param_data & 0x00ff;
@@ -2960,6 +2941,15 @@ Field_new_decimal::unpack(uchar* to,
   return from+len;
 }
 
+int Field_num::store_time_dec(MYSQL_TIME *ltime, uint dec) // dec is not read
+{
+  longlong v= TIME_to_ulonglong(ltime); // presumably magnitude only; sign below
+  if (ltime->neg == 0)
+    return store(v, true);              // not negative: pass as unsigned value
+  return store(-v, false);              // negative: negate, pass as signed
+}
+
+
 /****************************************************************************
 ** tiny int
 ****************************************************************************/
@@ -3149,14 +3139,7 @@ int Field_short::store(const char *from,uint len,CHARSET_INFO *cs)
   
   error= get_int(cs, from, len, &rnd, UINT_MAX16, INT_MIN16, INT_MAX16);
   store_tmp= unsigned_flag ? (int) (ulonglong) rnd : (int) rnd;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    int2store(ptr, store_tmp);
-  }
-  else
-#endif
-    shortstore(ptr, (short) store_tmp);
+  int2store(ptr, store_tmp);
   return error;
 }
 
@@ -3201,14 +3184,7 @@ int Field_short::store(double nr)
     else
       res=(int16) (int) nr;
   }
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    int2store(ptr,res);
-  }
-  else
-#endif
-    shortstore(ptr,res);
+  int2store(ptr,res);
   return error;
 }
 
@@ -3256,14 +3232,7 @@ int Field_short::store(longlong nr, bool unsigned_val)
     else
       res=(int16) nr;
   }
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    int2store(ptr,res);
-  }
-  else
-#endif
-    shortstore(ptr,res);
+  int2store(ptr,res);
   return error;
 }
 
@@ -3272,12 +3241,7 @@ double Field_short::val_real(void)
 {
   ASSERT_COLUMN_MARKED_FOR_READ;
   short j;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-    j=sint2korr(ptr);
-  else
-#endif
-    shortget(j,ptr);
+  j=sint2korr(ptr);
   return unsigned_flag ? (double) (unsigned short) j : (double) j;
 }
 
@@ -3285,12 +3249,7 @@ longlong Field_short::val_int(void)
 {
   ASSERT_COLUMN_MARKED_FOR_READ;
   short j;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-    j=sint2korr(ptr);
-  else
-#endif
-    shortget(j,ptr);
+  j=sint2korr(ptr);
   return unsigned_flag ? (longlong) (unsigned short) j : (longlong) j;
 }
 
@@ -3305,12 +3264,7 @@ String *Field_short::val_str(String *val_buffer,
   val_buffer->alloc(mlength);
   char *to=(char*) val_buffer->ptr();
   short j;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-    j=sint2korr(ptr);
-  else
-#endif
-    shortget(j,ptr);
+  j=sint2korr(ptr);
 
   if (unsigned_flag)
     length=(uint) cs->cset->long10_to_str(cs, to, mlength, 10, 
@@ -3334,18 +3288,8 @@ bool Field_short::send_binary(Protocol *protocol)
 int Field_short::cmp(const uchar *a_ptr, const uchar *b_ptr)
 {
   short a,b;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    a=sint2korr(a_ptr);
-    b=sint2korr(b_ptr);
-  }
-  else
-#endif
-  {
-    shortget(a,a_ptr);
-    shortget(b,b_ptr);
-  }
+  a=sint2korr(a_ptr);
+  b=sint2korr(b_ptr);
 
   if (unsigned_flag)
     return ((unsigned short) a < (unsigned short) b) ? -1 :
@@ -3355,24 +3299,11 @@ int Field_short::cmp(const uchar *a_ptr, const uchar *b_ptr)
 
 void Field_short::sort_string(uchar *to,uint length __attribute__((unused)))
 {
-#ifdef WORDS_BIGENDIAN
-  if (!table->s->db_low_byte_first)
-  {
-    if (unsigned_flag)
-      to[0] = ptr[0];
-    else
-      to[0] = (char) (ptr[0] ^ 128);		/* Revers signbit */
-    to[1]   = ptr[1];
-  }
+  if (unsigned_flag)
+    to[0] = ptr[1];
   else
-#endif
-  {
-    if (unsigned_flag)
-      to[0] = ptr[1];
-    else
-      to[0] = (char) (ptr[1] ^ 128);		/* Revers signbit */
-    to[1]   = ptr[0];
-  }
+    to[0] = (char) (ptr[1] ^ 128);              /* Revers signbit */
+  to[1]   = ptr[0];
 }
 
 void Field_short::sql_type(String &res) const
@@ -3588,14 +3519,7 @@ int Field_long::store(const char *from,uint len,CHARSET_INFO *cs)
   
   error= get_int(cs, from, len, &rnd, UINT_MAX32, INT_MIN32, INT_MAX32);
   store_tmp= unsigned_flag ? (long) (ulonglong) rnd : (long) rnd;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    int4store(ptr, store_tmp);
-  }
-  else
-#endif
-    longstore(ptr, store_tmp);
+  int4store(ptr, store_tmp);
   return error;
 }
 
@@ -3640,14 +3564,7 @@ int Field_long::store(double nr)
   if (error)
     set_warning(MYSQL_ERROR::WARN_LEVEL_WARN, ER_WARN_DATA_OUT_OF_RANGE, 1);
 
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    int4store(ptr,res);
-  }
-  else
-#endif
-    longstore(ptr,res);
+  int4store(ptr,res);
   return error;
 }
 
@@ -3693,14 +3610,7 @@ int Field_long::store(longlong nr, bool unsigned_val)
   if (error)
     set_warning(MYSQL_ERROR::WARN_LEVEL_WARN, ER_WARN_DATA_OUT_OF_RANGE, 1);
 
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    int4store(ptr,res);
-  }
-  else
-#endif
-    longstore(ptr,res);
+  int4store(ptr,res);
   return error;
 }
 
@@ -3709,12 +3619,7 @@ double Field_long::val_real(void)
 {
   ASSERT_COLUMN_MARKED_FOR_READ;
   int32 j;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-    j=sint4korr(ptr);
-  else
-#endif
-    longget(j,ptr);
+  j=sint4korr(ptr);
   return unsigned_flag ? (double) (uint32) j : (double) j;
 }
 
@@ -3724,12 +3629,7 @@ longlong Field_long::val_int(void)
   int32 j;
   /* See the comment in Field_long::store(long long) */
   DBUG_ASSERT(table->in_use == current_thd);
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-    j=sint4korr(ptr);
-  else
-#endif
-    longget(j,ptr);
+  j=sint4korr(ptr);
   return unsigned_flag ? (longlong) (uint32) j : (longlong) j;
 }
 
@@ -3743,12 +3643,7 @@ String *Field_long::val_str(String *val_buffer,
   val_buffer->alloc(mlength);
   char *to=(char*) val_buffer->ptr();
   int32 j;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-    j=sint4korr(ptr);
-  else
-#endif
-    longget(j,ptr);
+  j=sint4korr(ptr);
 
   if (unsigned_flag)
     length=cs->cset->long10_to_str(cs,to,mlength, 10,(long) (uint32)j);
@@ -3771,18 +3666,8 @@ bool Field_long::send_binary(Protocol *protocol)
 int Field_long::cmp(const uchar *a_ptr, const uchar *b_ptr)
 {
   int32 a,b;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    a=sint4korr(a_ptr);
-    b=sint4korr(b_ptr);
-  }
-  else
-#endif
-  {
-    longget(a,a_ptr);
-    longget(b,b_ptr);
-  }
+  a=sint4korr(a_ptr);
+  b=sint4korr(b_ptr);
   if (unsigned_flag)
     return ((uint32) a < (uint32) b) ? -1 : ((uint32) a > (uint32) b) ? 1 : 0;
   return (a < b) ? -1 : (a > b) ? 1 : 0;
@@ -3790,28 +3675,13 @@ int Field_long::cmp(const uchar *a_ptr, const uchar *b_ptr)
 
 void Field_long::sort_string(uchar *to,uint length __attribute__((unused)))
 {
-#ifdef WORDS_BIGENDIAN
-  if (!table->s->db_low_byte_first)
-  {
-    if (unsigned_flag)
-      to[0] = ptr[0];
-    else
-      to[0] = (char) (ptr[0] ^ 128);		/* Revers signbit */
-    to[1]   = ptr[1];
-    to[2]   = ptr[2];
-    to[3]   = ptr[3];
-  }
+  if (unsigned_flag)
+    to[0] = ptr[3];
   else
-#endif
-  {
-    if (unsigned_flag)
-      to[0] = ptr[3];
-    else
-      to[0] = (char) (ptr[3] ^ 128);		/* Revers signbit */
-    to[1]   = ptr[2];
-    to[2]   = ptr[1];
-    to[3]   = ptr[0];
-  }
+    to[0] = (char) (ptr[3] ^ 128);              /* Revers signbit */
+  to[1]   = ptr[2];
+  to[2]   = ptr[1];
+  to[3]   = ptr[0];
 }
 
 
@@ -3845,14 +3715,7 @@ int Field_longlong::store(const char *from,uint len,CHARSET_INFO *cs)
     error= 1;
   else
     error= 0;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    int8store(ptr,tmp);
-  }
-  else
-#endif
-    longlongstore(ptr,tmp);
+  int8store(ptr,tmp);
   return error;
 }
 
@@ -3860,51 +3723,15 @@ int Field_longlong::store(const char *from,uint len,CHARSET_INFO *cs)
 int Field_longlong::store(double nr)
 {
   ASSERT_COLUMN_MARKED_FOR_WRITE_OR_COMPUTED;
-  int error= 0;
+  bool error;
   longlong res;
 
-  nr= rint(nr);
-  if (unsigned_flag)
-  {
-    if (nr < 0)
-    {
-      res=0;
-      error= 1;
-    }
-    else if (nr >= (double) ULONGLONG_MAX)
-    {
-      res= ~(longlong) 0;
-      error= 1;
-    }
-    else
-      res=(longlong) double2ulonglong(nr);
-  }
-  else
-  {
-    if (nr <= (double) LONGLONG_MIN)
-    {
-      res= LONGLONG_MIN;
-      error= (nr < (double) LONGLONG_MIN);
-    }
-    else if (nr >= (double) (ulonglong) LONGLONG_MAX)
-    {
-      res= LONGLONG_MAX;
-      error= (nr > (double) LONGLONG_MAX);
-    }
-    else
-      res=(longlong) nr;
-  }
+  res= double_to_longlong(nr, unsigned_flag, &error);
+
   if (error)
     set_warning(MYSQL_ERROR::WARN_LEVEL_WARN, ER_WARN_DATA_OUT_OF_RANGE, 1);
 
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    int8store(ptr,res);
-  }
-  else
-#endif
-    longlongstore(ptr,res);
+  int8store(ptr,res);
   return error;
 }
 
@@ -3928,14 +3755,7 @@ int Field_longlong::store(longlong nr, bool unsigned_val)
     }
   }
 
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    int8store(ptr,nr);
-  }
-  else
-#endif
-    longlongstore(ptr,nr);
+  int8store(ptr,nr);
   return error;
 }
 
@@ -3944,14 +3764,7 @@ double Field_longlong::val_real(void)
 {
   ASSERT_COLUMN_MARKED_FOR_READ;
   longlong j;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    j=sint8korr(ptr);
-  }
-  else
-#endif
-    longlongget(j,ptr);
+  j=sint8korr(ptr);
   /* The following is open coded to avoid a bug in gcc 3.3 */
   if (unsigned_flag)
   {
@@ -3966,12 +3779,7 @@ longlong Field_longlong::val_int(void)
 {
   ASSERT_COLUMN_MARKED_FOR_READ;
   longlong j;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-    j=sint8korr(ptr);
-  else
-#endif
-    longlongget(j,ptr);
+  j=sint8korr(ptr);
   return j;
 }
 
@@ -3985,12 +3793,7 @@ String *Field_longlong::val_str(String *val_buffer,
   val_buffer->alloc(mlength);
   char *to=(char*) val_buffer->ptr();
   longlong j;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-    j=sint8korr(ptr);
-  else
-#endif
-    longlongget(j,ptr);
+  j=sint8korr(ptr);
 
   length=(uint) (cs->cset->longlong10_to_str)(cs,to,mlength,
 					unsigned_flag ? 10 : -10, j);
@@ -4012,18 +3815,8 @@ bool Field_longlong::send_binary(Protocol *protocol)
 int Field_longlong::cmp(const uchar *a_ptr, const uchar *b_ptr)
 {
   longlong a,b;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    a=sint8korr(a_ptr);
-    b=sint8korr(b_ptr);
-  }
-  else
-#endif
-  {
-    longlongget(a,a_ptr);
-    longlongget(b,b_ptr);
-  }
+  a=sint8korr(a_ptr);
+  b=sint8korr(b_ptr);
   if (unsigned_flag)
     return ((ulonglong) a < (ulonglong) b) ? -1 :
     ((ulonglong) a > (ulonglong) b) ? 1 : 0;
@@ -4032,36 +3825,17 @@ int Field_longlong::cmp(const uchar *a_ptr, const uchar *b_ptr)
 
 void Field_longlong::sort_string(uchar *to,uint length __attribute__((unused)))
 {
-#ifdef WORDS_BIGENDIAN
-  if (!table->s->db_low_byte_first)
-  {
-    if (unsigned_flag)
-      to[0] = ptr[0];
-    else
-      to[0] = (char) (ptr[0] ^ 128);		/* Revers signbit */
-    to[1]   = ptr[1];
-    to[2]   = ptr[2];
-    to[3]   = ptr[3];
-    to[4]   = ptr[4];
-    to[5]   = ptr[5];
-    to[6]   = ptr[6];
-    to[7]   = ptr[7];
-  }
+  if (unsigned_flag)
+    to[0] = ptr[7];
   else
-#endif
-  {
-    if (unsigned_flag)
-      to[0] = ptr[7];
-    else
-      to[0] = (char) (ptr[7] ^ 128);		/* Revers signbit */
-    to[1]   = ptr[6];
-    to[2]   = ptr[5];
-    to[3]   = ptr[4];
-    to[4]   = ptr[3];
-    to[5]   = ptr[2];
-    to[6]   = ptr[1];
-    to[7]   = ptr[0];
-  }
+    to[0] = (char) (ptr[7] ^ 128);		/* Revers signbit */
+  to[1]   = ptr[6];
+  to[2]   = ptr[5];
+  to[3]   = ptr[4];
+  to[4]   = ptr[3];
+  to[5]   = ptr[2];
+  to[6]   = ptr[1];
+  to[7]   = ptr[0];
 }
 
 
@@ -4078,43 +3852,6 @@ void Field_longlong::sql_type(String &res) const
   Floating-point numbers
  */
 
-uchar *
-Field_real::pack(uchar *to, const uchar *from,
-                 uint max_length, bool low_byte_first)
-{
-  DBUG_ENTER("Field_real::pack");
-  DBUG_ASSERT(max_length >= pack_length());
-#ifdef WORDS_BIGENDIAN
-  if (low_byte_first != table->s->db_low_byte_first)
-  {
-    const uchar *dptr= from + pack_length();
-    while (dptr-- > from)
-      *to++ = *dptr;
-    DBUG_RETURN(to);
-  }
-  else
-#endif
-    DBUG_RETURN(Field::pack(to, from, max_length, low_byte_first));
-}
-
-const uchar *
-Field_real::unpack(uchar *to, const uchar *from,
-                   uint param_data, bool low_byte_first)
-{
-  DBUG_ENTER("Field_real::unpack");
-#ifdef WORDS_BIGENDIAN
-  if (low_byte_first != table->s->db_low_byte_first)
-  {
-    const uchar *dptr= from + pack_length();
-    while (dptr-- > from)
-      *to++ = *dptr;
-    DBUG_RETURN(from + pack_length());
-  }
-  else
-#endif
-    DBUG_RETURN(Field::unpack(to, from, param_data, low_byte_first));
-}
-
 /****************************************************************************
   single precision float
 ****************************************************************************/
@@ -4139,17 +3876,21 @@ int Field_float::store(const char *from,uint len,CHARSET_INFO *cs)
 int Field_float::store(double nr)
 {
   ASSERT_COLUMN_MARKED_FOR_WRITE_OR_COMPUTED;
-  int error= truncate(&nr, FLT_MAX);
-  float j= (float)nr;
-
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
+  int error= truncate_double(&nr, field_length,
+                             not_fixed ? NOT_FIXED_DEC : dec,
+                             unsigned_flag, FLT_MAX);
+  if (error)
   {
-    float4store(ptr,j);
+    set_warning(MYSQL_ERROR::WARN_LEVEL_WARN, ER_WARN_DATA_OUT_OF_RANGE, 1);
+    if (error < 0)                                // Wrong double value
+    {
+      error= 1;
+      set_null();
+    }
   }
-  else
-#endif
-    memcpy(ptr, &j, sizeof(j));
+  float j= (float)nr;
+
+  float4store(ptr,j);
   return error;
 }
 
@@ -4165,28 +3906,14 @@ double Field_float::val_real(void)
 {
   ASSERT_COLUMN_MARKED_FOR_READ;
   float j;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    float4get(j,ptr);
-  }
-  else
-#endif
-    memcpy(&j, ptr, sizeof(j));
+  float4get(j,ptr);
   return ((double) j);
 }
 
 longlong Field_float::val_int(void)
 {
   float j;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    float4get(j,ptr);
-  }
-  else
-#endif
-    memcpy(&j, ptr, sizeof(j));
+  float4get(j,ptr);
   return (longlong) rint(j);
 }
 
@@ -4197,14 +3924,7 @@ String *Field_float::val_str(String *val_buffer,
   ASSERT_COLUMN_MARKED_FOR_READ;
   DBUG_ASSERT(!zerofill || field_length <= MAX_FIELD_CHARLENGTH);
   float nr;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    float4get(nr,ptr);
-  }
-  else
-#endif
-    memcpy(&nr, ptr, sizeof(nr));
+  float4get(nr,ptr);
 
   uint to_length= 70;
   if (val_buffer->alloc(to_length))
@@ -4238,18 +3958,8 @@ String *Field_float::val_str(String *val_buffer,
 int Field_float::cmp(const uchar *a_ptr, const uchar *b_ptr)
 {
   float a,b;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    float4get(a,a_ptr);
-    float4get(b,b_ptr);
-  }
-  else
-#endif
-  {
-    memcpy(&a, a_ptr, sizeof(float));
-    memcpy(&b, b_ptr, sizeof(float));
-  }
+  float4get(a,a_ptr);
+  float4get(b,b_ptr);
   return (a < b) ? -1 : (a > b) ? 1 : 0;
 }
 
@@ -4258,14 +3968,7 @@ int Field_float::cmp(const uchar *a_ptr, const uchar *b_ptr)
 void Field_float::sort_string(uchar *to,uint length __attribute__((unused)))
 {
   float nr;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    float4get(nr,ptr);
-  }
-  else
-#endif
-    memcpy(&nr, ptr, sizeof(float));
+  float4get(nr,ptr);
 
   uchar *tmp= to;
   if (nr == (float) 0.0)
@@ -4361,16 +4064,20 @@ int Field_double::store(const char *from,uint len,CHARSET_INFO *cs)
 int Field_double::store(double nr)
 {
   ASSERT_COLUMN_MARKED_FOR_WRITE_OR_COMPUTED;
-  int error= truncate(&nr, DBL_MAX);
-
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
+  int error= truncate_double(&nr, field_length,
+                             not_fixed ? NOT_FIXED_DEC : dec,
+                             unsigned_flag, DBL_MAX);
+  if (error)
   {
-    float8store(ptr,nr);
+    set_warning(MYSQL_ERROR::WARN_LEVEL_WARN, ER_WARN_DATA_OUT_OF_RANGE, 1);
+    if (error < 0)                                // Wrong double value
+    {
+      error= 1;
+      set_null();
+    }
   }
-  else
-#endif
-    doublestore(ptr,nr);
+
+  float8store(ptr,nr);
   return error;
 }
 
@@ -4385,28 +4092,31 @@ int Field_double::store(longlong nr, bool unsigned_val)
   If a field has fixed length, truncate the double argument pointed to by 'nr'
   appropriately.
   Also ensure that the argument is within [-max_value; max_value] range.
+
+  return
+    0   ok
+    -1  Illegal double value (NaN); *nr is set to 0
+    1   Value was truncated or clamped to the allowed range
 */
 
-int Field_real::truncate(double *nr, double max_value)
+int truncate_double(double *nr, uint field_length, uint dec,
+                    bool unsigned_flag, double max_value)
 {
-  int error= 1;
+  int error= 0;
   double res= *nr;
   
   if (isnan(res))
   {
-    res= 0;
-    set_null();
-    set_warning(MYSQL_ERROR::WARN_LEVEL_WARN, ER_WARN_DATA_OUT_OF_RANGE, 1);
-    goto end;
+    *nr= 0;
+    return -1;
   }
   else if (unsigned_flag && res < 0)
   {
-    res= 0;
-    set_warning(MYSQL_ERROR::WARN_LEVEL_WARN, ER_WARN_DATA_OUT_OF_RANGE, 1);
-    goto end;
+    *nr= 0;
+    return 1;
   }
 
-  if (!not_fixed)
+  if (dec < NOT_FIXED_DEC)
   {
     uint order= field_length - dec;
     uint step= array_elements(log_10) - 1;
@@ -4426,22 +4136,70 @@ int Field_real::truncate(double *nr, double max_value)
   
   if (res < -max_value)
   {
-   res= -max_value;
-   set_warning(MYSQL_ERROR::WARN_LEVEL_WARN, ER_WARN_DATA_OUT_OF_RANGE, 1);
+    res= -max_value;
+    error= 1;
   }
   else if (res > max_value)
   {
     res= max_value;
-    set_warning(MYSQL_ERROR::WARN_LEVEL_WARN, ER_WARN_DATA_OUT_OF_RANGE, 1);
+    error= 1;
   }
-  else
-    error= 0;
 
-end:
   *nr= res;
   return error;
 }
 
+/*
+  Convert double to longlong / ulonglong after rounding it with rint().
+  If double is outside of range, clamp the return value and set error.
+
+  SYNOPSIS
+  double_to_longlong()
+  nr             Number to convert
+  unsigned_flag  1 if result is unsigned (ulonglong value cast to longlong)
+  error          OUT: cleared on entry, set to 1 in case of overflow.
+*/
+
+longlong double_to_longlong(double nr, bool unsigned_flag, bool *error)
+{
+  longlong res;
+
+  *error= 0;
+
+  nr= rint(nr);                         // range checks use the rounded value
+  if (unsigned_flag)
+  {
+    if (nr < 0)
+    {
+      res= 0;
+      *error= 1;
+    }
+    else if (nr >= (double) ULONGLONG_MAX)
+    {
+      res= ~(longlong) 0;               // all bits set
+      *error= 1;
+    }
+    else
+      res= (longlong) double2ulonglong(nr); // NOTE(review): NaN lands here too
+  }
+  else
+  {
+    if (nr <= (double) LONGLONG_MIN)
+    {
+      res= LONGLONG_MIN;
+      *error= (nr < (double) LONGLONG_MIN);  // equal to the bound: no error
+    }
+    else if (nr >= (double) (ulonglong) LONGLONG_MAX)
+    {
+      res= LONGLONG_MAX;
+      *error= (nr > (double) LONGLONG_MAX);  // equal to the bound: no error
+    }
+    else
+      res= (longlong) nr;               // NOTE(review): no NaN test before cast
+  }
+  return res;
+}
+
 
 int Field_real::store_decimal(const my_decimal *dm)
 {
@@ -4450,18 +4208,17 @@ int Field_real::store_decimal(const my_decimal *dm)
   return store(dbl);
 }
 
+int Field_real::store_time_dec(MYSQL_TIME *ltime, uint dec) // dec is not read
+{
+  return store(TIME_to_double(ltime));  // converted value goes through store()
+}
+
+
 double Field_double::val_real(void)
 {
   ASSERT_COLUMN_MARKED_FOR_READ;
   double j;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    float8get(j,ptr);
-  }
-  else
-#endif
-    doubleget(j,ptr);
+  float8get(j,ptr);
   return j;
 }
 
@@ -4470,33 +4227,13 @@ longlong Field_double::val_int(void)
   ASSERT_COLUMN_MARKED_FOR_READ;
   double j;
   longlong res;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    float8get(j,ptr);
-  }
-  else
-#endif
-    doubleget(j,ptr);
-  /* Check whether we fit into longlong range */
-  if (j <= (double) LONGLONG_MIN)
-  {
-    res= (longlong) LONGLONG_MIN;
-    goto warn;
-  }
-  if (j >= (double) (ulonglong) LONGLONG_MAX)
-  {
-    res= (longlong) LONGLONG_MAX;
-    goto warn;
-  }
-  return (longlong) rint(j);
+  bool error;
+  float8get(j,ptr);
 
-warn:
+  res= double_to_longlong(j, 0, &error);
+  if (error)
   {
-    char buf[DOUBLE_TO_STRING_CONVERSION_BUFFER_SIZE];
-    String tmp(buf, sizeof(buf), &my_charset_latin1), *str;
-    str= val_str(&tmp, 0);
-    ErrConvString err(str);
+    ErrConvDouble err(j);
     push_warning_printf(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
                         ER_TRUNCATED_WRONG_VALUE,
                         ER(ER_TRUNCATED_WRONG_VALUE), "INTEGER",
@@ -4514,20 +4251,22 @@ my_decimal *Field_real::val_decimal(my_decimal *decimal_value)
 }
 
 
+bool Field_real::get_date(MYSQL_TIME *ltime,uint fuzzydate)
+{
+  ASSERT_COLUMN_MARKED_FOR_READ;
+  double nr= val_real();                // current field value as a double
+  return double_to_datetime_with_warn(nr, ltime, fuzzydate, field_name); // fills ltime; result passed through
+}
+
+
 String *Field_double::val_str(String *val_buffer,
 			      String *val_ptr __attribute__((unused)))
 {
   ASSERT_COLUMN_MARKED_FOR_READ;
   DBUG_ASSERT(!zerofill || field_length <= MAX_FIELD_CHARLENGTH);
   double nr;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    float8get(nr,ptr);
-  }
-  else
-#endif
-    doubleget(nr,ptr);
+  float8get(nr,ptr);
+
   uint to_length= DOUBLE_TO_STRING_CONVERSION_BUFFER_SIZE;
   if (val_buffer->alloc(to_length))
   {
@@ -4559,18 +4298,8 @@ bool Field_double::send_binary(Protocol *protocol)
 int Field_double::cmp(const uchar *a_ptr, const uchar *b_ptr)
 {
   double a,b;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    float8get(a,a_ptr);
-    float8get(b,b_ptr);
-  }
-  else
-#endif
-  {
-    doubleget(a, a_ptr);
-    doubleget(b, b_ptr);
-  }
+  float8get(a,a_ptr);
+  float8get(b,b_ptr);
   return (a < b) ? -1 : (a > b) ? 1 : 0;
 }
 
@@ -4582,14 +4311,7 @@ int Field_double::cmp(const uchar *a_ptr, const uchar *b_ptr)
 void Field_double::sort_string(uchar *to,uint length __attribute__((unused)))
 {
   double nr;
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    float8get(nr,ptr);
-  }
-  else
-#endif
-    doubleget(nr,ptr);
+  float8get(nr,ptr);
   change_double_for_sort(nr, to);
 }
 
@@ -4677,12 +4399,12 @@ Field_timestamp::Field_timestamp(uchar *ptr_arg, uint32 len_arg,
 				 const char *field_name_arg,
 				 TABLE_SHARE *share,
 				 CHARSET_INFO *cs)
-  :Field_str(ptr_arg, MAX_DATETIME_WIDTH, null_ptr_arg, null_bit_arg,
+  :Field_str(ptr_arg, len_arg, null_ptr_arg, null_bit_arg,
 	     unireg_check_arg, field_name_arg, cs)
 {
   /* For 4.0 MYD and 4.0 InnoDB compatibility */
-  flags|= ZEROFILL_FLAG | UNSIGNED_FLAG | BINARY_FLAG;
-  if (!share->timestamp_field && unireg_check != NONE)
+  flags|= UNSIGNED_FLAG | BINARY_FLAG;
+  if (unireg_check != NONE && !share->timestamp_field)
   {
     /* This timestamp has auto-update */
     share->timestamp_field= this;
@@ -4693,20 +4415,6 @@ Field_timestamp::Field_timestamp(uchar *ptr_arg, uint32 len_arg,
 }
 
 
-Field_timestamp::Field_timestamp(bool maybe_null_arg,
-                                 const char *field_name_arg,
-                                 CHARSET_INFO *cs)
-  :Field_str((uchar*) 0, MAX_DATETIME_WIDTH,
-             maybe_null_arg ? (uchar*) "": 0, 0,
-	     NONE, field_name_arg, cs)
-{
-  /* For 4.0 MYD and 4.0 InnoDB compatibility */
-  flags|= ZEROFILL_FLAG | UNSIGNED_FLAG | BINARY_FLAG;
-    if (unireg_check != TIMESTAMP_DN_FIELD)
-      flags|= ON_UPDATE_NOW_FLAG;
-}
-
-
 /**
   Get auto-set type for TIMESTAMP field.
 
@@ -4741,175 +4449,150 @@ timestamp_auto_set_type Field_timestamp::get_auto_set_type() const
   }
 }
 
+my_time_t Field_timestamp::get_timestamp(ulong *sec_part) const
+{
+  ASSERT_COLUMN_MARKED_FOR_READ;
+  *sec_part= 0;                         // always reports a zero second part
+  return sint4korr(ptr);                // stored value read from ptr
+}
+
 
-int Field_timestamp::store(const char *from,uint len,CHARSET_INFO *cs)
+int Field_timestamp::store_TIME_with_warning(THD *thd, MYSQL_TIME *l_time,
+                                             const ErrConv *str,
+                                             bool was_cut,
+                                             bool have_smth_to_conv)
 {
   ASSERT_COLUMN_MARKED_FOR_WRITE_OR_COMPUTED;
-  MYSQL_TIME l_time;
-  my_time_t tmp= 0;
-  int error;
-  bool have_smth_to_conv;
-  my_bool in_dst_time_gap;
-  THD *thd= table ? table->in_use : current_thd;
-
-  /* We don't want to store invalid or fuzzy datetime values in TIMESTAMP */
-  have_smth_to_conv= (str_to_datetime(cs, from, len, &l_time,
-                                      (thd->variables.sql_mode &
-                                       MODE_NO_ZERO_DATE) |
-                                      MODE_NO_ZERO_IN_DATE, &error) >
-                      MYSQL_TIMESTAMP_ERROR);
+  uint error = 0;
+  my_time_t timestamp;
 
-  if (error || !have_smth_to_conv)
+  if (was_cut || !have_smth_to_conv)
   {
     error= 1;
     set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN, WARN_DATA_TRUNCATED,
-                         from, len, MYSQL_TIMESTAMP_DATETIME, 1);
+                         str, MYSQL_TIMESTAMP_DATETIME, 1);
   }
-
   /* Only convert a correct date (not a zero date) */
-  if (have_smth_to_conv && l_time.month)
+  if (have_smth_to_conv && l_time->month)
   {
-    if (!(tmp= TIME_to_timestamp(thd, &l_time, &in_dst_time_gap)))
-    {
-      set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN,
-                           ER_WARN_DATA_OUT_OF_RANGE,
-                           from, len, MYSQL_TIMESTAMP_DATETIME, !error);
-      error= 1;
-    }
-    else if (in_dst_time_gap)
+    uint conversion_error;
+    timestamp= TIME_to_timestamp(thd, l_time, &conversion_error);
+    if (timestamp == 0 && l_time->second_part == 0)
+      conversion_error= ER_WARN_DATA_OUT_OF_RANGE;
+    if (conversion_error)
     {
-      set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN,
-                           ER_WARN_INVALID_TIMESTAMP,
-                           from, len, MYSQL_TIMESTAMP_DATETIME, !error);
+      set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN, conversion_error,
+                           str, MYSQL_TIMESTAMP_DATETIME, !error);
       error= 1;
     }
   }
-  store_timestamp(tmp);
+  else
+  {
+    timestamp= 0;
+    l_time->second_part= 0;
+  }
+  store_TIME(timestamp, l_time->second_part);
   return error;
 }
 
 
-int Field_timestamp::store(double nr)
+int Field_timestamp::store_time_dec(MYSQL_TIME *ltime, uint dec)
 {
-  int error= 0;
-  if (nr < 0 || nr > 99991231235959.0)
-  {
-    set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN,
-                         ER_WARN_DATA_OUT_OF_RANGE,
-                         nr, MYSQL_TIMESTAMP_DATETIME);
-    nr= 0;					// Avoid overflow on buff
-    error= 1;
-  }
-  error|= Field_timestamp::store((longlong) rint(nr), FALSE);
-  return error;
+  THD *thd= table->in_use;
+  int unused;
+  MYSQL_TIME l_time= *ltime;
+  ErrConvTime str(ltime);
+  bool valid= !check_date(&l_time, pack_time(&l_time) != 0,
+                          (thd->variables.sql_mode & MODE_NO_ZERO_DATE) |
+                                       MODE_NO_ZERO_IN_DATE, &unused);
+
+  return store_TIME_with_warning(thd, &l_time, &str, false, valid);
 }
 
 
-int Field_timestamp::store(longlong nr, bool unsigned_val)
+int Field_timestamp::store(const char *from,uint len,CHARSET_INFO *cs)
 {
-  ASSERT_COLUMN_MARKED_FOR_WRITE_OR_COMPUTED;
   MYSQL_TIME l_time;
-  my_time_t timestamp= 0;
   int error;
-  my_bool in_dst_time_gap;
-  THD *thd= table ? table->in_use : current_thd;
+  int have_smth_to_conv;
+  ErrConvString str(from, len, cs);
+  THD *thd= table->in_use;
 
   /* We don't want to store invalid or fuzzy datetime values in TIMESTAMP */
-  longlong tmp= number_to_datetime(nr, &l_time, (thd->variables.sql_mode &
+  have_smth_to_conv= (str_to_datetime(cs, from, len, &l_time,
+                                      (thd->variables.sql_mode &
+                                       MODE_NO_ZERO_DATE) |
+                                       MODE_NO_ZERO_IN_DATE, &error) >
+                      MYSQL_TIMESTAMP_ERROR);
+  return store_TIME_with_warning(thd, &l_time, &str, error, have_smth_to_conv);
+}
+
+
+int Field_timestamp::store(double nr)
+{
+  MYSQL_TIME l_time;
+  int error;
+  ErrConvDouble str(nr);
+  THD *thd= table->in_use;
+
+  longlong tmp= double_to_datetime(nr, &l_time, (thd->variables.sql_mode &
                                                  MODE_NO_ZERO_DATE) |
                                    MODE_NO_ZERO_IN_DATE, &error);
-  if (tmp == LL(-1))
-  {
-    error= 2;
-  }
+  return store_TIME_with_warning(thd, &l_time, &str, error, tmp != -1);
+}
 
-  if (!error && tmp)
-  {
-    if (!(timestamp= TIME_to_timestamp(thd, &l_time, &in_dst_time_gap)))
-    {
-      set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN,
-                           ER_WARN_DATA_OUT_OF_RANGE,
-                           nr, MYSQL_TIMESTAMP_DATETIME, 1);
-      error= 1;
-    }
-    if (in_dst_time_gap)
-    {
-      set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN,
-                           ER_WARN_INVALID_TIMESTAMP,
-                           nr, MYSQL_TIMESTAMP_DATETIME, 1);
-      error= 1;
-    }
-  } else if (error)
-    set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN,
-                         WARN_DATA_TRUNCATED,
-                         nr, MYSQL_TIMESTAMP_DATETIME, 1);
 
-  store_timestamp(timestamp);
-  return error;
+int Field_timestamp::store(longlong nr, bool unsigned_val)
+{
+  MYSQL_TIME l_time;
+  int error;
+  ErrConvInteger str(nr);
+  THD *thd= table->in_use;
+
+  /* We don't want to store invalid or fuzzy datetime values in TIMESTAMP */
+  longlong tmp= number_to_datetime(nr, 0, &l_time, (thd->variables.sql_mode &
+                                                 MODE_NO_ZERO_DATE) |
+                                   MODE_NO_ZERO_IN_DATE, &error);
+  return store_TIME_with_warning(thd, &l_time, &str, error, tmp != LL(-1));
 }
 
+
 double Field_timestamp::val_real(void)
 {
-  ASSERT_COLUMN_MARKED_FOR_READ;
   return (double) Field_timestamp::val_int();
 }
 
+
 longlong Field_timestamp::val_int(void)
 {
-  ASSERT_COLUMN_MARKED_FOR_READ;
-  uint32 temp;
-  MYSQL_TIME time_tmp;
-  THD  *thd= table ? table->in_use : current_thd;
-
-  thd->time_zone_used= 1;
-#ifdef WORDS_BIGENDIAN
-  if (table && table->s->db_low_byte_first)
-    temp=uint4korr(ptr);
-  else
-#endif
-    longget(temp,ptr);
+  MYSQL_TIME ltime;
+  if (get_date(&ltime, TIME_NO_ZERO_DATE))
+    return 0;
 
-  if (temp == 0L)				// No time
-    return(0);					/* purecov: inspected */
-  
-  thd->variables.time_zone->gmt_sec_to_TIME(&time_tmp, (my_time_t)temp);
-  
-  return time_tmp.year * LL(10000000000) + time_tmp.month * LL(100000000) +
-         time_tmp.day * 1000000L + time_tmp.hour * 10000L +
-         time_tmp.minute * 100 + time_tmp.second;
+  return ltime.year * 10000000000LL + ltime.month * 100000000LL +
+         ltime.day * 1000000L + ltime.hour * 10000L +
+         ltime.minute * 100 + ltime.second;
 }
 
 
 String *Field_timestamp::val_str(String *val_buffer, String *val_ptr)
 {
-  ASSERT_COLUMN_MARKED_FOR_READ;
+  MYSQL_TIME ltime;
   uint32 temp, temp2;
-  MYSQL_TIME time_tmp;
-  THD *thd= table ? table->in_use : current_thd;
   char *to;
 
   val_buffer->alloc(field_length+1);
   to= (char*) val_buffer->ptr();
   val_buffer->length(field_length);
 
-  thd->time_zone_used= 1;
-#ifdef WORDS_BIGENDIAN
-  if (table && table->s->db_low_byte_first)
-    temp=uint4korr(ptr);
-  else
-#endif
-    longget(temp,ptr);
-
-  if (temp == 0L)
+  if (get_date(&ltime, TIME_NO_ZERO_DATE))
   {				      /* Zero time is "000000" */
-    val_ptr->set(STRING_WITH_LEN("0000-00-00 00:00:00"), &my_charset_numeric);
+    val_ptr->set(zero_timestamp, field_length, &my_charset_numeric);
     return val_ptr;
   }
   val_buffer->set_charset(&my_charset_numeric);	// Safety
-  
-  thd->variables.time_zone->gmt_sec_to_TIME(&time_tmp,(my_time_t)temp);
-
-  temp= time_tmp.year % 100;
+   
+  temp= ltime.year % 100;
   if (temp < YY_PART_YEAR - 1)
   {
     *to++= '2';
@@ -4924,27 +4607,27 @@ String *Field_timestamp::val_str(String *val_buffer, String *val_ptr)
   *to++= (char) ('0'+(char) (temp2));
   *to++= (char) ('0'+(char) (temp));
   *to++= '-';
-  temp=time_tmp.month;
+  temp=ltime.month;
   temp2=temp/10; temp=temp-temp2*10;
   *to++= (char) ('0'+(char) (temp2));
   *to++= (char) ('0'+(char) (temp));
   *to++= '-';
-  temp=time_tmp.day;
+  temp=ltime.day;
   temp2=temp/10; temp=temp-temp2*10;
   *to++= (char) ('0'+(char) (temp2));
   *to++= (char) ('0'+(char) (temp));
   *to++= ' ';
-  temp=time_tmp.hour;
+  temp=ltime.hour;
   temp2=temp/10; temp=temp-temp2*10;
   *to++= (char) ('0'+(char) (temp2));
   *to++= (char) ('0'+(char) (temp));
   *to++= ':';
-  temp=time_tmp.minute;
+  temp=ltime.minute;
   temp2=temp/10; temp=temp-temp2*10;
   *to++= (char) ('0'+(char) (temp2));
   *to++= (char) ('0'+(char) (temp));
   *to++= ':';
-  temp=time_tmp.second;
+  temp=ltime.second;
   temp2=temp/10; temp=temp-temp2*10;
   *to++= (char) ('0'+(char) (temp2));
   *to++= (char) ('0'+(char) (temp));
@@ -4956,16 +4639,11 @@ String *Field_timestamp::val_str(String *val_buffer, String *val_ptr)
 
 bool Field_timestamp::get_date(MYSQL_TIME *ltime, uint fuzzydate)
 {
-  long temp;
-  THD *thd= table ? table->in_use : current_thd;
+  THD *thd= table->in_use;
   thd->time_zone_used= 1;
-#ifdef WORDS_BIGENDIAN
-  if (table && table->s->db_low_byte_first)
-    temp=uint4korr(ptr);
-  else
-#endif
-    longget(temp,ptr);
-  if (temp == 0L)
+  ulong sec_part;
+  my_time_t temp= get_timestamp(&sec_part);
+  if (temp == 0 && sec_part == 0)
   {				      /* Zero time is "000000" */
     if (fuzzydate & TIME_NO_ZERO_DATE)
       return 1;
@@ -4974,61 +4652,35 @@ bool Field_timestamp::get_date(MYSQL_TIME *ltime, uint fuzzydate)
   else
   {
     thd->variables.time_zone->gmt_sec_to_TIME(ltime, (my_time_t)temp);
+    ltime->second_part= sec_part;
   }
   return 0;
 }
 
-bool Field_timestamp::get_time(MYSQL_TIME *ltime)
-{
-  return Field_timestamp::get_date(ltime,0);
-}
-
 
 bool Field_timestamp::send_binary(Protocol *protocol)
 {
-  MYSQL_TIME tm;
-  Field_timestamp::get_date(&tm, 0);
-  return protocol->store(&tm);
+  MYSQL_TIME ltime;
+  Field_timestamp::get_date(&ltime, 0);
+  return protocol->store(&ltime, 0);
 }
 
 
 int Field_timestamp::cmp(const uchar *a_ptr, const uchar *b_ptr)
 {
   int32 a,b;
-#ifdef WORDS_BIGENDIAN
-  if (table && table->s->db_low_byte_first)
-  {
-    a=sint4korr(a_ptr);
-    b=sint4korr(b_ptr);
-  }
-  else
-#endif
-  {
-  longget(a,a_ptr);
-  longget(b,b_ptr);
-  }
+  a=sint4korr(a_ptr);
+  b=sint4korr(b_ptr);
   return ((uint32) a < (uint32) b) ? -1 : ((uint32) a > (uint32) b) ? 1 : 0;
 }
 
 
 void Field_timestamp::sort_string(uchar *to,uint length __attribute__((unused)))
 {
-#ifdef WORDS_BIGENDIAN
-  if (!table || !table->s->db_low_byte_first)
-  {
-    to[0] = ptr[0];
-    to[1] = ptr[1];
-    to[2] = ptr[2];
-    to[3] = ptr[3];
-  }
-  else
-#endif
-  {
-    to[0] = ptr[3];
-    to[1] = ptr[2];
-    to[2] = ptr[1];
-    to[3] = ptr[0];
-  }
+  to[0] = ptr[3];
+  to[1] = ptr[2];
+  to[2] = ptr[1];
+  to[3] = ptr[0];
 }
 
 
@@ -5038,149 +4690,442 @@ void Field_timestamp::sql_type(String &res) const
 }
 
 
-void Field_timestamp::set_time()
+int Field_timestamp::set_time()
 {
-  THD *thd= table ? table->in_use : current_thd;
-  long tmp= (long) thd->query_start();
+  THD *thd= table->in_use;
   set_notnull();
-  store_timestamp(tmp);
+  store_TIME(thd->query_start(), 0);
+  return 0;
 }
 
-/****************************************************************************
-** time type
-** In string context: HH:MM:SS
-** In number context: HHMMSS
-** Stored as a 3 byte unsigned int
-****************************************************************************/
+void Field_timestamp_hires::sql_type(String &res) const
+{ /* Set res to the type name "timestamp(<dec>)", formatted with res's own charset. */
+  CHARSET_INFO *cs=res.charset();
+  res.length(cs->cset->snprintf(cs, (char*) res.ptr(), res.alloced_length(), /* writes into res's current buffer */
+                                "timestamp(%u)", dec));
+}
 
-int Field_time::store(const char *from,uint len,CHARSET_INFO *cs)
+#ifdef NOT_USED /* the two helpers below are compiled only if NOT_USED is defined */
+static void store_native(ulonglong num, uchar *to, uint bytes)
+{ /* Write num to `to` as a `bytes`-wide integer using the shortstore/longstore macro family. */
+  switch(bytes) {
+  case 1: *to= (uchar)num;              break;
+  case 2: shortstore(to, (ushort)num);  break;
+  case 3: int3store(to, num); /* Sic!*/ break;
+  case 4: longstore(to, (ulong)num);    break;
+  case 8: longlongstore(to, num);       break;
+  default: DBUG_ASSERT(0);              /* no other width is handled */
+  }
+}
+
+static longlong read_native(const uchar *from, uint bytes)
+{ /* Counterpart of store_native(): read a `bytes`-wide integer from `from`. */
+  switch(bytes) {
+  case 1: return from[0];
+  case 2: { uint16 tmp; shortget(tmp, from); return tmp; }
+  case 3: return uint3korr(from);
+  case 4: { uint32 tmp; longget(tmp, from); return tmp; }
+  case 8: { longlong tmp; longlongget(tmp, from); return tmp; }
+  default: DBUG_ASSERT(0); return 0;    /* unhandled width: asserts in debug builds, else returns 0 */
+  }
+}
+#endif
+
+static void store_lowendian(ulonglong num, uchar *to, uint bytes)
+{ /* Write num to `to` as a `bytes`-wide integer using the intNstore macros. */
+  switch(bytes) {
+  case 1: *to= (uchar)num;    break;
+  case 2: int2store(to, num); break;
+  case 3: int3store(to, num); break;
+  case 4: int4store(to, num); break;
+  case 8: int8store(to, num); break;
+  default: DBUG_ASSERT(0);    /* widths 5..7 (and anything else) are not handled here */
+  }
+}
+
+static longlong read_lowendian(const uchar *from, uint bytes)
+{ /* Counterpart of store_lowendian(): read a `bytes`-wide integer from `from`. */
+  switch(bytes) {
+  case 1: return from[0];
+  case 2: return uint2korr(from);
+  case 3: return uint3korr(from);
+  case 4: return uint4korr(from);
+  case 8: return sint8korr(from);      /* only the 8-byte case uses a signed reader */
+  default: DBUG_ASSERT(0); return 0;   /* unhandled width: asserts in debug builds, else returns 0 */
+  }
+}
+
+static void store_bigendian(ulonglong num, uchar *to, uint bytes)
+{ /* Write num to `to` as a `bytes`-wide integer (1..8) using the mi_intNstore macros. */
+  switch(bytes) {
+  case 1: mi_int1store(to, num); break;
+  case 2: mi_int2store(to, num); break;
+  case 3: mi_int3store(to, num); break;
+  case 4: mi_int4store(to, num); break;
+  case 5: mi_int5store(to, num); break;
+  case 6: mi_int6store(to, num); break;
+  case 7: mi_int7store(to, num); break;
+  case 8: mi_int8store(to, num); break;
+  default: DBUG_ASSERT(0);       /* 0 or more than 8 bytes is a caller error */
+  }
+}
+
+static longlong read_bigendian(const uchar *from, uint bytes)
+{ /* Counterpart of store_bigendian(): read a `bytes`-wide integer (1..8) from `from`. */
+  switch(bytes) {
+  case 1: return mi_uint1korr(from);
+  case 2: return mi_uint2korr(from);
+  case 3: return mi_uint3korr(from);
+  case 4: return mi_uint4korr(from);
+  case 5: return mi_uint5korr(from);
+  case 6: return mi_uint6korr(from);
+  case 7: return mi_uint7korr(from);
+  case 8: return mi_sint8korr(from);   /* only the 8-byte case uses a signed reader */
+  default: DBUG_ASSERT(0); return 0;   /* unhandled width: asserts in debug builds, else returns 0 */
+  }
+}
+
+void Field_timestamp_hires::store_TIME(my_time_t timestamp, ulong sec_part)
+{ /* Record layout: 4 bytes of timestamp at ptr, then sec_part_bytes[dec] bytes of fraction at ptr+4. */
+  mi_int4store(ptr, timestamp);
+  store_bigendian(sec_part_shift(sec_part, dec), ptr+4, sec_part_bytes[dec]); /* fraction is rescaled for `dec` first */
+}
+
+my_time_t Field_timestamp_hires::get_timestamp(ulong *sec_part) const
+{ /* Reads back what store_TIME() wrote: returns the 4-byte timestamp, sets *sec_part from the bytes at ptr+4. */
+  ASSERT_COLUMN_MARKED_FOR_READ;
+  *sec_part= (long)sec_part_unshift(read_bigendian(ptr+4, sec_part_bytes[dec]), dec); /* undo sec_part_shift() */
+  return mi_uint4korr(ptr);
+}
+
+double Field_timestamp_hires::val_real(void)
 {
   MYSQL_TIME ltime;
-  long tmp;
-  int error= 0;
-  int warning;
+  if (get_date(&ltime, TIME_NO_ZERO_DATE))
+    return 0;
+  
+  return ltime.year * 1e10 + ltime.month * 1e8 +
+         ltime.day * 1e6 + ltime.hour * 1e4 +
+         ltime.minute * 1e2 + ltime.second + ltime.second_part*1e-6;
+}
+
+String *Field_timestamp_hires::val_str(String *val_buffer, String *val_ptr)
+{
+  String *tmp= Field_timestamp::val_str(val_buffer, val_ptr);
+  ulong sec_part= (ulong)read_bigendian(ptr+4, sec_part_bytes[dec]);
+  
+  if (tmp->ptr() == zero_timestamp)
+    return tmp;
 
-  if (str_to_time(cs, from, len, &ltime, &warning))
+  char *buf= const_cast<char*>(tmp->ptr() + MAX_DATETIME_WIDTH);
+  for (int i=dec; i>0; i--, sec_part/=10)
+    buf[i]= (char)(sec_part % 10) + '0';
+  buf[0]= '.';
+  buf[dec+1]= 0;
+  return tmp;
+}
+
+
+my_decimal *Field_timestamp_hires::val_decimal(my_decimal *d)
+{ /* Return the value as a decimal built from the date/time number plus second_part; result goes through d. */
+  MYSQL_TIME ltime;
+  get_date(&ltime, 0);                        /* return value is ignored; flags 0, so no TIME_NO_ZERO_DATE */
+  longlong intg= TIME_to_ulonglong(&ltime);
+  return seconds2my_decimal(ltime.neg, intg, ltime.second_part, d);
+}
+ 
+int Field_timestamp_hires::store_decimal(const my_decimal *d)
+{
+  ulonglong nr;
+  ulong sec_part;
+  int error;
+  MYSQL_TIME ltime;
+  longlong tmp;
+  THD *thd= table->in_use;
+  ErrConvDecimal str(d);
+
+  if (my_decimal2seconds(d, &nr, &sec_part))
   {
-    tmp=0L;
+    tmp= -1;
     error= 2;
-    set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN, WARN_DATA_TRUNCATED,
-                         from, len, MYSQL_TIMESTAMP_TIME, 1);
   }
   else
-  {
-    if (warning & MYSQL_TIME_WARN_TRUNCATED)
-    {
-      set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN,
-                           WARN_DATA_TRUNCATED,
-                           from, len, MYSQL_TIMESTAMP_TIME, 1);
-      error= 1;
-    }
-    if (warning & MYSQL_TIME_WARN_OUT_OF_RANGE)
-    {
-      set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN, 
-                           ER_WARN_DATA_OUT_OF_RANGE,
-                           from, len, MYSQL_TIMESTAMP_TIME, !error);
-      error= 1;
-    }
-    if (ltime.month)
-      ltime.day=0;
-    tmp=(ltime.day*24L+ltime.hour)*10000L+(ltime.minute*100+ltime.second);
-  }
-  
-  if (ltime.neg)
-    tmp= -tmp;
-  int3store(ptr,tmp);
-  return error;
+    tmp= number_to_datetime(nr, sec_part, &ltime, TIME_NO_ZERO_IN_DATE |
+                                                  (thd->variables.sql_mode &
+                                                   MODE_NO_ZERO_DATE), &error);
+
+  return store_TIME_with_warning(thd, &ltime, &str, error, tmp != -1);
 }
 
+int Field_timestamp_hires::set_time()
+{ /* Mark the field NOT NULL and store thd->query_start() with its fractional part; always returns 0. */
+  THD *thd= table->in_use;
+  set_notnull();
+  store_TIME(thd->query_start(), thd->query_start_sec_part());
+  return 0;
+}
 
-int Field_time::store_time(MYSQL_TIME *ltime, timestamp_type time_type)
+bool Field_timestamp_hires::send_binary(Protocol *protocol)
 {
-  long tmp= ((ltime->month ? 0 : ltime->day * 24L) + ltime->hour) * 10000L +
-            (ltime->minute * 100 + ltime->second);
-  if (ltime->neg)
-    tmp= -tmp;
-  return Field_time::store((longlong) tmp, FALSE);
+  MYSQL_TIME ltime;
+  Field_timestamp::get_date(&ltime, 0);
+  return protocol->store(&ltime, dec);
 }
 
 
-int Field_time::store(double nr)
+int Field_timestamp_hires::cmp(const uchar *a_ptr, const uchar *b_ptr)
+{ /* Three-way compare of two stored values: returns -1, 0 or 1. */
+  int32 a,b;
+  ulong a_sec_part, b_sec_part;
+  a= mi_uint4korr(a_ptr);
+  a_sec_part= (ulong)read_bigendian(a_ptr+4, sec_part_bytes[dec]);  /* stored (still shifted) fraction */
+  b= mi_uint4korr(b_ptr);
+  b_sec_part= (ulong)read_bigendian(b_ptr+4, sec_part_bytes[dec]);
+  return ((uint32) a < (uint32) b) ? -1 : ((uint32) a > (uint32) b) ? 1 : /* seconds first, as unsigned */
+          a_sec_part < b_sec_part  ? -1 :  a_sec_part > b_sec_part  ? 1 : 0; /* then the fraction */
+}
+
+
+void Field_timestamp_hires::sort_string(uchar *to,uint length)
+{ /* The sort key is the stored bytes copied verbatim. */
+  DBUG_ASSERT(length == Field_timestamp_hires::pack_length()); /* caller must pass the full packed width */
+  memcpy(to, ptr, length);
+}
+
+uint32 Field_timestamp_hires::pack_length() const
+{ /* Stored size in bytes: 4 for the timestamp plus the fraction bytes for precision `dec`. */
+  return 4 + sec_part_bytes[dec];
+}
+
+void Field_timestamp_hires::make_field(Send_field *field)
+{ /* Fill *field via Field::make_field(), then report `dec` as its number of decimals. */
+  Field::make_field(field);
+  field->decimals= dec;
+}
+
+/*
+  Store string into a date/time field
+
+  RETURN
+    0  ok
+    1  Value was cut during conversion
+    2  value was out of range
+    3  Datetime value that was cut (warning level NOTE)
+       This is used by opt_range.cc:get_mm_leaf().
+*/
+int Field_temporal::store_TIME_with_warning(MYSQL_TIME *ltime,
+                                            const ErrConv *str,
+                                            int was_cut, int have_smth_to_conv)
 {
+  MYSQL_ERROR::enum_warning_level trunc_level= MYSQL_ERROR::WARN_LEVEL_WARN;
+  int ret= 2;
+  
   ASSERT_COLUMN_MARKED_FOR_WRITE_OR_COMPUTED;
-  long tmp;
-  int error= 0;
-  if (nr > (double)TIME_MAX_VALUE)
+
+  if (was_cut == 0 &&
+      have_smth_to_conv == 0 &&
+      mysql_type_to_time_type(type()) != MYSQL_TIMESTAMP_TIME) // special case: zero date
+    was_cut= MYSQL_TIME_WARN_OUT_OF_RANGE;
+  else
+  if (!have_smth_to_conv)
   {
-    tmp= TIME_MAX_VALUE;
-    set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN,
-                         ER_WARN_DATA_OUT_OF_RANGE, nr, MYSQL_TIMESTAMP_TIME);
-    error= 1;
+    bzero(ltime, sizeof(*ltime));
+    was_cut=  MYSQL_TIME_WARN_TRUNCATED;
+    ret= 1;
   }
-  else if (nr < (double)-TIME_MAX_VALUE)
+  else if (!(was_cut & MYSQL_TIME_WARN_TRUNCATED) &&
+           mysql_type_to_time_type(type()) == MYSQL_TIMESTAMP_DATE &&
+           (ltime->hour || ltime->minute || ltime->second || ltime->second_part))
   {
-    tmp= -TIME_MAX_VALUE;
-    set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN, 
-                         ER_WARN_DATA_OUT_OF_RANGE, nr, MYSQL_TIMESTAMP_TIME);
-    error= 1;
+    trunc_level= MYSQL_ERROR::WARN_LEVEL_NOTE;
+    was_cut|=  MYSQL_TIME_WARN_TRUNCATED;
+    ret= 3;
   }
-  else
+  else if (!(was_cut & MYSQL_TIME_WARN_TRUNCATED) &&
+           mysql_type_to_time_type(type()) == MYSQL_TIMESTAMP_TIME &&
+           (ltime->year || ltime->month))
   {
-    tmp=(long) floor(fabs(nr));			// Remove fractions
-    if (nr < 0)
-      tmp= -tmp;
-    if (tmp % 100 > 59 || tmp/100 % 100 > 59)
-    {
-      tmp=0;
-      set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN, 
-                           ER_WARN_DATA_OUT_OF_RANGE, nr,
-                           MYSQL_TIMESTAMP_TIME);
-      error= 1;
-    }
+    ltime->year= ltime->month= ltime->day= 0;
+    trunc_level= MYSQL_ERROR::WARN_LEVEL_NOTE;
+    was_cut|=  MYSQL_TIME_WARN_TRUNCATED;
+    ret= 3;
   }
-  int3store(ptr,tmp);
-  return error;
+
+  /*
+    error code logic:
+    MYSQL_TIME_WARN_TRUNCATED means that the value was not a date/time at all.
+      it will be stored as zero date/time.
+    MYSQL_TIME_WARN_OUT_OF_RANGE means that the value was a date/time,
+      that is, it was parsed as such, but the value was invalid.
+
+    Also, MYSQL_TIME_WARN_TRUNCATED is used when storing a DATETIME in
+    a DATE field and non-zero time part is thrown away.
+  */
+  if (was_cut & MYSQL_TIME_WARN_TRUNCATED)
+    set_datetime_warning(trunc_level, WARN_DATA_TRUNCATED,
+                         str, mysql_type_to_time_type(type()), 1);
+  if (was_cut & MYSQL_TIME_WARN_OUT_OF_RANGE)
+    set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN, ER_WARN_DATA_OUT_OF_RANGE,
+                         str, mysql_type_to_time_type(type()), 1);
+
+  store_TIME(ltime);
+  return was_cut ? ret : 0;
 }
 
 
-int Field_time::store(longlong nr, bool unsigned_val)
+int Field_temporal::store(const char *from,uint len,CHARSET_INFO *cs)
+{ /* Parse a string as a date/time and store it; returns what store_TIME_with_warning() returns. */
+  MYSQL_TIME ltime;
+  int error;
+  enum enum_mysql_timestamp_type func_res;
+  THD *thd= table->in_use;
+  ErrConvString str(from, len, cs);            /* original text, handed on for warning messages */
+
+  func_res= str_to_datetime(cs, from, len, &ltime,
+                            (TIME_FUZZY_DATE |
+                             (thd->variables.sql_mode &   /* only the date-related sql_mode bits are passed */
+                              (MODE_NO_ZERO_IN_DATE | MODE_NO_ZERO_DATE |
+                               MODE_INVALID_DATES))),
+                            &error);
+  return store_TIME_with_warning(&ltime, &str, error, func_res > MYSQL_TIMESTAMP_ERROR); /* last arg: parse succeeded */
+}
+
+
+int Field_temporal::store(double nr)
 {
-  ASSERT_COLUMN_MARKED_FOR_WRITE_OR_COMPUTED;
-  long tmp;
   int error= 0;
-  if (nr < (longlong) -TIME_MAX_VALUE && !unsigned_val)
-  {
-    tmp= -TIME_MAX_VALUE;
-    set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN, 
-                         ER_WARN_DATA_OUT_OF_RANGE, nr,
-                         MYSQL_TIMESTAMP_TIME, 1);
-    error= 1;
-  }
-  else if (nr > (longlong) TIME_MAX_VALUE || (nr < 0 && unsigned_val))
-  {
-    tmp= TIME_MAX_VALUE;
-    set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN, 
-                         ER_WARN_DATA_OUT_OF_RANGE, nr,
-                         MYSQL_TIMESTAMP_TIME, 1);
-    error= 1;
-  }
-  else
+  MYSQL_TIME ltime;
+  THD *thd= table->in_use;
+  ErrConvDouble str(nr);
+
+  longlong tmp= double_to_datetime(nr, &ltime,
+                                    (TIME_FUZZY_DATE |
+                                       (thd->variables.sql_mode &
+                                        (MODE_NO_ZERO_IN_DATE |
+                                         MODE_NO_ZERO_DATE |
+                                         MODE_INVALID_DATES))), &error);
+  return store_TIME_with_warning(&ltime, &str, error, tmp != -1);
+}
+
+
+int Field_temporal::store(longlong nr, bool unsigned_val)
+{ /* Convert an integer to a date/time and store it; unsigned_val is not consulted in this body. */
+  int error;
+  MYSQL_TIME ltime;
+  longlong tmp;
+  THD *thd= table->in_use;
+  ErrConvInteger str(nr);                      /* original number, handed on for warning messages */
+
+  tmp= number_to_datetime(nr, 0, &ltime, (TIME_FUZZY_DATE |   /* second argument 0: no fractional part */
+                                      (thd->variables.sql_mode &
+                                       (MODE_NO_ZERO_IN_DATE |
+                                        MODE_NO_ZERO_DATE |
+                                        MODE_INVALID_DATES))), &error);
+
+  return store_TIME_with_warning(&ltime, &str, error, tmp != -1); /* -1 is treated as "nothing converted" */
+}
+
+
+int Field_temporal::store_time_dec(MYSQL_TIME *ltime, uint dec)
+{ /* Validate a copy of *ltime with check_date() and store it; `dec` is not used in this body. */
+  int error = 0, have_smth_to_conv= 1;
+  MYSQL_TIME l_time= *ltime;                   /* work on a copy, the caller's struct is left untouched */
+  ErrConvTime str(ltime);
+  /*
+    We don't perform range checking here since values stored in TIME
+    structure always fit into DATETIME range.
+  */
+  have_smth_to_conv= !check_date(&l_time, pack_time(&l_time) != 0,
+                                 (TIME_FUZZY_DATE |
+                                  (current_thd->variables.sql_mode & /* NOTE(review): sibling store() methods read table->in_use instead */
+                                   (MODE_NO_ZERO_IN_DATE | MODE_NO_ZERO_DATE |
+                                    MODE_INVALID_DATES))), &error);
+  return store_TIME_with_warning(&l_time, &str, error, have_smth_to_conv);
+}
+
+my_decimal *Field_temporal::val_decimal(my_decimal *d)
+{
+  MYSQL_TIME ltime;
+  if (get_date(&ltime, TIME_FUZZY_DATE))
   {
-    tmp=(long) nr;
-    if (tmp % 100 > 59 || tmp/100 % 100 > 59)
-    {
-      tmp=0;
-      set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN, 
-                           ER_WARN_DATA_OUT_OF_RANGE, nr,
-                           MYSQL_TIMESTAMP_TIME, 1);
-      error= 1;
-    }
+    bzero(&ltime, sizeof(ltime));
+    ltime.time_type= mysql_type_to_time_type(type());
   }
+  longlong intg= TIME_to_ulonglong(&ltime);
+  return seconds2my_decimal(ltime.neg, intg, ltime.second_part, d);
+}
+
+/****************************************************************************
+** time type
+** In string context: HH:MM:SS
+** In number context: HHMMSS
+** Stored as a 3 byte signed int
+****************************************************************************/
+
+void Field_time::store_TIME(MYSQL_TIME *ltime)
+{
+  long tmp= (ltime->day*24L+ltime->hour)*10000L +
+            (ltime->minute*100+ltime->second);
+  if (ltime->neg)
+    tmp= -tmp;
   int3store(ptr,tmp);
-  return error;
+}
+
+int Field_time::store(const char *from,uint len,CHARSET_INFO *cs)
+{ /* Parse a string with str_to_time() and store it; returns what store_TIME_with_warning() returns. */
+  MYSQL_TIME ltime;
+  ErrConvString str(from, len, cs);            /* original text, handed on for warning messages */
+  int was_cut;
+  int have_smth_to_conv=                       /* true when parsing yielded something above MYSQL_TIMESTAMP_ERROR */
+    str_to_time(cs, from, len, &ltime,
+                table->in_use->variables.sql_mode &
+                (MODE_NO_ZERO_DATE | MODE_NO_ZERO_IN_DATE |
+                 MODE_INVALID_DATES),
+                &was_cut) > MYSQL_TIMESTAMP_ERROR;
+
+  return store_TIME_with_warning(&ltime, &str, was_cut, have_smth_to_conv);
+}
+
+
+int Field_time::store_time_dec(MYSQL_TIME *ltime, uint dec)
+{ /* Range-check a copy of *ltime and store it; the `dec` argument is not used, decimals() is. */
+  MYSQL_TIME l_time= *ltime;                   /* work on a copy, the caller's struct is left untouched */
+  ErrConvTime str(ltime);
+  int was_cut= 0;
+
+  int have_smth_to_conv= !check_time_range(&l_time, decimals(), &was_cut);
+  return store_TIME_with_warning(&l_time, &str, was_cut, have_smth_to_conv);
+}
+
+
+int Field_time::store(double nr)
+{ /* Split a double into sign, integer part and fraction, convert with number_to_time() and store it. */
+  MYSQL_TIME ltime;
+  ErrConvDouble str(nr);                       /* captured before the sign is stripped below */
+  int was_cut;
+  bool neg= nr < 0;
+  if (neg)
+    nr= -nr;
+  int have_smth_to_conv= !number_to_time(neg, (longlong)nr,  /* NOTE(review): cast is not range-checked for huge or NaN nr */
+                                         (ulong)((nr - floor(nr)) * TIME_SECOND_PART_FACTOR),
+                                         &ltime, &was_cut);
+
+  return store_TIME_with_warning(&ltime, &str, was_cut, have_smth_to_conv);
 }
 
 
+int Field_time::store(longlong nr, bool unsigned_val)
+{ /* Convert an integer to a TIME and store it; unsigned_val says nr carries an unsigned value. */
+  MYSQL_TIME ltime;
+  ErrConvInteger str(nr);
+  int was_cut;
+  bool neg= !unsigned_val && nr < 0;           /* an unsigned value is never negative, whatever its top bit */
+  ulonglong abs_nr= neg ? (ulonglong) 0 - (ulonglong) nr : (ulonglong) nr; /* unsigned math: no overflow on the minimum longlong */
+  int have_smth_to_conv= !number_to_time(neg, abs_nr, 0, &ltime, &was_cut);
+  return store_TIME_with_warning(&ltime, &str, was_cut, have_smth_to_conv);
+}
+  
+  
 double Field_time::val_real(void)
 {
   ASSERT_COLUMN_MARKED_FOR_READ;
@@ -5206,7 +5151,6 @@ String *Field_time::val_str(String *val_buffer,
 {
   ASSERT_COLUMN_MARKED_FOR_READ;
   MYSQL_TIME ltime;
-  val_buffer->alloc(MAX_DATE_STRING_REP_LENGTH);
   long tmp=(long) sint3korr(ptr);
   ltime.neg= 0;
   if (tmp < 0)
@@ -5214,12 +5158,19 @@ String *Field_time::val_str(String *val_buffer,
     tmp= -tmp;
     ltime.neg= 1;
   }
+  ltime.year= ltime.month= 0;
   ltime.day= (uint) 0;
   ltime.hour= (uint) (tmp/10000);
   ltime.minute= (uint) (tmp/100 % 100);
   ltime.second= (uint) (tmp % 100);
-  make_time((DATE_TIME_FORMAT*) 0, &ltime, val_buffer);
+  ltime.second_part= 0;
+
+  val_buffer->alloc(MAX_DATE_STRING_REP_LENGTH);
+  uint length= (uint) my_time_to_str(&ltime,
+                                     const_cast<char*>(val_buffer->ptr()), 0);
+  val_buffer->length(length);
   val_buffer->set_charset(&my_charset_numeric);
+
   return val_buffer;
 }
 
@@ -5233,8 +5184,8 @@ String *Field_time::val_str(String *val_buffer,
  
 bool Field_time::get_date(MYSQL_TIME *ltime, uint fuzzydate)
 {
-  THD *thd= table ? table->in_use : current_thd;
-  if (!(fuzzydate & TIME_FUZZY_DATE))
+  THD *thd= table->in_use;
+  if (!(fuzzydate & (TIME_FUZZY_DATE|TIME_TIME_ONLY)))
   {
     push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
                         ER_WARN_DATA_OUT_OF_RANGE,
@@ -5242,12 +5193,6 @@ bool Field_time::get_date(MYSQL_TIME *ltime, uint fuzzydate)
                         thd->warning_info->current_row_for_warning());
     return 1;
   }
-  return Field_time::get_time(ltime);
-}
-
-
-bool Field_time::get_time(MYSQL_TIME *ltime)
-{
   long tmp=(long) sint3korr(ptr);
   ltime->neg=0;
   if (tmp < 0)
@@ -5268,11 +5213,9 @@ bool Field_time::get_time(MYSQL_TIME *ltime)
 
 bool Field_time::send_binary(Protocol *protocol)
 {
-  MYSQL_TIME tm;
-  Field_time::get_time(&tm);
-  tm.day= tm.hour/24;				// Move hours to days
-  tm.hour-= tm.day*24;
-  return protocol->store_time(&tm);
+  MYSQL_TIME ltime;
+  Field_time::get_date(&ltime, TIME_TIME_ONLY);
+  return protocol->store_time(&ltime, 0);
 }
 
 
@@ -5296,6 +5239,121 @@ void Field_time::sql_type(String &res) const
   res.set_ascii(STRING_WITH_LEN("time"));
 }
 
+int Field_time_hires::reset()
+{ /* Store the bare zero_point, i.e. the offset store_TIME() adds to every packed value; always returns 0. */
+  store_bigendian(zero_point, ptr, Field_time_hires::pack_length());
+  return 0;
+}
+
+
+void Field_time_hires::store_TIME(MYSQL_TIME *ltime)
+{ /* Encode *ltime as pack_time() rescaled for `dec`, offset by zero_point, in pack_length() bytes. */
+  ulonglong packed= sec_part_shift(pack_time(ltime), dec) + zero_point;
+  store_bigendian(packed, ptr, Field_time_hires::pack_length());
+}
+
+int Field_time_hires::store_decimal(const my_decimal *d)
+{ /* Split a decimal into seconds and fraction, convert with number_to_time() and store it. */
+  ulonglong nr;
+  ulong sec_part;
+  ErrConvDecimal str(d);                       /* original value, handed on for warning messages */
+  MYSQL_TIME ltime;
+  int was_cut;
+  bool neg= my_decimal2seconds(d, &nr, &sec_part);  /* return value is used as the sign flag */
+
+  int have_smth_to_conv= !number_to_time(neg, nr, sec_part, &ltime, &was_cut);
+
+  return store_TIME_with_warning(&ltime, &str, was_cut, have_smth_to_conv);
+}
+
+uint32 Field_time_hires::pack_length() const
+{ /* Stored size in bytes, looked up per precision `dec`. */
+  return time_hires_bytes[dec];
+}
+
+longlong Field_time_hires::val_int(void)
+{ /* Return the value as a signed integer time number. */
+  ASSERT_COLUMN_MARKED_FOR_READ;
+  MYSQL_TIME ltime;
+  Field_time_hires::get_date(&ltime, TIME_TIME_ONLY);
+  longlong val= TIME_to_ulonglong_time(&ltime);
+  return ltime.neg ? -val : val;               /* the sign is applied here, from ltime.neg */
+}
+
+double Field_time_hires::val_real(void)
+{ /* Return the value as a double, as computed by TIME_to_double(). */
+  ASSERT_COLUMN_MARKED_FOR_READ;
+  MYSQL_TIME ltime;
+  Field_time_hires::get_date(&ltime, TIME_TIME_ONLY);
+  return TIME_to_double(&ltime);
+}
+
+String *Field_time_hires::val_str(String *str,
+                                  String *unused __attribute__((unused)))
+{ /* Render the value as text with `dec` fractional digits into str and return str. */
+  ASSERT_COLUMN_MARKED_FOR_READ;
+  MYSQL_TIME ltime;
+  Field_time_hires::get_date(&ltime, TIME_TIME_ONLY);
+  str->alloc(field_length+1);
+  str->length(my_time_to_str(&ltime, (char*) str->ptr(), dec));
+  str->set_charset(&my_charset_numeric);       /* same result charset as the other temporal val_str() bodies */
+  return str;
+}
+
+bool Field_time_hires::get_date(MYSQL_TIME *ltime, uint fuzzydate)
+{ /* Decode the stored value into *ltime as a TIME; *ltime is filled whatever the return value is. */
+  uint32 len= pack_length();
+  longlong packed= read_bigendian(ptr, len);
+
+  packed= sec_part_unshift(packed - zero_point, dec);  /* undo the offset and scaling applied by store_TIME() */
+
+  unpack_time(packed, ltime);
+  /*
+    unpack_time() returns MYSQL_TIMESTAMP_DATETIME.
+    To get MYSQL_TIMESTAMP_TIME we need a few adjustments
+  */
+  ltime->time_type= MYSQL_TIMESTAMP_TIME;
+  ltime->hour+= (ltime->month*32+ltime->day)*24;       /* fold the unpacked month/day fields into hours */
+  ltime->month= ltime->day= 0;
+  return fuzzydate & (TIME_FUZZY_DATE | TIME_TIME_ONLY) ? 0 : 1; /* 1 unless one of these flags was passed */
+}
+
+
+bool Field_time_hires::send_binary(Protocol *protocol)
+{ /* Decode the value and hand it to protocol->store_time() together with `dec`. */
+  MYSQL_TIME ltime;
+  Field_time_hires::get_date(&ltime, TIME_TIME_ONLY);
+  return protocol->store_time(&ltime, dec);
+}
+
+
+int Field_time_hires::cmp(const uchar *a_ptr, const uchar *b_ptr)
+{ /* Three-way compare (-1, 0, 1) of the raw stored values, taken as unsigned integers. */
+  ulonglong a=read_bigendian(a_ptr, Field_time_hires::pack_length());
+  ulonglong b=read_bigendian(b_ptr, Field_time_hires::pack_length());
+  return (a < b) ? -1 : (a > b) ? 1 : 0;
+}
+
+void Field_time_hires::sort_string(uchar *to,uint length)
+{ /* Sort key: the stored bytes, with the top bit of the first byte inverted. */
+  DBUG_ASSERT(length == Field_time_hires::pack_length()); /* caller must pass the full packed width */
+  memcpy(to, ptr, length);                     /* length is used in every build, so it is not "unused" */
+  to[0]^= 128;
+}
+
+void Field_time_hires::sql_type(String &res) const
+{ /* Set res to the type name "time(<dec>)", formatted with res's own charset. */
+  CHARSET_INFO *cs=res.charset();
+  res.length(cs->cset->snprintf(cs, (char*) res.ptr(), res.alloced_length(), /* writes into res's current buffer */
+                                "time(%u)", dec));
+}
+
+void Field_time_hires::make_field(Send_field *field)
+{ /* Fill *field via Field::make_field(), then report `dec` as its number of decimals. */
+  Field::make_field(field);
+  field->decimals= dec;
+}
+
 /****************************************************************************
 ** year type
 ** Save in a byte the year 0, 1901->2155
@@ -5371,6 +5429,17 @@ int Field_year::store(longlong nr, bool unsigned_val)
 }
 
 
+int Field_year::store_time_dec(MYSQL_TIME *ltime, uint dec)
+{ /* Store only ltime->year; `dec` is not used. Returns 1 if Field_year::store() reports a problem, else 0. */
+  ErrConvTime str(ltime);
+  if (Field_year::store(ltime->year, 0))
+    return 1;
+  /* The store succeeded, but a truncation warning is still raised: only the year was kept. */
+  set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN, WARN_DATA_TRUNCATED,
+                       &str, ltime->time_type, 1);
+  return 0;
+}
+
 bool Field_year::send_binary(Protocol *protocol)
 {
   ASSERT_COLUMN_MARKED_FOR_READ;
@@ -5411,6 +5480,15 @@ String *Field_year::val_str(String *val_buffer,
 }
 
 
+bool Field_year::get_date(MYSQL_TIME *ltime,uint fuzzydate)
+{ /* Convert the year, as the number YYYY0000, with int_to_datetime_with_warn() and return its result. */
+  int tmp= (int) ptr[0];
+  if (tmp || field_length != 4)   /* stored byte is an offset from 1900; 0 stays 0 only when field_length is 4 */
+    tmp+= 1900;
+  return int_to_datetime_with_warn(tmp * 10000, ltime, fuzzydate, field_name);
+}
+
+
 void Field_year::sql_type(String &res) const
 {
   CHARSET_INFO *cs=res.charset();
@@ -5426,102 +5504,12 @@ void Field_year::sql_type(String &res) const
 ** Stored as a 4 byte unsigned int
 ****************************************************************************/
 
-int Field_date::store(const char *from, uint len,CHARSET_INFO *cs)
+void Field_date::store_TIME(MYSQL_TIME *ltime)
 {
-  ASSERT_COLUMN_MARKED_FOR_WRITE_OR_COMPUTED;
-  MYSQL_TIME l_time;
-  uint32 tmp;
-  int error;
-  THD *thd= table ? table->in_use : current_thd;
-
-  if (str_to_datetime(cs, from, len, &l_time, TIME_FUZZY_DATE |
-                      (thd->variables.sql_mode &
-                       (MODE_NO_ZERO_IN_DATE | MODE_NO_ZERO_DATE |
-                        MODE_INVALID_DATES)),
-                      &error) <= MYSQL_TIMESTAMP_ERROR)
-  {
-    tmp= 0;
-    error= 2;
-  }
-  else
-    tmp=(uint32) l_time.year*10000L + (uint32) (l_time.month*100+l_time.day);
-
-  if (error)
-    set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN, WARN_DATA_TRUNCATED,
-                         from, len, MYSQL_TIMESTAMP_DATE, 1);
-
-#ifdef WORDS_BIGENDIAN
-  if (table && table->s->db_low_byte_first)
-  {
-    int4store(ptr,tmp);
-  }
-  else
-#endif
-    longstore(ptr,tmp);
-  return error;
+  uint tmp= ltime->year*10000L + ltime->month*100+ltime->day;
+  int4store(ptr,tmp);
 }
 
-
-int Field_date::store(double nr)
-{
-  longlong tmp;
-  if (nr >= 19000000000000.0 && nr <= 99991231235959.0)
-    nr=floor(nr/1000000.0);			// Timestamp to date
-  if (nr < 0.0 || nr > 99991231.0)
-  {
-    tmp= LL(0);
-    set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN,
-                         ER_WARN_DATA_OUT_OF_RANGE,
-                         nr, MYSQL_TIMESTAMP_DATE);
-  }
-  else
-    tmp= (longlong) rint(nr);
-
-  return Field_date::store(tmp, TRUE);
-}
-
-
-int Field_date::store(longlong nr, bool unsigned_val)
-{
-  ASSERT_COLUMN_MARKED_FOR_WRITE_OR_COMPUTED;
-  MYSQL_TIME not_used;
-  int error;
-  longlong initial_nr= nr;
-  THD *thd= table ? table->in_use : current_thd;
-
-  nr= number_to_datetime(nr, &not_used, (TIME_FUZZY_DATE |
-                                         (thd->variables.sql_mode &
-                                          (MODE_NO_ZERO_IN_DATE |
-                                           MODE_NO_ZERO_DATE |
-                                           MODE_INVALID_DATES))), &error);
-
-  if (nr == LL(-1))
-  {
-    nr= 0;
-    error= 2;
-  }
-
-  if (nr >= 19000000000000.0 && nr <= 99991231235959.0)
-    nr= (longlong) floor(nr/1000000.0);         // Timestamp to date
-
-  if (error)
-    set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN,
-                         error == 2 ? ER_WARN_DATA_OUT_OF_RANGE :
-                         WARN_DATA_TRUNCATED, initial_nr,
-                         MYSQL_TIMESTAMP_DATETIME, 1);
-
-#ifdef WORDS_BIGENDIAN
-  if (table && table->s->db_low_byte_first)
-  {
-    int4store(ptr, nr);
-  }
-  else
-#endif
-    longstore(ptr, nr);
-  return error;
-}
-
-
 bool Field_date::send_binary(Protocol *protocol)
 {
   longlong tmp= Field_date::val_int();
@@ -5537,12 +5525,7 @@ double Field_date::val_real(void)
 {
   ASSERT_COLUMN_MARKED_FOR_READ;
   int32 j;
-#ifdef WORDS_BIGENDIAN
-  if (table && table->s->db_low_byte_first)
-    j=sint4korr(ptr);
-  else
-#endif
-    longget(j,ptr);
+  j=sint4korr(ptr);
   return (double) (uint32) j;
 }
 
@@ -5551,12 +5534,7 @@ longlong Field_date::val_int(void)
 {
   ASSERT_COLUMN_MARKED_FOR_READ;
   int32 j;
-#ifdef WORDS_BIGENDIAN
-  if (table && table->s->db_low_byte_first)
-    j=sint4korr(ptr);
-  else
-#endif
-    longget(j,ptr);
+  j=sint4korr(ptr);
   return (longlong) (uint32) j;
 }
 
@@ -5566,68 +5544,38 @@ String *Field_date::val_str(String *val_buffer,
 {
   ASSERT_COLUMN_MARKED_FOR_READ;
   MYSQL_TIME ltime;
-  val_buffer->alloc(field_length);
   int32 tmp;
-#ifdef WORDS_BIGENDIAN
-  if (table && table->s->db_low_byte_first)
-    tmp=sint4korr(ptr);
-  else
-#endif
-    longget(tmp,ptr);
+  tmp=sint4korr(ptr);
   ltime.neg= 0;
   ltime.year= (int) ((uint32) tmp/10000L % 10000);
   ltime.month= (int) ((uint32) tmp/100 % 100);
   ltime.day= (int) ((uint32) tmp % 100);
-  make_date((DATE_TIME_FORMAT *) 0, &ltime, val_buffer);
-  val_buffer->set_charset(&my_charset_numeric);
-  return val_buffer;
-}
 
+  val_buffer->alloc(MAX_DATE_STRING_REP_LENGTH);
+  uint length= (uint) my_date_to_str(&ltime,
+                                     const_cast<char*>(val_buffer->ptr()));
+  val_buffer->length(length);
+  val_buffer->set_charset(&my_charset_numeric);
 
-bool Field_date::get_time(MYSQL_TIME *ltime)
-{
-  bzero((char *)ltime, sizeof(MYSQL_TIME));
-  return 0;
+  return val_buffer;
 }
 
 
 int Field_date::cmp(const uchar *a_ptr, const uchar *b_ptr)
 {
   int32 a,b;
-#ifdef WORDS_BIGENDIAN
-  if (table && table->s->db_low_byte_first)
-  {
-    a=sint4korr(a_ptr);
-    b=sint4korr(b_ptr);
-  }
-  else
-#endif
-  {
-    longget(a,a_ptr);
-    longget(b,b_ptr);
-  }
+  a=sint4korr(a_ptr);
+  b=sint4korr(b_ptr);
   return ((uint32) a < (uint32) b) ? -1 : ((uint32) a > (uint32) b) ? 1 : 0;
 }
 
 
 void Field_date::sort_string(uchar *to,uint length __attribute__((unused)))
 {
-#ifdef WORDS_BIGENDIAN
-  if (!table || !table->s->db_low_byte_first)
-  {
-    to[0] = ptr[0];
-    to[1] = ptr[1];
-    to[2] = ptr[2];
-    to[3] = ptr[3];
-  }
-  else
-#endif
-  {
-    to[0] = ptr[3];
-    to[1] = ptr[2];
-    to[2] = ptr[1];
-    to[3] = ptr[0];
-  }
+  to[0] = ptr[3];
+  to[1] = ptr[2];
+  to[2] = ptr[1];
+  to[3] = ptr[0];
 }
 
 void Field_date::sql_type(String &res) const
@@ -5642,153 +5590,10 @@ void Field_date::sql_type(String &res) const
 ** In number context: YYYYMMDD
 ****************************************************************************/
 
-/*
-  Store string into a date field
-
-  SYNOPSIS
-    Field_newdate::store()
-    from                Date string
-    len                 Length of date field
-    cs                  Character set (not used)
-
-  RETURN
-    0  ok
-    1  Value was cut during conversion
-    2  Wrong date string
-    3  Datetime value that was cut (warning level NOTE)
-       This is used by opt_range.cc:get_mm_leaf(). Note that there is a
-       nearly-identical class Field_date doesn't ever return 3 from its
-       store function.
-*/
-
-int Field_newdate::store(const char *from,uint len,CHARSET_INFO *cs)
-{
-  ASSERT_COLUMN_MARKED_FOR_WRITE_OR_COMPUTED;
-  long tmp;
-  MYSQL_TIME l_time;
-  int error;
-  THD *thd= table ? table->in_use : current_thd;
-  enum enum_mysql_timestamp_type ret;
-  if ((ret= str_to_datetime(cs, from, len, &l_time,
-                            (TIME_FUZZY_DATE |
-                             (thd->variables.sql_mode &
-                              (MODE_NO_ZERO_IN_DATE | MODE_NO_ZERO_DATE |
-                               MODE_INVALID_DATES))),
-                            &error)) <= MYSQL_TIMESTAMP_ERROR)
-  {
-    tmp= 0;
-    error= 2;
-  }
-  else
-  {
-    tmp= l_time.day + l_time.month*32 + l_time.year*16*32;
-    if (!error && (ret != MYSQL_TIMESTAMP_DATE) &&
-        (l_time.hour || l_time.minute || l_time.second || l_time.second_part))
-      error= 3;                                 // Datetime was cut (note)
-  }
-
-  if (error)
-    set_datetime_warning(error == 3 ? MYSQL_ERROR::WARN_LEVEL_NOTE :
-                         MYSQL_ERROR::WARN_LEVEL_WARN,
-                         WARN_DATA_TRUNCATED,
-                         from, len, MYSQL_TIMESTAMP_DATE, 1);
-
-  int3store(ptr, tmp);
-  return error;
-}
-
-
-int Field_newdate::store(double nr)
-{
-  if (nr < 0.0 || nr > 99991231235959.0)
-  {
-    int3store(ptr,(int32) 0);
-    set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN,
-                         WARN_DATA_TRUNCATED, nr, MYSQL_TIMESTAMP_DATE);
-    return 1;
-  }
-  return Field_newdate::store((longlong) rint(nr), FALSE);
-}
-
-
-int Field_newdate::store(longlong nr, bool unsigned_val)
+void Field_newdate::store_TIME(MYSQL_TIME *ltime)
 {
-  ASSERT_COLUMN_MARKED_FOR_WRITE_OR_COMPUTED;
-  MYSQL_TIME l_time;
-  longlong tmp;
-  int error;
-  THD *thd= table ? table->in_use : current_thd;
-  if (number_to_datetime(nr, &l_time,
-                         (TIME_FUZZY_DATE |
-                          (thd->variables.sql_mode &
-                           (MODE_NO_ZERO_IN_DATE | MODE_NO_ZERO_DATE |
-                            MODE_INVALID_DATES))),
-                         &error) == LL(-1))
-  {
-    tmp= 0L;
-    error= 2;
-  }
-  else
-    tmp= l_time.day + l_time.month*32 + l_time.year*16*32;
-
-  if (!error && l_time.time_type != MYSQL_TIMESTAMP_DATE &&
-      (l_time.hour || l_time.minute || l_time.second || l_time.second_part))
-    error= 3;
-
-  if (error)
-    set_datetime_warning(error == 3 ? MYSQL_ERROR::WARN_LEVEL_NOTE :
-                         MYSQL_ERROR::WARN_LEVEL_WARN,
-                         error == 2 ? 
-                         ER_WARN_DATA_OUT_OF_RANGE : WARN_DATA_TRUNCATED,
-                         nr,MYSQL_TIMESTAMP_DATE, 1);
-
+  uint tmp= ltime->year*16*32 + ltime->month*32+ltime->day;
   int3store(ptr,tmp);
-  return error;
-}
-
-
-int Field_newdate::store_time(MYSQL_TIME *ltime,timestamp_type time_type)
-{
-  ASSERT_COLUMN_MARKED_FOR_WRITE_OR_COMPUTED;
-  long tmp;
-  int error= 0;
-  if (time_type == MYSQL_TIMESTAMP_DATE ||
-      time_type == MYSQL_TIMESTAMP_DATETIME)
-  {
-    tmp=ltime->year*16*32+ltime->month*32+ltime->day;
-    if (check_date(ltime, tmp != 0,
-                   (TIME_FUZZY_DATE |
-                    (current_thd->variables.sql_mode &
-                     (MODE_NO_ZERO_IN_DATE | MODE_NO_ZERO_DATE |
-                      MODE_INVALID_DATES))), &error))
-    {
-      char buff[MAX_DATE_STRING_REP_LENGTH];
-      String str(buff, sizeof(buff), &my_charset_latin1);
-      tmp= 0;
-      make_date((DATE_TIME_FORMAT *) 0, ltime, &str);
-      set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN, WARN_DATA_TRUNCATED,
-                           str.ptr(), str.length(), MYSQL_TIMESTAMP_DATE, 1);
-    }
-    if (!error && ltime->time_type != MYSQL_TIMESTAMP_DATE &&
-        (ltime->hour || ltime->minute || ltime->second || ltime->second_part))
-    {
-      char buff[MAX_DATE_STRING_REP_LENGTH];
-      String str(buff, sizeof(buff), &my_charset_latin1);
-      make_datetime((DATE_TIME_FORMAT *) 0, ltime, &str);
-      set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_NOTE,
-                           WARN_DATA_TRUNCATED,
-                           str.ptr(), str.length(), MYSQL_TIMESTAMP_DATE, 1);
-      error= 3;
-    }
-  }
-  else
-  {
-    tmp=0;
-    error= 1;
-    set_warning(MYSQL_ERROR::WARN_LEVEL_WARN, WARN_DATA_TRUNCATED, 1);
-  }
-  int3store(ptr,tmp);
-  return error;
 }
 
 
@@ -5854,14 +5659,11 @@ bool Field_newdate::get_date(MYSQL_TIME *ltime,uint fuzzydate)
   ltime->year=  (tmp >> 9);
   ltime->time_type= MYSQL_TIMESTAMP_DATE;
   ltime->hour= ltime->minute= ltime->second= ltime->second_part= ltime->neg= 0;
-  return ((!(fuzzydate & TIME_FUZZY_DATE) && (!ltime->month || !ltime->day)) ?
-          1 : 0);
-}
-
-
-bool Field_newdate::get_time(MYSQL_TIME *ltime)
-{
-  return Field_newdate::get_date(ltime,0);
+  if (!tmp)
+    return fuzzydate & TIME_NO_ZERO_DATE;
+  if (!ltime->month || !ltime->day)
+    return !(fuzzydate & TIME_FUZZY_DATE);
+  return 0;
 }
 
 
@@ -5895,150 +5697,20 @@ void Field_newdate::sql_type(String &res) const
 ** Stored as a 8 byte unsigned int. Should sometimes be change to a 6 byte int.
 ****************************************************************************/
 
-int Field_datetime::store(const char *from,uint len,CHARSET_INFO *cs)
-{
-  ASSERT_COLUMN_MARKED_FOR_WRITE_OR_COMPUTED;
-  MYSQL_TIME time_tmp;
-  int error;
-  ulonglong tmp= 0;
-  enum enum_mysql_timestamp_type func_res;
-  THD *thd= table ? table->in_use : current_thd;
-
-  func_res= str_to_datetime(cs, from, len, &time_tmp,
-                            (TIME_FUZZY_DATE |
-                             (thd->variables.sql_mode &
-                              (MODE_NO_ZERO_IN_DATE | MODE_NO_ZERO_DATE |
-                               MODE_INVALID_DATES))),
-                            &error);
-  if ((int) func_res > (int) MYSQL_TIMESTAMP_ERROR)
-    tmp= TIME_to_ulonglong_datetime(&time_tmp);
-  else
-    error= 1;                                 // Fix if invalid zero date
-
-  if (error)
-    set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN,
-                         ER_WARN_DATA_OUT_OF_RANGE,
-                         from, len, MYSQL_TIMESTAMP_DATETIME, 1);
-
-#ifdef WORDS_BIGENDIAN
-  if (table && table->s->db_low_byte_first)
-  {
-    int8store(ptr,tmp);
-  }
-  else
-#endif
-    longlongstore(ptr,tmp);
-  return error;
-}
-
-
-int Field_datetime::store(double nr)
-{
-  int error= 0;
-  if (nr < 0.0 || nr > 99991231235959.0)
-  {
-    set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN, 
-                         ER_WARN_DATA_OUT_OF_RANGE,
-                         nr, MYSQL_TIMESTAMP_DATETIME);
-    nr= 0.0;
-    error= 1;
-  }
-  error|= Field_datetime::store((longlong) rint(nr), FALSE);
-  return error;
-}
-
-
-int Field_datetime::store(longlong nr, bool unsigned_val)
-{
-  ASSERT_COLUMN_MARKED_FOR_WRITE_OR_COMPUTED;
-  MYSQL_TIME not_used;
-  int error;
-  longlong initial_nr= nr;
-  THD *thd= table ? table->in_use : current_thd;
-
-  nr= number_to_datetime(nr, &not_used, (TIME_FUZZY_DATE |
-                                         (thd->variables.sql_mode &
-                                          (MODE_NO_ZERO_IN_DATE |
-                                           MODE_NO_ZERO_DATE |
-                                           MODE_INVALID_DATES))), &error);
-
-  if (nr == LL(-1))
-  {
-    nr= 0;
-    error= 2;
-  }
-
-  if (error)
-    set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN,
-                         error == 2 ? ER_WARN_DATA_OUT_OF_RANGE :
-                         WARN_DATA_TRUNCATED, initial_nr,
-                         MYSQL_TIMESTAMP_DATETIME, 1);
-
-#ifdef WORDS_BIGENDIAN
-  if (table && table->s->db_low_byte_first)
-  {
-    int8store(ptr,nr);
-  }
-  else
-#endif
-    longlongstore(ptr,nr);
-  return error;
-}
-
-
-int Field_datetime::store_time(MYSQL_TIME *ltime,timestamp_type time_type)
+void Field_datetime::store_TIME(MYSQL_TIME *ltime)
 {
-  ASSERT_COLUMN_MARKED_FOR_WRITE_OR_COMPUTED;
-  longlong tmp;
-  int error= 0;
-  /*
-    We don't perform range checking here since values stored in TIME
-    structure always fit into DATETIME range.
-  */
-  if (time_type == MYSQL_TIMESTAMP_DATE ||
-      time_type == MYSQL_TIMESTAMP_DATETIME)
-  {
-    tmp=((ltime->year*10000L+ltime->month*100+ltime->day)*LL(1000000)+
-	 (ltime->hour*10000L+ltime->minute*100+ltime->second));
-    if (check_date(ltime, tmp != 0,
-                   (TIME_FUZZY_DATE |
-                    (current_thd->variables.sql_mode &
-                     (MODE_NO_ZERO_IN_DATE | MODE_NO_ZERO_DATE |
-                      MODE_INVALID_DATES))), &error))
-    {
-      char buff[MAX_DATE_STRING_REP_LENGTH];
-      String str(buff, sizeof(buff), &my_charset_latin1);
-      tmp= 0;
-      make_datetime((DATE_TIME_FORMAT *) 0, ltime, &str);
-      set_datetime_warning(MYSQL_ERROR::WARN_LEVEL_WARN, WARN_DATA_TRUNCATED,
-                           str.ptr(), str.length(), MYSQL_TIMESTAMP_DATETIME,1);
-    }
-  }
-  else
-  {
-    tmp=0;
-    error= 1;
-    set_warning(MYSQL_ERROR::WARN_LEVEL_WARN, WARN_DATA_TRUNCATED, 1);
-  }
-#ifdef WORDS_BIGENDIAN
-  if (table && table->s->db_low_byte_first)
-  {
-    int8store(ptr,tmp);
-  }
-  else
-#endif
-    longlongstore(ptr,tmp);
-  return error;
+  ulonglong tmp= TIME_to_ulonglong_datetime(ltime);
+  int8store(ptr,tmp);
 }
 
 bool Field_datetime::send_binary(Protocol *protocol)
 {
   MYSQL_TIME tm;
   Field_datetime::get_date(&tm, TIME_FUZZY_DATE);
-  return protocol->store(&tm);
+  return protocol->store(&tm, 0);
 }
-
-
+  
+  
 double Field_datetime::val_real(void)
 {
   return (double) Field_datetime::val_int();
@@ -6048,12 +5720,7 @@ longlong Field_datetime::val_int(void)
 {
   ASSERT_COLUMN_MARKED_FOR_READ;
   longlong j;
-#ifdef WORDS_BIGENDIAN
-  if (table && table->s->db_low_byte_first)
-    j=sint8korr(ptr);
-  else
-#endif
-    longlongget(j,ptr);
+  j=sint8korr(ptr);
   return j;
 }
 
@@ -6061,20 +5728,16 @@ longlong Field_datetime::val_int(void)
 String *Field_datetime::val_str(String *val_buffer,
 				String *val_ptr __attribute__((unused)))
 {
-  ASSERT_COLUMN_MARKED_FOR_READ;
   val_buffer->alloc(field_length);
   val_buffer->length(field_length);
+
+  ASSERT_COLUMN_MARKED_FOR_READ;
   ulonglong tmp;
   long part1,part2;
   char *pos;
   int part3;
 
-#ifdef WORDS_BIGENDIAN
-  if (table && table->s->db_low_byte_first)
-    tmp=sint8korr(ptr);
-  else
-#endif
-    longlongget(tmp,ptr);
+  tmp= Field_datetime::val_int();
 
   /*
     Avoid problem with slow longlong arithmetic and sprintf
@@ -6124,65 +5787,148 @@ bool Field_datetime::get_date(MYSQL_TIME *ltime, uint fuzzydate)
   ltime->day=		(int) (part1%100);
   ltime->month= 	(int) (part1/100%100);
   ltime->year= 		(int) (part1/10000);
-  return (!(fuzzydate & TIME_FUZZY_DATE) && (!ltime->month || !ltime->day)) ? 1 : 0;
-}
-
-bool Field_datetime::get_time(MYSQL_TIME *ltime)
-{
-  return Field_datetime::get_date(ltime,0);
+  if (!tmp)
+    return fuzzydate & TIME_NO_ZERO_DATE;
+  if (!ltime->month || !ltime->day)
+    return !(fuzzydate & TIME_FUZZY_DATE);
+  return 0;
 }
 
 int Field_datetime::cmp(const uchar *a_ptr, const uchar *b_ptr)
 {
   longlong a,b;
-#ifdef WORDS_BIGENDIAN
-  if (table && table->s->db_low_byte_first)
-  {
-    a=sint8korr(a_ptr);
-    b=sint8korr(b_ptr);
-  }
-  else
-#endif
-  {
-    longlongget(a,a_ptr);
-    longlongget(b,b_ptr);
-  }
+  a=sint8korr(a_ptr);
+  b=sint8korr(b_ptr);
   return ((ulonglong) a < (ulonglong) b) ? -1 :
     ((ulonglong) a > (ulonglong) b) ? 1 : 0;
 }
 
 void Field_datetime::sort_string(uchar *to,uint length __attribute__((unused)))
 {
-#ifdef WORDS_BIGENDIAN
-  if (!table || !table->s->db_low_byte_first)
+  to[0] = ptr[7];
+  to[1] = ptr[6];
+  to[2] = ptr[5];
+  to[3] = ptr[4];
+  to[4] = ptr[3];
+  to[5] = ptr[2];
+  to[6] = ptr[1];
+  to[7] = ptr[0];
+}
+
+
+void Field_datetime::sql_type(String &res) const
+{
+  res.set_ascii(STRING_WITH_LEN("datetime"));
+}
+
+void Field_datetime_hires::store_TIME(MYSQL_TIME *ltime)
+{
+  ulonglong packed= sec_part_shift(pack_time(ltime), dec);
+  store_bigendian(packed, ptr, Field_datetime_hires::pack_length());
+}
+
+int Field_datetime_hires::store_decimal(const my_decimal *d)
+{
+  ulonglong nr;
+  ulong sec_part;
+  int error;
+  MYSQL_TIME ltime;
+  longlong tmp;
+  THD *thd= table->in_use;
+  ErrConvDecimal str(d);
+
+  if (my_decimal2seconds(d, &nr, &sec_part))
   {
-    to[0] = ptr[0];
-    to[1] = ptr[1];
-    to[2] = ptr[2];
-    to[3] = ptr[3];
-    to[4] = ptr[4];
-    to[5] = ptr[5];
-    to[6] = ptr[6];
-    to[7] = ptr[7];
+    tmp= -1;
+    error= 2;
   }
   else
-#endif
-  {
-    to[0] = ptr[7];
-    to[1] = ptr[6];
-    to[2] = ptr[5];
-    to[3] = ptr[4];
-    to[4] = ptr[3];
-    to[5] = ptr[2];
-    to[6] = ptr[1];
-    to[7] = ptr[0];
-  }
+    tmp= number_to_datetime(nr, sec_part, &ltime, (TIME_FUZZY_DATE |
+                                          (thd->variables.sql_mode &
+                                           (MODE_NO_ZERO_IN_DATE |
+                                            MODE_NO_ZERO_DATE |
+                                            MODE_INVALID_DATES))), &error);
+
+  return store_TIME_with_warning(&ltime, &str, error, tmp != -1);
 }
 
+bool Field_datetime_hires::send_binary(Protocol *protocol)
+{
+  MYSQL_TIME ltime;
+  Field_datetime_hires::get_date(&ltime, TIME_FUZZY_DATE);
+  return protocol->store(&ltime, dec);
+}
 
-void Field_datetime::sql_type(String &res) const
+
+double Field_datetime_hires::val_real(void)
 {
-  res.set_ascii(STRING_WITH_LEN("datetime"));
+  MYSQL_TIME ltime;
+  Field_datetime_hires::get_date(&ltime, TIME_FUZZY_DATE);
+  return TIME_to_double(&ltime);
+}
+
+longlong Field_datetime_hires::val_int(void)
+{
+  MYSQL_TIME ltime;
+  Field_datetime_hires::get_date(&ltime, TIME_FUZZY_DATE);
+  return TIME_to_ulonglong_datetime(&ltime);
+}
+
+
+String *Field_datetime_hires::val_str(String *str,
+                                      String *unused __attribute__((unused)))
+{
+  MYSQL_TIME ltime;
+  Field_datetime_hires::get_date(&ltime, TIME_FUZZY_DATE);
+  str->alloc(field_length+1);
+  str->length(field_length);
+  my_datetime_to_str(&ltime, (char*) str->ptr(), dec);
+  str->set_charset(&my_charset_bin);
+  return str;
+}
+
+bool Field_datetime_hires::get_date(MYSQL_TIME *ltime, uint fuzzydate)
+{
+  ulonglong packed= read_bigendian(ptr, Field_datetime_hires::pack_length());
+  unpack_time(sec_part_unshift(packed, dec), ltime);
+  if (!packed)
+    return fuzzydate & TIME_NO_ZERO_DATE;
+  if (!ltime->month || !ltime->day)
+    return !(fuzzydate & TIME_FUZZY_DATE);
+  return 0;
+}
+
+uint32 Field_datetime_hires::pack_length() const
+{
+  return datetime_hires_bytes[dec];
+}
+
+int Field_datetime_hires::cmp(const uchar *a_ptr, const uchar *b_ptr)
+{
+  ulonglong a=read_bigendian(a_ptr, Field_datetime_hires::pack_length());
+  ulonglong b=read_bigendian(b_ptr, Field_datetime_hires::pack_length());
+  return a < b ? -1 : a > b ? 1 : 0;
+}
+
+void Field_datetime_hires::sort_string(uchar *to,
+                                       uint length __attribute__((unused)))
+{
+  DBUG_ASSERT(length == Field_datetime_hires::pack_length());
+  memcpy(to, ptr, length);
+}
+
+
+void Field_datetime_hires::sql_type(String &res) const
+{
+  CHARSET_INFO *cs=res.charset();
+  res.length(cs->cset->snprintf(cs, (char*) res.ptr(), res.alloced_length(),
+                                "datetime(%u)", dec));
+}
+
+void Field_datetime_hires::make_field(Send_field *field)
+{
+  Field::make_field(field);
+  field->decimals= dec;
 }
 
 /****************************************************************************
@@ -6564,9 +6310,7 @@ void Field_string::sql_type(String &res) const
 }
 
 
-uchar *Field_string::pack(uchar *to, const uchar *from,
-                          uint max_length,
-                          bool low_byte_first __attribute__((unused)))
+uchar *Field_string::pack(uchar *to, const uchar *from, uint max_length)
 {
   uint length=      min(field_length,max_length);
   uint local_char_length= max_length/field_charset->mbmaxlen;
@@ -6622,10 +6366,7 @@ uchar *Field_string::pack(uchar *to, const uchar *from,
    @return  New pointer into memory based on from + length of the data
 */
 const uchar *
-Field_string::unpack(uchar *to,
-                     const uchar *from,
-                     uint param_data,
-                     bool low_byte_first __attribute__((unused)))
+Field_string::unpack(uchar *to, const uchar *from, uint param_data)
 {
   uint from_length, length;
 
@@ -7039,9 +6780,7 @@ uint32 Field_varstring::data_length()
   Here the number of length bytes are depending on the given max_length
 */
 
-uchar *Field_varstring::pack(uchar *to, const uchar *from,
-                             uint max_length,
-                             bool low_byte_first __attribute__((unused)))
+uchar *Field_varstring::pack(uchar *to, const uchar *from, uint max_length)
 {
   uint length= length_bytes == 1 ? (uint) *from : uint2korr(from);
   set_if_smaller(max_length, field_length);
@@ -7076,9 +6815,7 @@ uchar *Field_varstring::pack(uchar *to, const uchar *from,
    @return  New pointer into memory based on from + length of the data
 */
 const uchar *
-Field_varstring::unpack(uchar *to, const uchar *from,
-                        uint param_data,
-                        bool low_byte_first __attribute__((unused)))
+Field_varstring::unpack(uchar *to, const uchar *from, uint param_data)
 {
   uint length;
   uint l_bytes= (param_data && (param_data < field_length)) ? 
@@ -7252,103 +6989,15 @@ Field_blob::Field_blob(uchar *ptr_arg, uchar *null_ptr_arg, uchar null_bit_arg,
 }
 
 
-void Field_blob::store_length(uchar *i_ptr, 
-                              uint i_packlength, 
-                              uint32 i_number, 
-                              bool low_byte_first)
+void Field_blob::store_length(uchar *i_ptr, uint i_packlength, uint32 i_number)
 {
-  switch (i_packlength) {
-  case 1:
-    i_ptr[0]= (uchar) i_number;
-    break;
-  case 2:
-#ifdef WORDS_BIGENDIAN
-    if (low_byte_first)
-    {
-      int2store(i_ptr,(unsigned short) i_number);
-    }
-    else
-#endif
-      shortstore(i_ptr,(unsigned short) i_number);
-    break;
-  case 3:
-    int3store(i_ptr,i_number);
-    break;
-  case 4:
-#ifdef WORDS_BIGENDIAN
-    if (low_byte_first)
-    {
-      int4store(i_ptr,i_number);
-    }
-    else
-#endif
-      longstore(i_ptr,i_number);
-  }
+  store_lowendian(i_number, i_ptr, i_packlength);
 }
 
 
-uint32 Field_blob::get_length(const uchar *pos, uint packlength_arg, bool low_byte_first)
+uint32 Field_blob::get_length(const uchar *pos, uint packlength_arg)
 {
-  switch (packlength_arg) {
-  case 1:
-    return (uint32) pos[0];
-  case 2:
-    {
-      uint16 tmp;
-#ifdef WORDS_BIGENDIAN
-      if (low_byte_first)
-	tmp=sint2korr(pos);
-      else
-#endif
-	shortget(tmp,pos);
-      return (uint32) tmp;
-    }
-  case 3:
-    return (uint32) uint3korr(pos);
-  case 4:
-    {
-      uint32 tmp;
-#ifdef WORDS_BIGENDIAN
-      if (low_byte_first)
-	tmp=uint4korr(pos);
-      else
-#endif
-	longget(tmp,pos);
-      return (uint32) tmp;
-    }
-  }
-  /* When expanding this, see also MAX_FIELD_BLOBLENGTH. */
-  return 0;					// Impossible
-}
-
-
-/**
-  Put a blob length field into a record buffer.
-
-  Depending on the maximum length of a blob, its length field is
-  put into 1 to 4 bytes. This is a property of the blob object,
-  described by 'packlength'.
-
-  @param pos                 Pointer into the record buffer.
-  @param length              The length value to put.
-*/
-
-void Field_blob::put_length(uchar *pos, uint32 length)
-{
-  switch (packlength) {
-  case 1:
-    *pos= (char) length;
-    break;
-  case 2:
-    int2store(pos, length);
-    break;
-  case 3:
-    int3store(pos, length);
-    break;
-  case 4:
-    int4store(pos, length);
-    break;
-  }
+  return (uint32)read_lowendian(pos, packlength_arg);
 }
 
 
@@ -7689,20 +7338,7 @@ void Field_blob::sort_string(uchar *to,uint length)
       length-= packlength;
       pos= to+length;
 
-      switch (packlength) {
-      case 1:
-        *pos= (char) blob_length;
-        break;
-      case 2:
-        mi_int2store(pos, blob_length);
-        break;
-      case 3:
-        mi_int3store(pos, blob_length);
-        break;
-      case 4:
-        mi_int4store(pos, blob_length);
-        break;
-      }
+      store_bigendian(blob_length, pos, packlength);
     }
     memcpy(&blob, ptr+packlength, sizeof(char*));
     
@@ -7732,8 +7368,7 @@ void Field_blob::sql_type(String &res) const
   }
 }
 
-uchar *Field_blob::pack(uchar *to, const uchar *from,
-                        uint max_length, bool low_byte_first)
+uchar *Field_blob::pack(uchar *to, const uchar *from, uint max_length)
 {
   uchar *save= ptr;
   ptr= (uchar*) from;
@@ -7744,7 +7379,7 @@ uchar *Field_blob::pack(uchar *to, const uchar *from,
     length given is smaller than the actual length of the blob, we
     just store the initial bytes of the blob.
   */
-  store_length(to, packlength, min(length, max_length), low_byte_first);
+  store_length(to, packlength, min(length, max_length));
 
   /*
     Store the actual blob data, which will occupy 'length' bytes.
@@ -7776,18 +7411,14 @@ uchar *Field_blob::pack(uchar *to, const uchar *from,
 
    @return  New pointer into memory based on from + length of the data
 */
-const uchar *Field_blob::unpack(uchar *to, 
-                                const uchar *from,
-                                uint param_data,
-                                bool low_byte_first)
+const uchar *Field_blob::unpack(uchar *to, const uchar *from, uint param_data)
 {
   DBUG_ENTER("Field_blob::unpack");
-  DBUG_PRINT("enter", ("to: 0x%lx; from: 0x%lx;"
-                       " param_data: %u; low_byte_first: %d",
-                       (ulong) to, (ulong) from, param_data, low_byte_first));
+  DBUG_PRINT("enter", ("to: 0x%lx; from: 0x%lx; param_data: %u",
+                       (ulong) to, (ulong) from, param_data));
   uint const master_packlength=
     param_data > 0 ? param_data & 0xFF : packlength;
-  uint32 const length= get_length(from, master_packlength, low_byte_first);
+  uint32 const length= get_length(from, master_packlength);
   DBUG_DUMP("packed", from, length + master_packlength);
   bitmap_set_bit(table->write_set, field_index);
   store(reinterpret_cast<const char*>(from) + master_packlength,
@@ -7796,6 +7427,7 @@ const uchar *Field_blob::unpack(uchar *to,
   DBUG_RETURN(from + master_packlength + length);
 }
 
+
 uint Field_blob::packed_col_length(const uchar *data_ptr, uint length)
 {
   if (length > 255)
@@ -7933,39 +7565,7 @@ enum ha_base_keytype Field_enum::key_type() const
 
 void Field_enum::store_type(ulonglong value)
 {
-  switch (packlength) {
-  case 1: ptr[0]= (uchar) value;  break;
-  case 2:
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    int2store(ptr,(unsigned short) value);
-  }
-  else
-#endif
-    shortstore(ptr,(unsigned short) value);
-  break;
-  case 3: int3store(ptr,(long) value); break;
-  case 4:
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    int4store(ptr,value);
-  }
-  else
-#endif
-    longstore(ptr,(long) value);
-  break;
-  case 8:
-#ifdef WORDS_BIGENDIAN
-  if (table->s->db_low_byte_first)
-  {
-    int8store(ptr,value);
-  }
-  else
-#endif
-    longlongstore(ptr,value); break;
-  }
+  store_lowendian(value, ptr, packlength);
 }
 
 
@@ -8051,46 +7651,7 @@ double Field_enum::val_real(void)
 longlong Field_enum::val_int(void)
 {
   ASSERT_COLUMN_MARKED_FOR_READ;
-  switch (packlength) {
-  case 1:
-    return (longlong) ptr[0];
-  case 2:
-  {
-    uint16 tmp;
-#ifdef WORDS_BIGENDIAN
-    if (table->s->db_low_byte_first)
-      tmp=sint2korr(ptr);
-    else
-#endif
-      shortget(tmp,ptr);
-    return (longlong) tmp;
-  }
-  case 3:
-    return (longlong) uint3korr(ptr);
-  case 4:
-  {
-    uint32 tmp;
-#ifdef WORDS_BIGENDIAN
-    if (table->s->db_low_byte_first)
-      tmp=uint4korr(ptr);
-    else
-#endif
-      longget(tmp,ptr);
-    return (longlong) tmp;
-  }
-  case 8:
-  {
-    longlong tmp;
-#ifdef WORDS_BIGENDIAN
-    if (table->s->db_low_byte_first)
-      tmp=sint8korr(ptr);
-    else
-#endif
-      longlongget(tmp,ptr);
-    return tmp;
-  }
-  }
-  return 0;					// impossible
+  return read_lowendian(ptr, packlength);
 }
 
 
@@ -8399,51 +7960,20 @@ uint Field_enum::is_equal(Create_field *new_field)
 }
 
 
-uchar *Field_enum::pack(uchar *to, const uchar *from,
-                        uint max_length, bool low_byte_first)
+uchar *Field_enum::pack(uchar *to, const uchar *from, uint max_length)
 {
   DBUG_ENTER("Field_enum::pack");
   DBUG_PRINT("debug", ("packlength: %d", packlength));
   DBUG_DUMP("from", from, packlength);
-
-  switch (packlength)
-  {
-  case 1:
-    *to = *from;
-    DBUG_RETURN(to + 1);
-  case 2: DBUG_RETURN(pack_int16(to, from, low_byte_first));
-  case 3: DBUG_RETURN(pack_int24(to, from, low_byte_first));
-  case 4: DBUG_RETURN(pack_int32(to, from, low_byte_first));
-  case 8: DBUG_RETURN(pack_int64(to, from, low_byte_first));
-  default:
-    DBUG_ASSERT(0);
-  }
-  MY_ASSERT_UNREACHABLE();
-  DBUG_RETURN(NULL);
+  DBUG_RETURN(pack_int(to, from, packlength));
 }
 
-const uchar *Field_enum::unpack(uchar *to, const uchar *from,
-                                uint param_data, bool low_byte_first)
+const uchar *Field_enum::unpack(uchar *to, const uchar *from, uint param_data)
 {
   DBUG_ENTER("Field_enum::unpack");
   DBUG_PRINT("debug", ("packlength: %d", packlength));
   DBUG_DUMP("from", from, packlength);
-
-  switch (packlength)
-  {
-  case 1:
-    *to = *from;
-    DBUG_RETURN(from + 1);
-
-  case 2: DBUG_RETURN(unpack_int16(to, from, low_byte_first));
-  case 3: DBUG_RETURN(unpack_int24(to, from, low_byte_first));
-  case 4: DBUG_RETURN(unpack_int32(to, from, low_byte_first));
-  case 8: DBUG_RETURN(unpack_int64(to, from, low_byte_first));
-  default:
-    DBUG_ASSERT(0);
-  }
-  MY_ASSERT_UNREACHABLE();
-  DBUG_RETURN(NULL);
+  DBUG_RETURN(unpack_int(to, from, packlength));
 }
 
 
@@ -8892,8 +8422,7 @@ void Field_bit::sql_type(String &res) const
 
 
 uchar *
-Field_bit::pack(uchar *to, const uchar *from, uint max_length,
-                bool low_byte_first __attribute__((unused)))
+Field_bit::pack(uchar *to, const uchar *from, uint max_length)
 {
   DBUG_ASSERT(max_length > 0);
   uint length;
@@ -8940,8 +8469,7 @@ Field_bit::pack(uchar *to, const uchar *from, uint max_length,
    @return  New pointer into memory based on from + length of the data
 */
 const uchar *
-Field_bit::unpack(uchar *to, const uchar *from, uint param_data,
-                  bool low_byte_first __attribute__((unused)))
+Field_bit::unpack(uchar *to, const uchar *from, uint param_data)
 {
   DBUG_ENTER("Field_bit::unpack");
   DBUG_PRINT("enter", ("to: %p, from: %p, param_data: 0x%x",
@@ -9492,28 +9020,14 @@ bool Create_field::init(THD *thd, char *fld_name, enum_field_types fld_type,
     }
     break;
   case MYSQL_TYPE_TIMESTAMP:
-    if (fld_length == NULL)
-    {
-      length= MAX_DATETIME_WIDTH;
-    }
-    else if (length != MAX_DATETIME_WIDTH)
+    if (length > MAX_DATETIME_PRECISION)
     {
-      /*
-        We support only even TIMESTAMP lengths less or equal than 14
-        and 19 as length of 4.1 compatible representation.  Silently 
-        shrink it to MAX_DATETIME_COMPRESSED_WIDTH.
-      */
-      DBUG_ASSERT(MAX_DATETIME_COMPRESSED_WIDTH < UINT_MAX);
-      if (length != UINT_MAX)  /* avoid overflow; is safe because of min() */
-        length= ((length+1)/2)*2;
-      length= min(length, MAX_DATETIME_COMPRESSED_WIDTH);
+      my_error(ER_TOO_BIG_PRECISION, MYF(0), length, fld_name,
+               MAX_DATETIME_PRECISION);
+      DBUG_RETURN(TRUE);
     }
-    flags|= ZEROFILL_FLAG | UNSIGNED_FLAG;
-    /*
-      Since we silently rewrite down to MAX_DATETIME_COMPRESSED_WIDTH bytes,
-      the parser should not raise errors unless bizzarely large. 
-     */
-    max_field_charlength= UINT_MAX;
+    length+= MAX_DATETIME_WIDTH + (length ? 1 : 0);
+    flags|= UNSIGNED_FLAG;
 
     if (fld_default_value)
     {
@@ -9561,10 +9075,22 @@ bool Create_field::init(THD *thd, char *fld_name, enum_field_types fld_type,
     length= MAX_DATE_WIDTH;
     break;
   case MYSQL_TYPE_TIME:
-    length= 10;
+    if (length > MAX_DATETIME_PRECISION)
+    {
+      my_error(ER_TOO_BIG_PRECISION, MYF(0), length, fld_name,
+               MAX_DATETIME_PRECISION);
+      DBUG_RETURN(TRUE);
+    }
+    length+= MIN_TIME_WIDTH + (length ? 1 : 0);
     break;
   case MYSQL_TYPE_DATETIME:
-    length= MAX_DATETIME_WIDTH;
+    if (length > MAX_DATETIME_PRECISION)
+    {
+      my_error(ER_TOO_BIG_PRECISION, MYF(0), length, fld_name,
+               MAX_DATETIME_PRECISION);
+      DBUG_RETURN(TRUE);
+    }
+    length+= MAX_DATETIME_WIDTH + (length ? 1 : 0);
     break;
   case MYSQL_TYPE_SET:
     {
@@ -9684,14 +9210,22 @@ uint32 calc_pack_length(enum_field_types type,uint32 length)
   case MYSQL_TYPE_TINY	: return 1;
   case MYSQL_TYPE_SHORT : return 2;
   case MYSQL_TYPE_INT24:
-  case MYSQL_TYPE_NEWDATE:
-  case MYSQL_TYPE_TIME:   return 3;
+  case MYSQL_TYPE_NEWDATE: return 3;
+  case MYSQL_TYPE_TIME:   return length > MIN_TIME_WIDTH
+                            ? time_hires_bytes[length - 1 - MIN_TIME_WIDTH]
+                            : 3;
   case MYSQL_TYPE_TIMESTAMP:
+                          return length > MAX_DATETIME_WIDTH
+                            ? 4 + sec_part_bytes[length - 1 - MAX_DATETIME_WIDTH]
+                            : 4;
   case MYSQL_TYPE_DATE:
   case MYSQL_TYPE_LONG	: return 4;
   case MYSQL_TYPE_FLOAT : return sizeof(float);
   case MYSQL_TYPE_DOUBLE: return sizeof(double);
   case MYSQL_TYPE_DATETIME:
+                          return length > MAX_DATETIME_WIDTH
+                            ? datetime_hires_bytes[length - 1 - MAX_DATETIME_WIDTH]
+                            : 8;
   case MYSQL_TYPE_LONGLONG: return 8;	/* Don't crash if no longlong */
   case MYSQL_TYPE_NULL	: return 0;
   case MYSQL_TYPE_TINY_BLOB:	return 1+portable_sizeof_char_ptr;
@@ -9722,6 +9256,7 @@ uint pack_length_to_packflag(uint type)
   return 0;					// This shouldn't happen
 }
 
+
 Field *make_field(TABLE_SHARE *share, uchar *ptr, uint32 field_length,
 		  uchar *null_pos, uchar null_bit,
 		  uint pack_flag,
@@ -9871,9 +9406,12 @@ Field *make_field(TABLE_SHARE *share, uchar *ptr, uint32 field_length,
 			      f_is_zerofill(pack_flag) != 0,
 			      f_is_dec(pack_flag) == 0);
   case MYSQL_TYPE_TIMESTAMP:
-    return new Field_timestamp(ptr,field_length, null_pos, null_bit,
-                               unireg_check, field_name, share,
-                               field_charset);
+  {
+    uint dec= field_length > MAX_DATETIME_WIDTH ?
+                       field_length - MAX_DATETIME_WIDTH - 1: 0;
+    return new_Field_timestamp(ptr, null_pos, null_bit, unireg_check,
+                               field_name, share, dec, field_charset);
+  }
   case MYSQL_TYPE_YEAR:
     return new Field_year(ptr,field_length,null_pos,null_bit,
 			  unireg_check, field_name);
@@ -9884,11 +9422,19 @@ Field *make_field(TABLE_SHARE *share, uchar *ptr, uint32 field_length,
     return new Field_newdate(ptr,null_pos,null_bit,
 			     unireg_check, field_name, field_charset);
   case MYSQL_TYPE_TIME:
-    return new Field_time(ptr,null_pos,null_bit,
-			  unireg_check, field_name, field_charset);
+  {
+    uint dec= field_length > MIN_TIME_WIDTH ?
+                       field_length - MIN_TIME_WIDTH - 1: 0;
+    return new_Field_time(ptr, null_pos, null_bit, unireg_check,
+                              field_name, dec, field_charset);
+  }
   case MYSQL_TYPE_DATETIME:
-    return new Field_datetime(ptr,null_pos,null_bit,
-			      unireg_check, field_name, field_charset);
+  {
+    uint dec= field_length > MAX_DATETIME_WIDTH ?
+                       field_length - MAX_DATETIME_WIDTH - 1: 0;
+    return new_Field_datetime(ptr, null_pos, null_bit, unireg_check,
+                              field_name, dec, field_charset);
+  }
   case MYSQL_TYPE_NULL:
     return new Field_null(ptr, field_length, unireg_check, field_name,
                           field_charset);
@@ -9997,6 +9543,7 @@ Create_field::Create_field(Field *old_field,Field *orig_field)
   }
 }
 
+
 /**
   maximum possible character length for blob.
   
@@ -10086,14 +9633,9 @@ uint32 Field_blob::max_display_length()
 
     if count_cuted_fields == CHECK_FIELD_IGNORE then we ignore notes.
     This allows us to avoid notes in optimisation, like convert_constant_item().
-
-  @retval
-    1 if count_cuted_fields == CHECK_FIELD_IGNORE and error level is not NOTE
-  @retval
-    0 otherwise
 */
 
-bool 
+void 
 Field::set_warning(MYSQL_ERROR::enum_warning_level level, uint code,
                    int cuted_increment)
 {
@@ -10107,9 +9649,7 @@ Field::set_warning(MYSQL_ERROR::enum_warning_level level, uint code,
     thd->cuted_fields+= cuted_increment;
     push_warning_printf(thd, level, code, ER(code), field_name,
                         thd->warning_info->current_row_for_warning());
-    return 0;
   }
-  return level >= MYSQL_ERROR::WARN_LEVEL_WARN;
 }
 
 
@@ -10119,7 +9659,6 @@ Field::set_warning(MYSQL_ERROR::enum_warning_level level, uint code,
   @param level            level of message (Note/Warning/Error)
   @param code             error code of message to be produced
   @param str              string value which we tried to save
-  @param str_length       length of string which we tried to save
   @param ts_type          type of datetime value (datetime/date/time)
   @param cuted_increment  whenever we should increase cut fields count or not
 
@@ -10127,80 +9666,42 @@ Field::set_warning(MYSQL_ERROR::enum_warning_level level, uint code,
     This function will always produce some warning but won't increase cut
     fields counter if count_cuted_fields ==FIELD_CHECK_IGNORE for current
     thread.
-*/
-
-void 
-Field::set_datetime_warning(MYSQL_ERROR::enum_warning_level level, uint code, 
-                            const char *str, uint str_length, 
-                            timestamp_type ts_type, int cuted_increment)
-{
-  THD *thd= table ? table->in_use : current_thd;
-  if ((thd->really_abort_on_warning() &&
-       level >= MYSQL_ERROR::WARN_LEVEL_WARN) ||
-      set_warning(level, code, cuted_increment))
-    make_truncated_value_warning(thd, level, str, str_length, ts_type,
-                                 field_name);
-}
-
-
-/**
-  Produce warning or note about integer datetime value saved into field.
 
-  @param level            level of message (Note/Warning/Error)
-  @param code             error code of message to be produced
-  @param nr               numeric value which we tried to save
-  @param ts_type          type of datetime value (datetime/date/time)
-  @param cuted_increment  whenever we should increase cut fields count or not
+    See also bug#2336
 
-  @note
-    This function will always produce some warning but won't increase cut
-    fields counter if count_cuted_fields == FIELD_CHECK_IGNORE for current
-    thread.
 */
 
-void 
-Field::set_datetime_warning(MYSQL_ERROR::enum_warning_level level, uint code, 
-                            longlong nr, timestamp_type ts_type,
-                            int cuted_increment)
+void Field::set_datetime_warning(MYSQL_ERROR::enum_warning_level level,
+                                 uint code, const ErrConv *str,
+                                 timestamp_type ts_type, int cuted_increment)
 {
-  THD *thd= table ? table->in_use : current_thd;
-  if (thd->really_abort_on_warning() ||
-      set_warning(level, code, cuted_increment))
-  {
-    char str_nr[22];
-    char *str_end= longlong10_to_str(nr, str_nr, -10);
-    make_truncated_value_warning(thd, level, str_nr, (uint) (str_end - str_nr), 
-                                 ts_type, field_name);
-  }
+  THD *thd= table ? table->in_use : current_thd; /* table may be NULL */
+  if (thd->really_abort_on_warning() && level >= MYSQL_ERROR::WARN_LEVEL_WARN)
+    make_truncated_value_warning(thd, level, str, ts_type, field_name);
+  else /* not aborting, or below WARN level: plain field warning */
+    set_warning(level, code, cuted_increment);
 }
 
 
-/**
-  Produce warning or note about double datetime data saved into field.
-
-  @param level            level of message (Note/Warning/Error)
-  @param code             error code of message to be produced
-  @param nr               double value which we tried to save
-  @param ts_type          type of datetime value (datetime/date/time)
-
-  @note
-    This function will always produce some warning but won't increase cut
-    fields counter if count_cuted_fields == FIELD_CHECK_IGNORE for current
-    thread.
+/*
+  @brief
+  Return possible keys for a field
+
+  @details
+  Return bit map of keys over this field which can be used by the range
+  optimizer. For a field of a generic table such keys are all keys that start
+  with this field. For a field of a materialized derived table/view such keys
+  are all keys in which this field takes part. This is less restrictive as
+  keys for a materialized derived table/view are generated on the fly from
+  present fields, thus the case when a field for the beginning of a key is
+  absent is impossible.
+
+  @return map of possible keys
 */
 
-void 
-Field::set_datetime_warning(MYSQL_ERROR::enum_warning_level level, uint code, 
-                            double nr, timestamp_type ts_type)
+key_map Field::get_possible_keys() /* key bitmap chosen by table kind */
 {
-  THD *thd= table ? table->in_use : current_thd;
-  if (thd->really_abort_on_warning() ||
-      set_warning(level, code, 1))
-  {
-    /* DBL_DIG is enough to print '-[digits].E+###' */
-    char str_nr[DBL_DIG + 8];
-    uint str_len= sprintf(str_nr, "%g", nr);
-    make_truncated_value_warning(thd, level, str_nr, str_len, ts_type,
-                                 field_name);
-  }
+  DBUG_ASSERT(table->pos_in_table_list); /* dereferenced on the next line */
+  return (table->pos_in_table_list->is_materialized_derived() ?
+          part_of_key : key_start); /* materialized derived: part_of_key */
 }
diff --git a/sql/field.h b/sql/field.h
index 0282deb9a3d..728eb6f3f49 100644
--- a/sql/field.h
+++ b/sql/field.h
@@ -1,7 +1,7 @@
 #ifndef FIELD_INCLUDED
 #define FIELD_INCLUDED
-
-/* Copyright (c) 2000, 2010 Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2010 Oracle and/or its affiliates.
+   Copyright (c) 2009-2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -31,8 +31,6 @@
 #include "my_decimal.h"                         /* my_decimal */
 #include "sql_error.h"                          /* MYSQL_ERROR */
 
-#define DATETIME_DEC                     6
-
 class Send_field;
 class Protocol;
 class Create_field;
@@ -65,8 +63,13 @@ enum Derivation
 #define my_charset_numeric      my_charset_latin1
 #define MY_REPERTOIRE_NUMERIC   MY_REPERTOIRE_ASCII
 
+struct ha_field_option_struct;
+
 struct st_cache_field;
 int field_conv(Field *to,Field *from);
+int truncate_double(double *nr, uint field_length, uint dec,
+                    bool unsigned_flag, double max_value);
+longlong double_to_longlong(double nr, bool unsigned_flag, bool *error);
 
 inline uint get_enum_pack_length(int elements)
 {
@@ -165,10 +168,11 @@ public:
   */
   TABLE *table;                                 // Pointer for table
   TABLE *orig_table;                            // Pointer to original table
-  const char	**table_name, *field_name;
+  const char * const *table_name;
+  const char *field_name;
   /** reference to the list of options or NULL */
   engine_option_value *option_list;
-  void *option_struct;                  /* structure with parsed options */
+  ha_field_option_struct *option_struct;   /* structure with parsed options */
   LEX_STRING	comment;
   /* Field is part of the following keys */
   key_map	key_start, part_of_key, part_of_key_not_clustered;
@@ -231,7 +235,9 @@ public:
   virtual int  store(double nr)=0;
   virtual int  store(longlong nr, bool unsigned_val)=0;
   virtual int  store_decimal(const my_decimal *d)=0;
-  virtual int store_time(MYSQL_TIME *ltime, timestamp_type t_type);
+  virtual int  store_time_dec(MYSQL_TIME *ltime, uint dec);
+  int store_time(MYSQL_TIME *ltime)
+  { return store_time_dec(ltime, TIME_SECOND_PART_DIGITS); }
   int store(const char *to, uint length, CHARSET_INFO *cs,
             enum_check_fields check_level);
   virtual double val_real(void)=0;
@@ -259,7 +265,6 @@ public:
   virtual bool str_needs_quotes() { return FALSE; }
   virtual Item_result result_type () const=0;
   virtual Item_result cmp_type () const { return result_type(); }
-  virtual Item_result cast_to_int_type () const { return result_type(); }
   static bool type_can_have_key_part(enum_field_types);
   static enum_field_types field_type_merge(enum_field_types, enum_field_types);
   static Item_result result_merge_type(enum_field_types);
@@ -405,14 +410,6 @@ public:
   virtual void make_field(Send_field *);
   virtual void sort_string(uchar *buff,uint length)=0;
   virtual bool optimize_range(uint idx, uint part);
-  /*
-    This should be true for fields which, when compared with constant
-    items, can be casted to longlong. In this case we will at 'fix_fields'
-    stage cast the constant items to longlongs and at the execution stage
-    use field->val_int() for comparison.  Used to optimize clauses like
-    'a_column BETWEEN date_const, date_const'.
-  */
-  virtual bool can_be_compared_as_longlong() const { return FALSE; }
   virtual void free() {}
   virtual Field *new_field(MEM_ROOT *root, TABLE *new_table,
                            bool keep_type);
@@ -496,27 +493,25 @@ public:
   }
   virtual bool send_binary(Protocol *protocol);
 
-  virtual uchar *pack(uchar *to, const uchar *from,
-                      uint max_length, bool low_byte_first);
+  virtual uchar *pack(uchar *to, const uchar *from, uint max_length);
   /**
      @overload Field::pack(uchar*, const uchar*, uint, bool)
   */
   uchar *pack(uchar *to, const uchar *from)
   {
     DBUG_ENTER("Field::pack");
-    uchar *result= this->pack(to, from, UINT_MAX, table->s->db_low_byte_first);
+    uchar *result= this->pack(to, from, UINT_MAX);
     DBUG_RETURN(result);
   }
 
-  virtual const uchar *unpack(uchar* to, const uchar *from,
-                              uint param_data, bool low_byte_first);
+  virtual const uchar *unpack(uchar* to, const uchar *from, uint param_data);
   /**
      @overload Field::unpack(uchar*, const uchar*, uint, bool)
   */
   const uchar *unpack(uchar* to, const uchar *from)
   {
     DBUG_ENTER("Field::unpack");
-    const uchar *result= unpack(to, from, 0U, table->s->db_low_byte_first);
+    const uchar *result= unpack(to, from, 0);
     DBUG_RETURN(result);
   }
 
@@ -532,7 +527,7 @@ public:
   void copy_from_tmp(int offset);
   uint fill_cache_field(struct st_cache_field *copy);
   virtual bool get_date(MYSQL_TIME *ltime,uint fuzzydate);
-  virtual bool get_time(MYSQL_TIME *ltime);
+  bool get_time(MYSQL_TIME *ltime) { return get_date(ltime, TIME_TIME_ONLY); }
   virtual CHARSET_INFO *charset(void) const { return &my_charset_bin; }
   virtual CHARSET_INFO *charset_for_protocol(void) const
   { return binary() ? &my_charset_bin : charset(); }
@@ -543,25 +538,25 @@ public:
   { return DERIVATION_IMPLICIT; }
   virtual uint repertoire(void) const { return MY_REPERTOIRE_UNICODE30; }
   virtual void set_derivation(enum Derivation derivation_arg) { }
-  bool set_warning(MYSQL_ERROR::enum_warning_level, unsigned int code,
+  virtual int set_time() { return 1; }
+  void set_warning(MYSQL_ERROR::enum_warning_level, unsigned int code,
                    int cuted_increment);
   void set_datetime_warning(MYSQL_ERROR::enum_warning_level, uint code, 
-                            const char *str, uint str_len,
-                            timestamp_type ts_type, int cuted_increment);
-  void set_datetime_warning(MYSQL_ERROR::enum_warning_level, uint code, 
-                            longlong nr, timestamp_type ts_type,
+                            const ErrConv *str, timestamp_type ts_type,
                             int cuted_increment);
-  void set_datetime_warning(MYSQL_ERROR::enum_warning_level, const uint code, 
-                            double nr, timestamp_type ts_type);
   inline bool check_overflow(int op_result)
   {
     return (op_result == E_DEC_OVERFLOW);
   }
   int warn_if_overflow(int op_result);
+  void set_table_name(String *alias) /* point table_name at alias's buffer */
+  {
+    table_name= &alias->Ptr; /* address of the Ptr member; nothing is copied */
+  }
   void init(TABLE *table_arg)
   {
     orig_table= table= table_arg;
-    table_name= &table_arg->alias;
+    set_table_name(&table_arg->alias);
   }
 
   /* maximum possible display length */
@@ -590,8 +585,16 @@ public:
     DBUG_ASSERT(0);
     return GEOM_GEOMETRY;
   }
+
+  key_map get_possible_keys();
+
   /* Hash value */
   virtual void hash(ulong *nr, ulong *nr2);
+
+  /* Check whether the field can be used as a join attribute in hash join */
+  virtual bool hash_join_is_possible() { return TRUE; }
+  virtual bool eq_cmp_as_binary() { return TRUE; }
+
   friend int cre_myisam(char * name, register TABLE *form, uint options,
 			ulonglong auto_increment_value);
   friend class Copy_field;
@@ -635,143 +638,34 @@ private:
   { return 0; }
 
 protected:
-  static void handle_int16(uchar *to, const uchar *from,
-                           bool low_byte_first_from, bool low_byte_first_to)
-  {
-    int16 val;
-#ifdef WORDS_BIGENDIAN
-    if (low_byte_first_from)
-      val = sint2korr(from);
-    else
-#endif
-      shortget(val, from);
-
-#ifdef WORDS_BIGENDIAN
-    if (low_byte_first_to)
-      int2store(to, val);
-    else
-#endif
-      shortstore(to, val);
-  }
-
-  static void handle_int24(uchar *to, const uchar *from,
-                           bool low_byte_first_from, bool low_byte_first_to)
-  {
-    int32 val;
-#ifdef WORDS_BIGENDIAN
-    if (low_byte_first_from)
-      val = sint3korr(from);
-    else
-#endif
-      val= (from[0] << 16) + (from[1] << 8) + from[2];
-
-#ifdef WORDS_BIGENDIAN
-    if (low_byte_first_to)
-      int2store(to, val);
-    else
-#endif
-    {
-      to[0]= 0xFF & (val >> 16);
-      to[1]= 0xFF & (val >> 8);
-      to[2]= 0xFF & val;
-    }
-  }
-
-  /*
-    Helper function to pack()/unpack() int32 values
-  */
-  static void handle_int32(uchar *to, const uchar *from,
-                           bool low_byte_first_from, bool low_byte_first_to)
-  {
-    int32 val;
-#ifdef WORDS_BIGENDIAN
-    if (low_byte_first_from)
-      val = sint4korr(from);
-    else
-#endif
-      longget(val, from);
-
-#ifdef WORDS_BIGENDIAN
-    if (low_byte_first_to)
-      int4store(to, val);
-    else
-#endif
-      longstore(to, val);
-  }
-
-  /*
-    Helper function to pack()/unpack() int64 values
-  */
-  static void handle_int64(uchar* to, const uchar *from,
-                           bool low_byte_first_from, bool low_byte_first_to)
-  {
-    int64 val;
-#ifdef WORDS_BIGENDIAN
-    if (low_byte_first_from)
-      val = sint8korr(from);
-    else
-#endif
-      longlongget(val, from);
-
-#ifdef WORDS_BIGENDIAN
-    if (low_byte_first_to)
-      int8store(to, val);
-    else
-#endif
-      longlongstore(to, val);
-  }
-
-  uchar *pack_int16(uchar *to, const uchar *from, bool low_byte_first_to)
-  {
-    handle_int16(to, from, table->s->db_low_byte_first, low_byte_first_to);
-    return to  + sizeof(int16);
-  }
-
-  const uchar *unpack_int16(uchar* to, const uchar *from,
-                            bool low_byte_first_from)
-  {
-    handle_int16(to, from, low_byte_first_from, table->s->db_low_byte_first);
-    return from + sizeof(int16);
-  }
-
-  uchar *pack_int24(uchar *to, const uchar *from, bool low_byte_first_to)
-  {
-    handle_int24(to, from, table->s->db_low_byte_first, low_byte_first_to);
-    return to + 3;
-  }
-
-  const uchar *unpack_int24(uchar* to, const uchar *from,
-                            bool low_byte_first_from)
-  {
-    handle_int24(to, from, low_byte_first_from, table->s->db_low_byte_first);
-    return from + 3;
-  }
-
-  uchar *pack_int32(uchar *to, const uchar *from, bool low_byte_first_to)
-  {
-    handle_int32(to, from, table->s->db_low_byte_first, low_byte_first_to);
-    return to  + sizeof(int32);
-  }
-
-  const uchar *unpack_int32(uchar* to, const uchar *from,
-                            bool low_byte_first_from)
+  uchar *pack_int(uchar *to, const uchar *from, size_t size) /* copy size bytes */
   {
-    handle_int32(to, from, low_byte_first_from, table->s->db_low_byte_first);
-    return from + sizeof(int32);
+    memcpy(to, from, size); /* plain byte copy, no byte-order conversion */
+    return to + size;       /* first byte after the written value */
   }
 
-  uchar *pack_int64(uchar* to, const uchar *from, bool low_byte_first_to)
+  const uchar *unpack_int(uchar* to, const uchar *from, size_t size) /* copy size bytes */
   {
-    handle_int64(to, from, table->s->db_low_byte_first, low_byte_first_to);
-    return to + sizeof(int64);
+    memcpy(to, from, size); /* plain byte copy, no byte-order conversion */
+    return from + size;     /* first source byte after the consumed value */
   }
 
-  const uchar *unpack_int64(uchar* to, const uchar *from,
-                            bool low_byte_first_from)
-  {
-    handle_int64(to, from, low_byte_first_from, table->s->db_low_byte_first);
-    return from + sizeof(int64);
-  }
+  uchar *pack_int16(uchar *to, const uchar *from)
+  { return pack_int(to, from, 2); }
+  const uchar *unpack_int16(uchar* to, const uchar *from)
+  { return unpack_int(to, from, 2); }
+  uchar *pack_int24(uchar *to, const uchar *from)
+  { return pack_int(to, from, 3); }
+  const uchar *unpack_int24(uchar* to, const uchar *from)
+  { return unpack_int(to, from, 3); }
+  uchar *pack_int32(uchar *to, const uchar *from)
+  { return pack_int(to, from, 4); }
+  const uchar *unpack_int32(uchar* to, const uchar *from)
+  { return unpack_int(to, from, 4); }
+  uchar *pack_int64(uchar* to, const uchar *from)
+  { return pack_int(to, from, 8); }
+  const uchar *unpack_int64(uchar* to, const uchar *from)
+  { return unpack_int(to, from, 8); }
 
   bool field_flags_are_binary()
   {
@@ -789,7 +683,7 @@ public:
 	    uchar null_bit_arg, utype unireg_check_arg,
 	    const char *field_name_arg,
             uint8 dec_arg, bool zero_arg, bool unsigned_arg);
-  Item_result result_type () const { return REAL_RESULT; }
+  enum Item_result result_type () const { return INT_RESULT; }
   enum Derivation derivation(void) const { return DERIVATION_NUMERIC; }
   uint repertoire(void) const { return MY_REPERTOIRE_NUMERIC; }
   CHARSET_INFO *charset(void) const { return &my_charset_numeric; }
@@ -810,6 +704,7 @@ public:
                           field_metadata, length));
     return length;
   }
+  int  store_time_dec(MYSQL_TIME *ltime, uint dec);
   int check_int(CHARSET_INFO *cs, const char *str, int length,
                 const char *int_end, int error);
   bool get_int(CHARSET_INFO *cs, const char *from, uint len, 
@@ -859,9 +754,9 @@ public:
   my_decimal *val_decimal(my_decimal *);
   virtual bool str_needs_quotes() { return TRUE; }
   uint is_equal(Create_field *new_field);
+  bool eq_cmp_as_binary() { return test(flags & BINARY_FLAG); }
 };
 
-
 /* base class for Field_string, Field_varstring and Field_blob */
 
 class Field_longstr :public Field_str
@@ -894,15 +789,13 @@ public:
                field_name_arg, dec_arg, zero_arg, unsigned_arg),
     not_fixed(dec_arg >= NOT_FIXED_DEC)
     {}
+  Item_result result_type () const { return REAL_RESULT; }
   int store_decimal(const my_decimal *);
+  int  store_time_dec(MYSQL_TIME *ltime, uint dec);
+  bool get_date(MYSQL_TIME *ltime,uint fuzzydate);
   my_decimal *val_decimal(my_decimal *);
-  int truncate(double *nr, double max_length);
   uint32 max_display_length() { return field_length; }
   uint size_of() const { return sizeof(*this); }
-  virtual const uchar *unpack(uchar* to, const uchar *from,
-                              uint param_data, bool low_byte_first);
-  virtual uchar *pack(uchar* to, const uchar *from,
-                      uint max_length, bool low_byte_first);
 };
 
 
@@ -931,15 +824,13 @@ public:
   void overflow(bool negative);
   bool zero_pack() const { return 0; }
   void sql_type(String &str) const;
-  virtual const uchar *unpack(uchar* to, const uchar *from,
-                              uint param_data, bool low_byte_first)
+  virtual const uchar *unpack(uchar* to, const uchar *from, uint param_data)
   {
-    return Field::unpack(to, from, param_data, low_byte_first);
+    return Field::unpack(to, from, param_data);
   }
-  virtual uchar *pack(uchar* to, const uchar *from,
-                      uint max_length, bool low_byte_first)
+  virtual uchar *pack(uchar* to, const uchar *from, uint max_length)
   {
-    return Field::pack(to, from, max_length, low_byte_first);
+    return Field::pack(to, from, max_length);
   }
 };
 
@@ -974,7 +865,7 @@ public:
   int  store(const char *to, uint length, CHARSET_INFO *charset);
   int  store(double nr);
   int  store(longlong nr, bool unsigned_val);
-  int store_time(MYSQL_TIME *ltime, timestamp_type t_type);
+  int  store_time_dec(MYSQL_TIME *ltime, uint dec);
   int  store_decimal(const my_decimal *);
   double val_real(void);
   longlong val_int(void);
@@ -992,8 +883,7 @@ public:
   bool compatible_field_size(uint field_metadata, Relay_log_info *rli,
                              uint16 mflags, int *order_var);
   uint is_equal(Create_field *new_field);
-  virtual const uchar *unpack(uchar* to, const uchar *from,
-                              uint param_data, bool low_byte_first);
+  virtual const uchar *unpack(uchar* to, const uchar *from, uint param_data);
   static Field *create_from_item (Item *);
 };
 
@@ -1008,7 +898,6 @@ public:
 	       unireg_check_arg, field_name_arg,
 	       0, zero_arg,unsigned_arg)
     {}
-  enum Item_result result_type () const { return INT_RESULT; }
   enum_field_types type() const { return MYSQL_TYPE_TINY;}
   enum ha_base_keytype key_type() const
     { return unsigned_flag ? HA_KEYTYPE_BINARY : HA_KEYTYPE_INT8; }
@@ -1026,15 +915,13 @@ public:
   void sql_type(String &str) const;
   uint32 max_display_length() { return 4; }
 
-  virtual uchar *pack(uchar* to, const uchar *from,
-                      uint max_length, bool low_byte_first)
+  virtual uchar *pack(uchar* to, const uchar *from, uint max_length)
   {
     *to= *from;
     return to + 1;
   }
 
-  virtual const uchar *unpack(uchar* to, const uchar *from,
-                              uint param_data, bool low_byte_first)
+  virtual const uchar *unpack(uchar* to, const uchar *from, uint param_data)
   {
     *to= *from;
     return from + 1;
@@ -1057,7 +944,6 @@ public:
     :Field_num((uchar*) 0, len_arg, maybe_null_arg ? (uchar*) "": 0,0,
 	       NONE, field_name_arg, 0, 0, unsigned_arg)
     {}
-  enum Item_result result_type () const { return INT_RESULT; }
   enum_field_types type() const { return MYSQL_TYPE_SHORT;}
   enum ha_base_keytype key_type() const
     { return unsigned_flag ? HA_KEYTYPE_USHORT_INT : HA_KEYTYPE_SHORT_INT;}
@@ -1075,17 +961,11 @@ public:
   void sql_type(String &str) const;
   uint32 max_display_length() { return 6; }
 
-  virtual uchar *pack(uchar* to, const uchar *from,
-                      uint max_length, bool low_byte_first)
-  {
-    return pack_int16(to, from, low_byte_first);
-  }
+  virtual uchar *pack(uchar* to, const uchar *from, uint max_length)
+  { return pack_int16(to, from); }
 
-  virtual const uchar *unpack(uchar* to, const uchar *from,
-                              uint param_data, bool low_byte_first)
-  {
-    return unpack_int16(to, from, low_byte_first);
-  }
+  virtual const uchar *unpack(uchar* to, const uchar *from, uint param_data)
+  { return unpack_int16(to, from); }
 };
 
 class Field_medium :public Field_num {
@@ -1098,7 +978,6 @@ public:
 	       unireg_check_arg, field_name_arg,
 	       0, zero_arg,unsigned_arg)
     {}
-  enum Item_result result_type () const { return INT_RESULT; }
   enum_field_types type() const { return MYSQL_TYPE_INT24;}
   enum ha_base_keytype key_type() const
     { return unsigned_flag ? HA_KEYTYPE_UINT24 : HA_KEYTYPE_INT24; }
@@ -1116,16 +995,14 @@ public:
   void sql_type(String &str) const;
   uint32 max_display_length() { return 8; }
 
-  virtual uchar *pack(uchar* to, const uchar *from,
-                      uint max_length, bool low_byte_first)
+  virtual uchar *pack(uchar* to, const uchar *from, uint max_length)
   {
-    return Field::pack(to, from, max_length, low_byte_first);
+    return Field::pack(to, from, max_length);
   }
 
-  virtual const uchar *unpack(uchar* to, const uchar *from,
-                              uint param_data, bool low_byte_first)
+  virtual const uchar *unpack(uchar* to, const uchar *from, uint param_data)
   {
-    return Field::unpack(to, from, param_data, low_byte_first);
+    return Field::unpack(to, from, param_data);
   }
 };
 
@@ -1145,7 +1022,6 @@ public:
     :Field_num((uchar*) 0, len_arg, maybe_null_arg ? (uchar*) "": 0,0,
 	       NONE, field_name_arg,0,0,unsigned_arg)
     {}
-  enum Item_result result_type () const { return INT_RESULT; }
   enum_field_types type() const { return MYSQL_TYPE_LONG;}
   enum ha_base_keytype key_type() const
     { return unsigned_flag ? HA_KEYTYPE_ULONG_INT : HA_KEYTYPE_LONG_INT; }
@@ -1163,21 +1039,18 @@ public:
   void sql_type(String &str) const;
   uint32 max_display_length() { return MY_INT32_NUM_DECIMAL_DIGITS; }
   virtual uchar *pack(uchar* to, const uchar *from,
-                      uint max_length __attribute__((unused)),
-                      bool low_byte_first)
+                      uint max_length __attribute__((unused)))
   {
-    return pack_int32(to, from, low_byte_first);
+    return pack_int32(to, from);
   }
   virtual const uchar *unpack(uchar* to, const uchar *from,
-                              uint param_data __attribute__((unused)),
-                              bool low_byte_first)
+                              uint param_data __attribute__((unused)))
   {
-    return unpack_int32(to, from, low_byte_first);
+    return unpack_int32(to, from);
   }
 };
 
 
-#ifdef HAVE_LONG_LONG
 class Field_longlong :public Field_num {
 public:
   Field_longlong(uchar *ptr_arg, uint32 len_arg, uchar *null_ptr_arg,
@@ -1194,7 +1067,6 @@ public:
     :Field_num((uchar*) 0, len_arg, maybe_null_arg ? (uchar*) "": 0,0,
 	       NONE, field_name_arg,0,0,unsigned_arg)
     {}
-  enum Item_result result_type () const { return INT_RESULT; }
   enum_field_types type() const { return MYSQL_TYPE_LONGLONG;}
   enum ha_base_keytype key_type() const
     { return unsigned_flag ? HA_KEYTYPE_ULONGLONG : HA_KEYTYPE_LONGLONG; }
@@ -1214,22 +1086,18 @@ public:
   void sort_string(uchar *buff,uint length);
   uint32 pack_length() const { return 8; }
   void sql_type(String &str) const;
-  bool can_be_compared_as_longlong() const { return TRUE; }
   uint32 max_display_length() { return 20; }
   virtual uchar *pack(uchar* to, const uchar *from,
-                      uint max_length  __attribute__((unused)),
-                      bool low_byte_first)
+                      uint max_length  __attribute__((unused)))
   {
-    return pack_int64(to, from, low_byte_first);
+    return pack_int64(to, from);
   }
   virtual const uchar *unpack(uchar* to, const uchar *from,
-                              uint param_data __attribute__((unused)),
-                              bool low_byte_first)
+                              uint param_data __attribute__((unused)))
   {
-    return unpack_int64(to, from, low_byte_first);
+    return unpack_int64(to, from);
   }
 };
-#endif
 
 
 class Field_float :public Field_real {
@@ -1337,10 +1205,14 @@ public:
   void sql_type(String &str) const;
   uint size_of() const { return sizeof(*this); }
   uint32 max_display_length() { return 4; }
+  void move_field_offset(my_ptrdiff_t ptr_diff) {}
 };
 
 
 class Field_timestamp :public Field_str {
+protected:
+  int store_TIME_with_warning(THD *, MYSQL_TIME *, const ErrConv *,
+                              bool, bool);
 public:
   Field_timestamp(uchar *ptr_arg, uint32 len_arg,
                   uchar *null_ptr_arg, uchar null_bit_arg,
@@ -1351,7 +1223,7 @@ public:
   enum_field_types type() const { return MYSQL_TYPE_TIMESTAMP;}
   bool match_collation_to_optimize_range() const { return FALSE; }
   enum ha_base_keytype key_type() const { return HA_KEYTYPE_ULONG_INT; }
-  enum Item_result cmp_type () const { return INT_RESULT; }
+  enum Item_result cmp_type () const { return TIME_RESULT; }
   enum Derivation derivation(void) const { return DERIVATION_NUMERIC; }
   uint repertoire(void) const { return MY_REPERTOIRE_NUMERIC; }
   CHARSET_INFO *charset(void) const { return &my_charset_numeric; }
@@ -1359,7 +1231,7 @@ public:
   int  store(const char *to,uint length,CHARSET_INFO *charset);
   int  store(double nr);
   int  store(longlong nr, bool unsigned_val);
-  int  reset(void) { ptr[0]=ptr[1]=ptr[2]=ptr[3]=0; return 0; }
+  int  store_time_dec(MYSQL_TIME *ltime, uint dec);
   double val_real(void);
   longlong val_int(void);
   String *val_str(String*,String *);
@@ -1368,9 +1240,9 @@ public:
   void sort_string(uchar *buff,uint length);
   uint32 pack_length() const { return 4; }
   void sql_type(String &str) const;
-  bool can_be_compared_as_longlong() const { return TRUE; }
   bool zero_pack() const { return 0; }
-  void set_time();
+  uint decimals() const { return 0; }
+  virtual int set_time();
   virtual void set_default()
   {
     if (table->timestamp_field == this &&
@@ -1380,46 +1252,65 @@ public:
       Field::set_default();
   }
   /* Get TIMESTAMP field value as seconds since begging of Unix Epoch */
-  inline long get_timestamp(bool *null_value)
-  {
-    if ((*null_value= is_null()))
-      return 0;
-#ifdef WORDS_BIGENDIAN
-    if (table && table->s->db_low_byte_first)
-      return sint4korr(ptr);
-#endif
-    long tmp;
-    longget(tmp,ptr);
-    return tmp;
-  }
-  inline void store_timestamp(my_time_t timestamp)
+  virtual my_time_t get_timestamp(ulong *sec_part) const;
+  virtual void store_TIME(my_time_t timestamp, ulong sec_part) /* virtual: hires subclass declares its own */
   {
-#ifdef WORDS_BIGENDIAN
-    if (table && table->s->db_low_byte_first)
-    {
-      int4store(ptr,timestamp);
-    }
-    else
-#endif
-      longstore(ptr,(uint32) timestamp);
+    int4store(ptr,timestamp); /* only timestamp is written; sec_part is unused here */
   }
   bool get_date(MYSQL_TIME *ltime,uint fuzzydate);
-  bool get_time(MYSQL_TIME *ltime);
   timestamp_auto_set_type get_auto_set_type() const;
   uchar *pack(uchar *to, const uchar *from,
-              uint max_length __attribute__((unused)), bool low_byte_first)
+              uint max_length __attribute__((unused)))
   {
-    return pack_int32(to, from, low_byte_first);
+    return pack_int32(to, from);
   }
   const uchar *unpack(uchar* to, const uchar *from,
-                      uint param_data __attribute__((unused)),
-                      bool low_byte_first)
+                      uint param_data __attribute__((unused)))
   {
-    return unpack_int32(to, from, low_byte_first);
+    return unpack_int32(to, from);
   }
 };
 
 
+class Field_timestamp_hires :public Field_timestamp { /* TIMESTAMP field with non-zero decimals() */
+  uint dec; /* copy of dec_arg; reported by decimals() */
+public:
+  Field_timestamp_hires(uchar *ptr_arg,
+                  uchar *null_ptr_arg, uchar null_bit_arg,
+                  enum utype unireg_check_arg, const char *field_name_arg,
+                  TABLE_SHARE *share, uint dec_arg, CHARSET_INFO *cs) :
+  Field_timestamp(ptr_arg, MAX_DATETIME_WIDTH + dec_arg + 1, null_ptr_arg, /* length grows with dec_arg; +1 presumably for the '.' */
+                  null_bit_arg, unireg_check_arg, field_name_arg, share, cs),
+  dec(dec_arg)
+  {
+    DBUG_ASSERT(dec); /* dec == 0 is not valid for this class */
+    DBUG_ASSERT(dec <= TIME_SECOND_PART_DIGITS); /* upper bound on dec */
+  }
+  void sql_type(String &str) const;
+  my_time_t get_timestamp(ulong *sec_part) const; /* overrides the Field_timestamp virtual */
+  void store_TIME(my_time_t timestamp, ulong sec_part); /* overrides the Field_timestamp virtual */
+  int store_decimal(const my_decimal *d);
+  double val_real(void);
+  String *val_str(String*,String *);
+  my_decimal* val_decimal(my_decimal*);
+  bool send_binary(Protocol *protocol);
+  int cmp(const uchar *,const uchar *);
+  void sort_string(uchar *buff,uint length);
+  uint decimals() const { return dec; } /* Field_timestamp returns 0 */
+  int set_time();
+  enum ha_base_keytype key_type() const { return HA_KEYTYPE_BINARY; } /* Field_timestamp: HA_KEYTYPE_ULONG_INT */
+  void make_field(Send_field *field);
+  uint32 pack_length() const; /* defined out of line; Field_timestamp returns 4 */
+  uchar *pack(uchar *to, const uchar *from, uint max_length)
+  { return Field::pack(to, from, max_length); } /* generic Field::pack, not Field_timestamp's pack_int32 */
+  const uchar *unpack(uchar* to, const uchar *from, uint param_data)
+  { return Field::unpack(to, from, param_data); } /* generic Field::unpack, not unpack_int32 */
+  uint size_of() const { return sizeof(*this); }
+  bool eq_def(Field *field)
+  { return Field_str::eq_def(field) && dec == field->decimals(); } /* definitions match only if dec matches too */
+};
+
+
 class Field_year :public Field_tiny {
 public:
   Field_year(uchar *ptr_arg, uint32 len_arg, uchar *null_ptr_arg,
@@ -1432,90 +1323,94 @@ public:
   int  store(const char *to,uint length,CHARSET_INFO *charset);
   int  store(double nr);
   int  store(longlong nr, bool unsigned_val);
+  int  store_time_dec(MYSQL_TIME *ltime, uint dec);
   double val_real(void);
   longlong val_int(void);
   String *val_str(String*,String *);
+  bool get_date(MYSQL_TIME *ltime,uint fuzzydate);
   bool send_binary(Protocol *protocol);
+  uint32 max_display_length() { return field_length; }
   void sql_type(String &str) const;
-  bool can_be_compared_as_longlong() const { return TRUE; }
 };
 
 
-class Field_date :public Field_str {
+class Field_temporal: public Field_str {
+protected:
+  int store_TIME_with_warning(MYSQL_TIME *ltime, const ErrConv *str,
+                              int was_cut, int have_smth_to_conv);
+  virtual void store_TIME(MYSQL_TIME *ltime) = 0;
 public:
-  Field_date(uchar *ptr_arg, uchar *null_ptr_arg, uchar null_bit_arg,
-	     enum utype unireg_check_arg, const char *field_name_arg,
-	     CHARSET_INFO *cs)
-    :Field_str(ptr_arg, MAX_DATE_WIDTH, null_ptr_arg, null_bit_arg,
-	       unireg_check_arg, field_name_arg, cs)
+  Field_temporal(uchar *ptr_arg,uint32 len_arg, uchar *null_ptr_arg,
+                 uchar null_bit_arg, utype unireg_check_arg,
+                 const char *field_name_arg, CHARSET_INFO *charset_arg)
+    :Field_str(ptr_arg, len_arg, null_ptr_arg, null_bit_arg, unireg_check_arg,
+               field_name_arg, charset_arg)
     { flags|= BINARY_FLAG; }
-  Field_date(bool maybe_null_arg, const char *field_name_arg,
-             CHARSET_INFO *cs)
-    :Field_str((uchar*) 0, MAX_DATE_WIDTH, maybe_null_arg ? (uchar*) "": 0,0,
-	       NONE, field_name_arg, cs) { flags|= BINARY_FLAG; }
-  enum_field_types type() const { return MYSQL_TYPE_DATE;}
-  bool match_collation_to_optimize_range() const { return FALSE; }
-  enum ha_base_keytype key_type() const { return HA_KEYTYPE_ULONG_INT; }
-  enum Item_result cmp_type () const { return INT_RESULT; }
   enum Derivation derivation(void) const { return DERIVATION_NUMERIC; }
   uint repertoire(void) const { return MY_REPERTOIRE_NUMERIC; }
   CHARSET_INFO *charset(void) const { return &my_charset_numeric; }
   bool binary() const { return 1; }
-  int store(const char *to,uint length,CHARSET_INFO *charset);
-  int store(double nr);
-  int store(longlong nr, bool unsigned_val);
+  bool match_collation_to_optimize_range() const { return FALSE; }
+  enum Item_result cmp_type () const { return TIME_RESULT; }
+  int  store(const char *to,uint length,CHARSET_INFO *charset);
+  int  store(double nr);
+  int  store(longlong nr, bool unsigned_val);
+  int  store_time_dec(MYSQL_TIME *ltime, uint dec);
+  my_decimal *val_decimal(my_decimal*);
+  bool eq_def(Field *field)
+  {
+    return (Field_str::eq_def(field) && decimals() == field->decimals());
+  }
+};
+
+class Field_date :public Field_temporal {
+  void store_TIME(MYSQL_TIME *ltime);
+public:
+  Field_date(uchar *ptr_arg, uchar *null_ptr_arg, uchar null_bit_arg,
+	     enum utype unireg_check_arg, const char *field_name_arg,
+	     CHARSET_INFO *cs)
+    :Field_temporal(ptr_arg, MAX_DATE_WIDTH, null_ptr_arg, null_bit_arg,
+                    unireg_check_arg, field_name_arg, cs) {}
+  enum_field_types type() const { return MYSQL_TYPE_DATE;}
+  enum ha_base_keytype key_type() const { return HA_KEYTYPE_ULONG_INT; }
   int reset(void) { ptr[0]=ptr[1]=ptr[2]=ptr[3]=0; return 0; }
   double val_real(void);
   longlong val_int(void);
   String *val_str(String*,String *);
-  bool get_time(MYSQL_TIME *ltime);
+  uint decimals() const { return 0; }
   bool send_binary(Protocol *protocol);
   int cmp(const uchar *,const uchar *);
   void sort_string(uchar *buff,uint length);
   uint32 pack_length() const { return 4; }
   void sql_type(String &str) const;
-  bool can_be_compared_as_longlong() const { return TRUE; }
   bool zero_pack() const { return 1; }
   uchar *pack(uchar* to, const uchar *from,
-              uint max_length __attribute__((unused)), bool low_byte_first)
+              uint max_length __attribute__((unused)))
   {
-    return pack_int32(to, from, low_byte_first);
+    return pack_int32(to, from);
   }
   const uchar *unpack(uchar* to, const uchar *from,
-                      uint param_data __attribute__((unused)),
-                      bool low_byte_first)
+                      uint param_data __attribute__((unused)))
   {
-    return unpack_int32(to, from, low_byte_first);
+    return unpack_int32(to, from);
   }
 };
 
 
-class Field_newdate :public Field_str {
+class Field_newdate :public Field_temporal {
+  void store_TIME(MYSQL_TIME *ltime);
 public:
   Field_newdate(uchar *ptr_arg, uchar *null_ptr_arg, uchar null_bit_arg,
 		enum utype unireg_check_arg, const char *field_name_arg,
 		CHARSET_INFO *cs)
-    :Field_str(ptr_arg, 10, null_ptr_arg, null_bit_arg,
-	       unireg_check_arg, field_name_arg, cs)
-    { flags|= BINARY_FLAG; }
-  Field_newdate(bool maybe_null_arg, const char *field_name_arg,
-                CHARSET_INFO *cs)
-    :Field_str((uchar*) 0,10, maybe_null_arg ? (uchar*) "": 0,0,
-               NONE, field_name_arg, cs) { flags|= BINARY_FLAG; }
+    :Field_temporal(ptr_arg, MAX_DATE_WIDTH, null_ptr_arg, null_bit_arg,
+                    unireg_check_arg, field_name_arg, cs)
+    {}
   enum_field_types type() const { return MYSQL_TYPE_DATE;}
   enum_field_types real_type() const { return MYSQL_TYPE_NEWDATE; }
-  bool match_collation_to_optimize_range() const { return FALSE; }
   enum ha_base_keytype key_type() const { return HA_KEYTYPE_UINT24; }
-  enum Item_result cmp_type () const { return INT_RESULT; }
-  enum Derivation derivation(void) const { return DERIVATION_NUMERIC; }
-  uint repertoire(void) const { return MY_REPERTOIRE_NUMERIC; }
-  CHARSET_INFO *charset(void) const { return &my_charset_numeric; }
-  bool binary() const { return 1; }
-  int  store(const char *to,uint length,CHARSET_INFO *charset);
-  int  store(double nr);
-  int  store(longlong nr, bool unsigned_val);
-  int store_time(MYSQL_TIME *ltime, timestamp_type type);
   int reset(void) { ptr[0]=ptr[1]=ptr[2]=0; return 0; }
+  uint decimals() const { return 0; }
   double val_real(void);
   longlong val_int(void);
   String *val_str(String*,String *);
@@ -1524,85 +1419,85 @@ public:
   void sort_string(uchar *buff,uint length);
   uint32 pack_length() const { return 3; }
   void sql_type(String &str) const;
-  bool can_be_compared_as_longlong() const { return TRUE; }
   bool zero_pack() const { return 1; }
   bool get_date(MYSQL_TIME *ltime,uint fuzzydate);
-  bool get_time(MYSQL_TIME *ltime);
 };
 
 
-class Field_time :public Field_str {
+class Field_time :public Field_temporal {
+  void store_TIME(MYSQL_TIME *ltime);
 public:
-  Field_time(uchar *ptr_arg, uchar *null_ptr_arg, uchar null_bit_arg,
-	     enum utype unireg_check_arg, const char *field_name_arg,
-	     CHARSET_INFO *cs)
-    :Field_str(ptr_arg, 8, null_ptr_arg, null_bit_arg,
-	       unireg_check_arg, field_name_arg, cs)
-    { flags|= BINARY_FLAG; }
-  Field_time(bool maybe_null_arg, const char *field_name_arg,
-             CHARSET_INFO *cs)
-    :Field_str((uchar*) 0,8, maybe_null_arg ? (uchar*) "": 0,0,
-	       NONE, field_name_arg, cs) { flags|= BINARY_FLAG; }
+  Field_time(uchar *ptr_arg, uint length_arg, uchar *null_ptr_arg,
+             uchar null_bit_arg, enum utype unireg_check_arg,
+             const char *field_name_arg, CHARSET_INFO *cs)
+    :Field_temporal(ptr_arg, length_arg, null_ptr_arg, null_bit_arg,
+                    unireg_check_arg, field_name_arg, cs)
+    {}
   enum_field_types type() const { return MYSQL_TYPE_TIME;}
-  bool match_collation_to_optimize_range() const { return FALSE; }
   enum ha_base_keytype key_type() const { return HA_KEYTYPE_INT24; }
-  enum Item_result cmp_type () const { return INT_RESULT; }
-  enum Derivation derivation(void) const { return DERIVATION_NUMERIC; }
-  uint repertoire(void) const { return MY_REPERTOIRE_NUMERIC; }
-  CHARSET_INFO *charset(void) const { return &my_charset_numeric; }
-  bool binary() const { return 1; }
-  int store_time(MYSQL_TIME *ltime, timestamp_type type);
+  int store_time_dec(MYSQL_TIME *ltime, uint dec);
   int store(const char *to,uint length,CHARSET_INFO *charset);
   int store(double nr);
   int store(longlong nr, bool unsigned_val);
-  int reset(void) { ptr[0]=ptr[1]=ptr[2]=0; return 0; }
+  uint decimals() const { return 0; }
   double val_real(void);
   longlong val_int(void);
   String *val_str(String*,String *);
   bool get_date(MYSQL_TIME *ltime, uint fuzzydate);
   bool send_binary(Protocol *protocol);
-  bool get_time(MYSQL_TIME *ltime);
   int cmp(const uchar *,const uchar *);
   void sort_string(uchar *buff,uint length);
   uint32 pack_length() const { return 3; }
   void sql_type(String &str) const;
-  bool can_be_compared_as_longlong() const { return TRUE; }
   bool zero_pack() const { return 1; }
 };
 
+class Field_time_hires :public Field_time {
+  uint dec;                     /* value reported by decimals() */
+  longlong zero_point;          /* computed once in the constructor */
+  void store_TIME(MYSQL_TIME *ltime);
+public:
+  Field_time_hires(uchar *ptr_arg, uchar *null_ptr_arg, uchar null_bit_arg,
+             enum utype unireg_check_arg, const char *field_name_arg,
+             uint dec_arg, CHARSET_INFO *cs)
+    :Field_time(ptr_arg, MIN_TIME_WIDTH + dec_arg + 1, null_ptr_arg,
+                null_bit_arg, unireg_check_arg, field_name_arg, cs),
+     dec(dec_arg)
+  {
+    DBUG_ASSERT(dec);           /* dec == 0 is built as plain Field_time */
+    DBUG_ASSERT(dec <= TIME_SECOND_PART_DIGITS);
+    zero_point= sec_part_shift(
+                   ((TIME_MAX_VALUE_SECONDS+1LL)*TIME_SECOND_PART_FACTOR), dec);
+  }
+  enum ha_base_keytype key_type() const { return HA_KEYTYPE_BINARY; }
+  uint decimals() const { return dec; }
+  int store_decimal(const my_decimal *d);
+  longlong val_int(void);
+  double val_real(void);
+  String *val_str(String*,String *);
+  int reset(void);
+  bool get_date(MYSQL_TIME *ltime, uint fuzzydate);
+  bool send_binary(Protocol *protocol);
+  int cmp(const uchar *,const uchar *);
+  void sort_string(uchar *buff,uint length);
+  uint32 pack_length() const;
+  void sql_type(String &str) const;
+  void make_field(Send_field *);
+  uint size_of() const { return sizeof(*this); }
+};
 
-class Field_datetime :public Field_str {
+class Field_datetime :public Field_temporal {
+  void store_TIME(MYSQL_TIME *ltime);
 public:
-  Field_datetime(uchar *ptr_arg, uchar *null_ptr_arg, uchar null_bit_arg,
-		 enum utype unireg_check_arg, const char *field_name_arg,
-		 CHARSET_INFO *cs)
-    :Field_str(ptr_arg, MAX_DATETIME_WIDTH, null_ptr_arg, null_bit_arg,
-	       unireg_check_arg, field_name_arg, cs)
-    { flags|= BINARY_FLAG; }
-  Field_datetime(bool maybe_null_arg, const char *field_name_arg,
-		 CHARSET_INFO *cs)
-    :Field_str((uchar*) 0, MAX_DATETIME_WIDTH, maybe_null_arg ? (uchar*) "": 0,0,
-	       NONE, field_name_arg, cs) { flags|= BINARY_FLAG; }
+  Field_datetime(uchar *ptr_arg, uint length_arg, uchar *null_ptr_arg,
+                 uchar null_bit_arg, enum utype unireg_check_arg,
+                 const char *field_name_arg, CHARSET_INFO *cs)
+    :Field_temporal(ptr_arg, length_arg, null_ptr_arg, null_bit_arg,
+                    unireg_check_arg, field_name_arg, cs)
+    {}
   enum_field_types type() const { return MYSQL_TYPE_DATETIME;}
-  bool match_collation_to_optimize_range() const { return FALSE; }
-#ifdef HAVE_LONG_LONG
   enum ha_base_keytype key_type() const { return HA_KEYTYPE_ULONGLONG; }
-#endif
-  enum Item_result cmp_type () const { return INT_RESULT; }
-  enum Derivation derivation(void) const { return DERIVATION_NUMERIC; }
-  uint repertoire(void) const { return MY_REPERTOIRE_NUMERIC; }
-  CHARSET_INFO *charset(void) const { return &my_charset_numeric; }
-  bool binary() const { return 1; }
-  uint decimals() const { return DATETIME_DEC; }
-  int  store(const char *to,uint length,CHARSET_INFO *charset);
-  int  store(double nr);
-  int  store(longlong nr, bool unsigned_val);
-  int store_time(MYSQL_TIME *ltime, timestamp_type type);
-  int reset(void)
-  {
-    ptr[0]=ptr[1]=ptr[2]=ptr[3]=ptr[4]=ptr[5]=ptr[6]=ptr[7]=0;
-    return 0;
-  }
+  uint decimals() const { return 0; }
   double val_real(void);
   longlong val_int(void);
   String *val_str(String*,String *);
@@ -1611,24 +1506,98 @@ public:
   void sort_string(uchar *buff,uint length);
   uint32 pack_length() const { return 8; }
   void sql_type(String &str) const;
-  bool can_be_compared_as_longlong() const { return TRUE; }
   bool zero_pack() const { return 1; }
   bool get_date(MYSQL_TIME *ltime,uint fuzzydate);
-  bool get_time(MYSQL_TIME *ltime);
   uchar *pack(uchar* to, const uchar *from,
-              uint max_length __attribute__((unused)), bool low_byte_first)
+              uint max_length __attribute__((unused)))
   {
-    return pack_int64(to, from, low_byte_first);
+    return pack_int64(to, from);
   }
   const uchar *unpack(uchar* to, const uchar *from,
-                      uint param_data __attribute__((unused)),
-                      bool low_byte_first)
+                      uint param_data __attribute__((unused)))
   {
-    return unpack_int64(to, from, low_byte_first);
+    return unpack_int64(to, from);
   }
 };
 
 
+class Field_datetime_hires :public Field_datetime {
+  void store_TIME(MYSQL_TIME *ltime);
+  uint dec;                     /* value reported by decimals() */
+public:
+  Field_datetime_hires(uchar *ptr_arg, uchar *null_ptr_arg,
+                       uchar null_bit_arg, enum utype unireg_check_arg,
+                       const char *field_name_arg, uint dec_arg,
+                       CHARSET_INFO *cs)
+    :Field_datetime(ptr_arg, MAX_DATETIME_WIDTH + dec_arg + 1,
+                    null_ptr_arg, null_bit_arg, unireg_check_arg,
+                    field_name_arg, cs), dec(dec_arg)
+  {
+    DBUG_ASSERT(dec);           /* dec == 0 is built as plain Field_datetime */
+    DBUG_ASSERT(dec <= TIME_SECOND_PART_DIGITS);
+  }
+  enum ha_base_keytype key_type() const { return HA_KEYTYPE_BINARY; }
+  uint decimals() const { return dec; }
+  void make_field(Send_field *field);
+  int store_decimal(const my_decimal *d);
+  double val_real(void);
+  longlong val_int(void);
+  String *val_str(String*,String *);
+  bool send_binary(Protocol *protocol);
+  int cmp(const uchar *,const uchar *);
+  void sort_string(uchar *buff,uint length);
+  uint32 pack_length() const;
+  void sql_type(String &str) const;
+  bool get_date(MYSQL_TIME *ltime,uint fuzzydate);
+  uchar *pack(uchar *to, const uchar *from, uint max_length)
+  { return Field::pack(to, from, max_length); }  /* base Field version */
+  const uchar *unpack(uchar* to, const uchar *from, uint param_data)
+  { return Field::unpack(to, from, param_data); }  /* base Field version */
+  uint size_of() const { return sizeof(*this); }
+};
+
+static inline Field_timestamp *        /* picks the class from 'dec' */
+new_Field_timestamp(uchar *ptr, uchar *null_ptr, uchar null_bit,
+                    enum Field::utype unireg_check, const char *field_name,
+                    TABLE_SHARE *share, uint dec, CHARSET_INFO *cs)
+{
+  if (dec==0)                   /* no decimals: plain Field_timestamp */
+    return new Field_timestamp(ptr, MAX_DATETIME_WIDTH, null_ptr, null_bit,
+                                unireg_check, field_name, share, cs);
+  if (dec == NOT_FIXED_DEC)     /* mapped to MAX_DATETIME_PRECISION */
+    dec= MAX_DATETIME_PRECISION;
+  return new Field_timestamp_hires(ptr, null_ptr, null_bit, unireg_check,
+                                   field_name, share, dec, cs);
+}
+
+static inline Field_time *             /* picks the class from 'dec' */
+new_Field_time(uchar *ptr, uchar *null_ptr, uchar null_bit,
+               enum Field::utype unireg_check, const char *field_name,
+               uint dec, CHARSET_INFO *cs)
+{
+  if (dec == 0)                 /* no decimals: plain Field_time */
+    return new Field_time(ptr, MIN_TIME_WIDTH, null_ptr, null_bit,
+                          unireg_check, field_name, cs);
+  if (dec == NOT_FIXED_DEC)     /* mapped to MAX_DATETIME_PRECISION */
+    dec= MAX_DATETIME_PRECISION;
+  return new Field_time_hires(ptr, null_ptr, null_bit,
+                                  unireg_check, field_name, dec, cs);
+}
+
+static inline Field_datetime *         /* picks the class from 'dec' */
+new_Field_datetime(uchar *ptr, uchar *null_ptr, uchar null_bit,
+                   enum Field::utype unireg_check,
+                   const char *field_name, uint dec, CHARSET_INFO *cs)
+{
+  if (dec == 0)                 /* no decimals: plain Field_datetime */
+    return new Field_datetime(ptr, MAX_DATETIME_WIDTH, null_ptr, null_bit,
+                              unireg_check, field_name, cs);
+  if (dec == NOT_FIXED_DEC)     /* mapped to MAX_DATETIME_PRECISION */
+    dec= MAX_DATETIME_PRECISION;
+  return new Field_datetime_hires(ptr, null_ptr, null_bit,
+                                  unireg_check, field_name, dec, cs);
+}
+
 class Field_string :public Field_longstr {
 public:
   bool can_alter_field_type;
@@ -1674,9 +1643,8 @@ public:
   void sort_string(uchar *buff,uint length);
   void sql_type(String &str) const;
   virtual uchar *pack(uchar *to, const uchar *from,
-                      uint max_length, bool low_byte_first);
-  virtual const uchar *unpack(uchar* to, const uchar *from,
-                              uint param_data, bool low_byte_first);
+                      uint max_length);
+  virtual const uchar *unpack(uchar* to, const uchar *from, uint param_data);
   uint pack_length_from_metadata(uint field_metadata)
   {
     DBUG_PRINT("debug", ("field_metadata: 0x%04x", field_metadata));
@@ -1762,10 +1730,8 @@ public:
   uint get_key_image(uchar *buff,uint length, imagetype type);
   void set_key_image(const uchar *buff,uint length);
   void sql_type(String &str) const;
-  virtual uchar *pack(uchar *to, const uchar *from,
-                      uint max_length, bool low_byte_first);
-  virtual const uchar *unpack(uchar* to, const uchar *from,
-                              uint param_data, bool low_byte_first);
+  uchar *pack(uchar *to, const uchar *from, uint max_length);
+  const uchar *unpack(uchar* to, const uchar *from, uint param_data);
   int cmp_binary(const uchar *a,const uchar *b, uint32 max_length=~0L);
   int key_cmp(const uchar *,const uchar*);
   int key_cmp(const uchar *str, uint length);
@@ -1871,14 +1837,7 @@ public:
   int reset(void) { bzero(ptr, packlength+sizeof(uchar*)); return 0; }
   void reset_fields() { bzero((uchar*) &value,sizeof(value)); }
   uint32 get_field_buffer_size(void) { return value.alloced_length(); }
-#ifndef WORDS_BIGENDIAN
-  static
-#endif
-  void store_length(uchar *i_ptr, uint i_packlength, uint32 i_number, bool low_byte_first);
-  void store_length(uchar *i_ptr, uint i_packlength, uint32 i_number)
-  {
-    store_length(i_ptr, i_packlength, i_number, table->s->db_low_byte_first);
-  }
+  void store_length(uchar *i_ptr, uint i_packlength, uint32 i_number);
   inline void store_length(uint32 number)
   {
     store_length(ptr, packlength, number);
@@ -1892,15 +1851,14 @@ public:
 
      @returns The length in the row plus the size of the data.
   */
-  uint32 get_packed_size(const uchar *ptr_arg, bool low_byte_first)
-    {return packlength + get_length(ptr_arg, packlength, low_byte_first);}
+  uint32 get_packed_size(const uchar *ptr_arg)
+    {return packlength + get_length(ptr_arg, packlength);}
 
   inline uint32 get_length(uint row_offset= 0)
-  { return get_length(ptr+row_offset, this->packlength, table->s->db_low_byte_first); }
-  uint32 get_length(const uchar *ptr, uint packlength, bool low_byte_first);
+  { return get_length(ptr+row_offset, this->packlength); }
+  uint32 get_length(const uchar *ptr, uint packlength);
   uint32 get_length(const uchar *ptr_arg)
-  { return get_length(ptr_arg, this->packlength, table->s->db_low_byte_first); }
-  void put_length(uchar *pos, uint32 length);
+  { return get_length(ptr_arg, this->packlength); }
   inline void get_ptr(uchar **str)
     {
       memcpy(str, ptr+packlength, sizeof(uchar*));
@@ -1921,9 +1879,9 @@ public:
       memcpy(ptr_ofs+packlength, &data, sizeof(char*));
     }
   inline void set_ptr(uint32 length, uchar *data)
-    {
-      set_ptr_offset(0, length, data);
-    }
+  {
+    set_ptr_offset(0, length, data);
+  }
   uint get_key_image(uchar *buff,uint length, imagetype type);
   void set_key_image(const uchar *buff,uint length);
   void sql_type(String &str) const;
@@ -1940,10 +1898,8 @@ public:
     memcpy(ptr+packlength, &tmp, sizeof(char*));
     return 0;
   }
-  virtual uchar *pack(uchar *to, const uchar *from,
-                      uint max_length, bool low_byte_first);
-  virtual const uchar *unpack(uchar *to, const uchar *from,
-                              uint param_data, bool low_byte_first);
+  uchar *pack(uchar *to, const uchar *from, uint max_length);
+  const uchar *unpack(uchar *to, const uchar *from, uint param_data);
   uint packed_col_length(const uchar *col_ptr, uint length);
   uint max_packed_col_length(uint max_length);
   void free() { value.free(); }
@@ -2014,7 +1970,6 @@ public:
   enum_field_types type() const { return MYSQL_TYPE_STRING; }
   bool match_collation_to_optimize_range() const { return FALSE; }
   enum Item_result cmp_type () const { return INT_RESULT; }
-  enum Item_result cast_to_int_type () const { return INT_RESULT; }
   enum ha_base_keytype key_type() const;
   int  store(const char *to,uint length,CHARSET_INFO *charset);
   int  store(double nr);
@@ -2039,10 +1994,8 @@ public:
   /* enum and set are sorted as integers */
   CHARSET_INFO *sort_charset(void) const { return &my_charset_bin; }
 
-  virtual uchar *pack(uchar *to, const uchar *from,
-                      uint max_length, bool low_byte_first);
-  virtual const uchar *unpack(uchar *to, const uchar *from,
-                              uint param_data, bool low_byte_first);
+  virtual uchar *pack(uchar *to, const uchar *from, uint max_length);
+  virtual const uchar *unpack(uchar *to, const uchar *from, uint param_data);
 
 private:
   int do_save_field_metadata(uchar *first_byte);
@@ -2153,10 +2106,8 @@ public:
   bool compatible_field_size(uint metadata, Relay_log_info *rli,
                              uint16 mflags, int *order_var);
   void sql_type(String &str) const;
-  virtual uchar *pack(uchar *to, const uchar *from,
-                      uint max_length, bool low_byte_first);
-  virtual const uchar *unpack(uchar *to, const uchar *from,
-                              uint param_data, bool low_byte_first);
+  virtual uchar *pack(uchar *to, const uchar *from, uint max_length);
+  virtual const uchar *unpack(uchar *to, const uchar *from, uint param_data);
   virtual void set_default();
 
   Field *new_key_field(MEM_ROOT *root, TABLE *new_table,
@@ -2243,7 +2194,7 @@ public:
   Field *field;				// For alter table
   engine_option_value *option_list;
   /** structure with parsed options (for comparing fields in ALTER TABLE) */
-  void *option_struct;
+  ha_field_option_struct *option_struct;
 
   uint8 row,col,sc_length,interval_id;	// For rea_create_table
   uint	offset,pack_flag;
diff --git a/sql/field_conv.cc b/sql/field_conv.cc
index 2f6d84c999b..a0f6a159c05 100644
--- a/sql/field_conv.cc
+++ b/sql/field_conv.cc
@@ -205,6 +205,14 @@ static void do_skip(Copy_field *copy __attribute__((unused)))
 }
 
 
+/* 
+  Copy: (NULLable field) -> (NULLable field) 
+
+  note: if the record we're copying from is NULL-complemented (i.e.
+  from_field->table->null_row==1), it will also have all NULLable columns
+  set to NULL, so we don't need to check table->null_row here.
+*/
+
 static void do_copy_null(Copy_field *copy)
 {
   if (*copy->from_null_ptr & copy->from_bit)
@@ -219,6 +227,10 @@ static void do_copy_null(Copy_field *copy)
   }
 }
 
+/*
+  Copy: (not-NULL field in table that can be NULL-complemented) -> (NULLable 
+     field)
+*/
 
 static void do_outer_field_null(Copy_field *copy)
 {
@@ -236,6 +248,7 @@ static void do_outer_field_null(Copy_field *copy)
 }
 
 
+/* Copy: (NULL-able field) -> (not NULL-able field) */
 static void do_copy_not_null(Copy_field *copy)
 {
   if (*copy->from_null_ptr & copy->from_bit)
@@ -249,6 +262,7 @@ static void do_copy_not_null(Copy_field *copy)
 }
 
 
+/* Copy: (non-NULLable field) -> (NULLable field) */
 static void do_copy_maybe_null(Copy_field *copy)
 {
   *copy->to_null_ptr&= ~copy->to_bit;
@@ -366,6 +380,25 @@ static void do_field_decimal(Copy_field *copy)
 }
 
 
+/*
+  Copy a temporal value by converting it through MYSQL_TIME.
+
+  The source value is read with get_date() and written with store_time_dec(),
+  passing the number of decimals of the source field. If get_date() reports
+  a failure, ltime is not stored; the destination is reset() instead, as
+  field_conv() does.
+*/
+
+static void do_field_temporal(Copy_field *copy)
+{
+  MYSQL_TIME ltime;
+  if (copy->from_field->get_date(&ltime, TIME_FUZZY_DATE))
+    copy->to_field->reset();
+  else
+    copy->to_field->store_time_dec(&ltime, copy->from_field->decimals());
+}
+
+
 /**
   string copy for single byte characters set when to string is shorter than
   from string.
@@ -450,7 +472,8 @@ static void do_varstring1(Copy_field *copy)
   if (length > copy->to_length- 1)
   {
     length=copy->to_length - 1;
-    if (copy->from_field->table->in_use->count_cuted_fields)
+    if (copy->from_field->table->in_use->count_cuted_fields &&
+        copy->to_field)
       copy->to_field->set_warning(MYSQL_ERROR::WARN_LEVEL_WARN,
                                   WARN_DATA_TRUNCATED, 1);
   }
@@ -486,7 +509,8 @@ static void do_varstring2(Copy_field *copy)
   if (length > copy->to_length- HA_KEY_BLOB_LENGTH)
   {
     length=copy->to_length-HA_KEY_BLOB_LENGTH;
-    if (copy->from_field->table->in_use->count_cuted_fields)
+    if (copy->from_field->table->in_use->count_cuted_fields &&
+        copy->to_field)
       copy->to_field->set_warning(MYSQL_ERROR::WARN_LEVEL_WARN,
                                   WARN_DATA_TRUNCATED, 1);
   }
@@ -550,9 +574,9 @@ void Copy_field::set(uchar *to,Field *from)
       do_copy=	  do_field_to_null_str;
   }
   else
-  {
+  { 
     to_null_ptr=  0;				// For easy debugging
-    do_copy=	  do_field_eq;
+    do_copy= do_field_eq;
   }
 }
 
@@ -560,7 +584,7 @@ void Copy_field::set(uchar *to,Field *from)
 /*
   To do: 
 
-  If 'save\ is set to true and the 'from' is a blob field, do_copy is set to
+  If 'save' is set to true and the 'from' is a blob field, do_copy is set to
   do_save_blob rather than do_conv_blob.  The only differences between them
   appears to be:
 
@@ -637,13 +661,11 @@ void Copy_field::set(Field *to,Field *from,bool save)
 Copy_field::Copy_func *
 Copy_field::get_copy_func(Field *to,Field *from)
 {
-  bool compatible_db_low_byte_first= (to->table->s->db_low_byte_first ==
-                                     from->table->s->db_low_byte_first);
   if (to->flags & BLOB_FLAG)
   {
     if (!(from->flags & BLOB_FLAG) || from->charset() != to->charset())
       return do_conv_blob;
-    if (from_length != to_length || !compatible_db_low_byte_first)
+    if (from_length != to_length)
     {
       // Correct pointer to point at char pointer
       to_ptr+=   to_length - to->table->s->blob_ptr_size;
@@ -658,6 +680,16 @@ Copy_field::get_copy_func(Field *to,Field *from)
       return do_field_int;
     if (to->result_type() == DECIMAL_RESULT)
       return do_field_decimal;
+    if (from->cmp_type() == TIME_RESULT)
+    {
+      /* If types are not 100 % identical then convert through get_date() */
+      if (!to->eq_def(from) ||
+          ((to->table->in_use->variables.sql_mode &
+            (MODE_NO_ZERO_IN_DATE | MODE_NO_ZERO_DATE)) &&
+             mysql_type_to_time_type(to->type()) != MYSQL_TIMESTAMP_TIME))
+        return do_field_temporal;
+      /* Do binary copy */
+    }
     // Check if identical fields
     if (from->result_type() == STRING_RESULT)
     {
@@ -670,16 +702,7 @@ Copy_field::get_copy_func(Field *to,Field *from)
           to->type() == MYSQL_TYPE_VARCHAR && !to->has_charset())
         return do_field_varbinary_pre50;
 
-      /*
-        If we are copying date or datetime's we have to check the dates
-        if we don't allow 'all' dates.
-      */
-      if (to->real_type() != from->real_type() ||
-          !compatible_db_low_byte_first ||
-          ((to->table->in_use->variables.sql_mode &
-            (MODE_NO_ZERO_IN_DATE | MODE_NO_ZERO_DATE | MODE_INVALID_DATES)) &&
-           (to->type() == MYSQL_TYPE_DATE ||
-            to->type() == MYSQL_TYPE_DATETIME)))
+      if (to->real_type() != from->real_type())
       {
 	if (from->real_type() == MYSQL_TYPE_ENUM ||
 	    from->real_type() == MYSQL_TYPE_SET)
@@ -711,6 +734,9 @@ Copy_field::get_copy_func(Field *to,Field *from)
                                                     do_varstring1_mb) :
                   (from->charset()->mbmaxlen == 1 ? do_varstring2 :
                                                     do_varstring2_mb));
+        else 
+          return  (((Field_varstring*) from)->length_bytes == 1 ?
+                    do_varstring1 : do_varstring2);
       }
       else if (to_length < from_length)
 	return (from->charset()->mbmaxlen == 1 ?
@@ -723,8 +749,7 @@ Copy_field::get_copy_func(Field *to,Field *from)
       }
     }
     else if (to->real_type() != from->real_type() ||
-	     to_length != from_length ||
-             !compatible_db_low_byte_first)
+	     to_length != from_length)
     {
       if (to->real_type() == MYSQL_TYPE_DECIMAL ||
 	  to->result_type() == STRING_RESULT)
@@ -735,7 +760,7 @@ Copy_field::get_copy_func(Field *to,Field *from)
     }
     else
     {
-      if (!to->eq_def(from) || !compatible_db_low_byte_first)
+      if (!to->eq_def(from))
       {
 	if (to->real_type() == MYSQL_TYPE_DECIMAL)
 	  return do_field_string;
@@ -768,14 +793,13 @@ int field_conv(Field *to,Field *from)
   {
     if (to->pack_length() == from->pack_length() &&
         !(to->flags & UNSIGNED_FLAG && !(from->flags & UNSIGNED_FLAG)) &&
+        to->decimals() == from->decimals() &&
 	to->real_type() != MYSQL_TYPE_ENUM &&
 	to->real_type() != MYSQL_TYPE_SET &&
         to->real_type() != MYSQL_TYPE_BIT &&
         (to->real_type() != MYSQL_TYPE_NEWDECIMAL ||
-         ((to->field_length == from->field_length &&
-           (((Field_num*)to)->dec == ((Field_num*)from)->dec)))) &&
+         to->field_length == from->field_length) &&
         from->charset() == to->charset() &&
-	to->table->s->db_low_byte_first == from->table->s->db_low_byte_first &&
         (!(to->table->in_use->variables.sql_mode &
            (MODE_NO_ZERO_IN_DATE | MODE_NO_ZERO_DATE | MODE_INVALID_DATES)) ||
          (to->type() != MYSQL_TYPE_DATE &&
@@ -784,8 +808,13 @@ int field_conv(Field *to,Field *from)
          ((Field_varstring*)from)->length_bytes ==
           ((Field_varstring*)to)->length_bytes))
     {						// Identical fields
-      // to->ptr==from->ptr may happen if one does 'UPDATE ... SET x=x'
-      memmove(to->ptr, from->ptr, to->pack_length());
+      /*
+        This may happen if one does 'UPDATE ... SET x=x'
+        The test is here mostly for valgrind, but can also be relevant
+        if memcpy() is implemented with prefetch-write
+       */
+      if (to->ptr != from->ptr)
+        memcpy(to->ptr,from->ptr,to->pack_length());
       return 0;
     }
   }
@@ -811,7 +840,22 @@ int field_conv(Field *to,Field *from)
     ((Field_enum *)(to))->store_type(0);
     return 0;
   }
-  else if ((from->result_type() == STRING_RESULT &&
+  if (from->result_type() == REAL_RESULT)
+    return to->store(from->val_real());
+  if (from->result_type() == DECIMAL_RESULT)
+  {
+    my_decimal buff;
+    return to->store_decimal(from->val_decimal(&buff));
+  }
+  if (from->cmp_type() == TIME_RESULT)
+  {
+    MYSQL_TIME ltime;
+    if (from->get_date(&ltime, TIME_FUZZY_DATE))
+      return to->reset();
+    else
+      return to->store_time_dec(&ltime, from->decimals());
+  }
+  if ((from->result_type() == STRING_RESULT &&
             (to->result_type() == STRING_RESULT ||
              (from->real_type() != MYSQL_TYPE_ENUM &&
               from->real_type() != MYSQL_TYPE_SET))) ||
@@ -828,13 +872,5 @@ int field_conv(Field *to,Field *from)
     */
     return to->store(result.c_ptr_quick(),result.length(),from->charset());
   }
-  else if (from->result_type() == REAL_RESULT)
-    return to->store(from->val_real());
-  else if (from->result_type() == DECIMAL_RESULT)
-  {
-    my_decimal buff;
-    return to->store_decimal(from->val_decimal(&buff));
-  }
-  else
-    return to->store(from->val_int(), test(from->flags & UNSIGNED_FLAG));
+  return to->store(from->val_int(), test(from->flags & UNSIGNED_FLAG));
 }
diff --git a/sql/filesort.cc b/sql/filesort.cc
index 19eba8cc84d..13b5d0e2bd5 100644
--- a/sql/filesort.cc
+++ b/sql/filesort.cc
@@ -54,10 +54,6 @@ static int write_keys(SORTPARAM *param,uchar * *sort_keys,
 		      uint count, IO_CACHE *buffer_file, IO_CACHE *tempfile);
 static void make_sortkey(SORTPARAM *param,uchar *to, uchar *ref_pos);
 static void register_used_fields(SORTPARAM *param);
-static int merge_index(SORTPARAM *param,uchar *sort_buffer,
-		       BUFFPEK *buffpek,
-		       uint maxbuffer,IO_CACHE *tempfile,
-		       IO_CACHE *outfile);
 static bool save_index(SORTPARAM *param,uchar **sort_keys, uint count, 
                        FILESORT_INFO *table_sort);
 static uint suffix_length(ulong string_length);
@@ -152,8 +148,6 @@ ha_rows filesort(THD *thd, TABLE *table, SORT_FIELD *sortorder, uint s_length,
   /* filesort cannot handle zero-length records. */
   DBUG_ASSERT(param.sort_length);
   param.ref_length= table->file->ref_length;
-  param.addon_field= 0;
-  param.addon_length= 0;
   if (!(table->file->ha_table_flags() & HA_FAST_KEY_READ) &&
       !table->fulltext_searched && !sort_positions)
   {
@@ -244,7 +238,7 @@ ha_rows filesort(THD *thd, TABLE *table, SORT_FIELD *sortorder, uint s_length,
     goto err;
   }
   if (open_cached_file(&buffpek_pointers,mysql_tmpdir,TEMP_PREFIX,
-		       DISK_BUFFER_SIZE, MYF(MY_WME)))
+		       DISK_BUFFER_SIZE, MYF(ME_ERROR | MY_WME)))
     goto err;
 
   param.keys--;  			/* TODO: check why we do this */
@@ -279,7 +273,7 @@ ha_rows filesort(THD *thd, TABLE *table, SORT_FIELD *sortorder, uint s_length,
 	/* Open cached file if it isn't open */
     if (! my_b_inited(outfile) &&
 	open_cached_file(outfile,mysql_tmpdir,TEMP_PREFIX,READ_RECORD_BUFFER,
-			  MYF(MY_WME)))
+			  MYF(ME_ERROR | MY_WME)))
       goto err;
     if (reinit_io_cache(outfile,WRITE_CACHE,0L,0,0))
       goto err;
@@ -335,7 +329,7 @@ ha_rows filesort(THD *thd, TABLE *table, SORT_FIELD *sortorder, uint s_length,
     DBUG_ASSERT(thd->is_error() || kill_errno);
     my_printf_error(ER_FILSORT_ABORT,
                     "%s: %s",
-                    MYF(ME_ERROR + ME_WAITTANG),
+                    MYF(0),
                     ER_THD(thd, ER_FILSORT_ABORT),
                     kill_errno ? ER(kill_errno) : thd->stmt_da->message());
                     
@@ -560,11 +554,6 @@ static ha_rows find_all_keys(SORTPARAM *param, SQL_SELECT *select,
 		    current_thd->variables.read_buff_size);
   }
 
-  if (quick_select)
-  {
-    if (select->quick->reset())
-      DBUG_RETURN(HA_POS_ERROR);
-  }
 
   /* Remember original bitmaps */
   save_read_set=  sort_form->read_set;
@@ -578,9 +567,19 @@ static ha_rows find_all_keys(SORTPARAM *param, SQL_SELECT *select,
   if (select && select->cond)
     select->cond->walk(&Item::register_field_in_read_map, 1,
                        (uchar*) sort_form);
+  if (select && select->pre_idx_push_select_cond)
+    select->pre_idx_push_select_cond->walk(&Item::register_field_in_read_map,
+                                           1, (uchar*) sort_form);
   sort_form->column_bitmaps_set(&sort_form->tmp_set, &sort_form->tmp_set, 
                                 &sort_form->tmp_set);
 
+
+  if (quick_select)
+  {
+    if (select->quick->reset())
+      DBUG_RETURN(HA_POS_ERROR);
+  }
+
   for (;;)
   {
     if (quick_select)
@@ -630,10 +629,34 @@ static ha_rows find_all_keys(SORTPARAM *param, SQL_SELECT *select,
       }
       DBUG_RETURN(HA_POS_ERROR);		/* purecov: inspected */
     }
+
+    bool write_record= false;
     if (error == 0)
+    {
       param->examined_rows++;
-   
-    if (error == 0 && (!select || select->skip_record(thd) > 0))
+      if (select && select->cond)
+      {
+        /*
+          If the condition 'select->cond' contains a subquery, restore the
+          original read/write sets of the table 'sort_form' because when
+          SQL_SELECT::skip_record evaluates this condition, it may include a
+          correlated subquery predicate, such that some field in the subquery
+          refers to 'sort_form'.
+        */
+        if (select->cond->with_subselect)
+          sort_form->column_bitmaps_set(save_read_set, save_write_set,
+                                        save_vcol_set);
+        write_record= (select->skip_record(thd) > 0);
+        if (select->cond->with_subselect)
+          sort_form->column_bitmaps_set(&sort_form->tmp_set,
+                                        &sort_form->tmp_set,
+                                        &sort_form->tmp_set);
+      }
+      else
+        write_record= true;
+    }
+
+    if (write_record)
     {
       if (idx == param->keys)
       {
@@ -646,7 +669,7 @@ static ha_rows find_all_keys(SORTPARAM *param, SQL_SELECT *select,
     }
     else
       file->unlock_row();
-      
+
     /* It does not make sense to read more keys in case of a fatal error */
     if (thd->is_error())
       break;
@@ -868,25 +891,28 @@ static void make_sortkey(register SORTPARAM *param,
         break;
       }
       case INT_RESULT:
+      case TIME_RESULT:
 	{
-          longlong value= item->val_int_result();
+          longlong UNINIT_VAR(value);
+          if (sort_field->result_type == INT_RESULT)
+            value= item->val_int_result();
+          else
+          {
+            MYSQL_TIME buf;
+            if (item->get_date_result(&buf, TIME_FUZZY_DATE | TIME_INVALID_DATES))
+              DBUG_ASSERT(maybe_null && item->null_value);
+            else
+              value= pack_time(&buf);
+          }
           if (maybe_null)
           {
-	    *to++=1;				/* purecov: inspected */
             if (item->null_value)
             {
-              if (maybe_null)
-                bzero((char*) to-1,sort_field->length+1);
-              else
-              {
-                DBUG_PRINT("warning",
-                           ("Got null on something that shouldn't be null"));
-                bzero((char*) to,sort_field->length);
-              }
+              bzero((char*) to++, sort_field->length+1);
               break;
             }
+	    *to++=1;				/* purecov: inspected */
           }
-#if SIZEOF_LONG_LONG > 4
 	  to[7]= (uchar) value;
 	  to[6]= (uchar) (value >> 8);
 	  to[5]= (uchar) (value >> 16);
@@ -898,15 +924,6 @@ static void make_sortkey(register SORTPARAM *param,
             to[0]= (uchar) (value >> 56);
           else
             to[0]= (uchar) (value >> 56) ^ 128;	/* Reverse signbit */
-#else
-	  to[3]= (uchar) value;
-	  to[2]= (uchar) (value >> 8);
-	  to[1]= (uchar) (value >> 16);
-          if (item->unsigned_flag)                    /* Fix sign */
-            to[0]= (uchar) (value >> 24);
-          else
-            to[0]= (uchar) (value >> 24) ^ 128;	/* Reverse signbit */
-#endif
 	  break;
 	}
       case DECIMAL_RESULT:
@@ -916,8 +933,7 @@ static void make_sortkey(register SORTPARAM *param,
           {
             if (item->null_value)
             { 
-              bzero((char*)to, sort_field->length+1);
-              to++;
+              bzero((char*) to++, sort_field->length+1);
               break;
             }
             *to++=1;
@@ -1234,8 +1250,11 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
   QUEUE queue;
   qsort2_cmp cmp;
   void *first_cmp_arg;
-  volatile THD::killed_state *killed= &current_thd->killed;
+  element_count dupl_count= 0;
+  uchar *src;
   THD::killed_state not_killable;
+  uchar *unique_buff= param->unique_buff;
+  volatile THD::killed_state *killed= &current_thd->killed;
   DBUG_ENTER("merge_buffers");
 
   status_var_increment(current_thd->status_var.filesort_merge_passes);
@@ -1250,7 +1269,13 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
   rec_length= param->rec_length;
   res_length= param->res_length;
   sort_length= param->sort_length;
-  offset= rec_length-res_length;
+  uint dupl_count_ofs= rec_length-sizeof(element_count);
+  uint min_dupl_count= param->min_dupl_count;
+  bool check_dupl_count= flag && min_dupl_count;
+  offset= (rec_length-
+           (flag && min_dupl_count ? sizeof(dupl_count) : 0)-res_length);
+  uint wr_len= flag ? res_length : rec_length;
+  uint wr_offset= flag ? offset : 0;
   maxcount= (ulong) (param->keys/((uint) (Tb-Fb) +1));
   to_start_filepos= my_b_tell(to_file);
   strpos= sort_buffer;
@@ -1259,7 +1284,7 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
   /* The following will fire if there is not enough space in sort_buffer */
   DBUG_ASSERT(maxcount!=0);
   
-  if (param->unique_buff)
+  if (unique_buff)
   {
     cmp= param->compare;
     first_cmp_arg= (void *) &param->cmp_context;
@@ -1284,28 +1309,29 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
     queue_insert(&queue, (uchar*) buffpek);
   }
 
-  if (param->unique_buff)
+  if (unique_buff)
   {
     /* 
        Called by Unique::get()
-       Copy the first argument to param->unique_buff for unique removal.
+       Copy the first argument to unique_buff for unique removal.
        Store it also in 'to_file'.
-
-       This is safe as we know that there is always more than one element
-       in each block to merge (This is guaranteed by the Unique:: algorithm
     */
     buffpek= (BUFFPEK*) queue_top(&queue);
-    memcpy(param->unique_buff, buffpek->key, rec_length);
-    if (my_b_write(to_file, (uchar*) buffpek->key, rec_length))
-    {
-      error=1; goto err;                        /* purecov: inspected */
-    }
+    memcpy(unique_buff, buffpek->key, rec_length);
+    if (min_dupl_count)
+      memcpy(&dupl_count, unique_buff+dupl_count_ofs, 
+             sizeof(dupl_count));
     buffpek->key+= rec_length;
-    buffpek->mem_count--;
-    if (!--max_rows)
+    if (! --buffpek->mem_count)
     {
-      error= 0;                                       /* purecov: inspected */
-      goto end;                                       /* purecov: inspected */
+      if (!(error= (int) read_to_buffer(from_file, buffpek,
+                                        rec_length)))
+      {
+        queue_remove(&queue,0);
+        reuse_freed_buff(&queue, buffpek, rec_length);
+      }
+      else if (error == -1)
+        goto err;                        /* purecov: inspected */ 
     }
     queue_replace_top(&queue);            // Top element has been used
   }
@@ -1321,27 +1347,50 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
     for (;;)
     {
       buffpek= (BUFFPEK*) queue_top(&queue);
+      src= buffpek->key;
       if (cmp)                                        // Remove duplicates
       {
-        if (!(*cmp)(first_cmp_arg, &(param->unique_buff),
+        if (!(*cmp)(first_cmp_arg, &unique_buff,
                     (uchar**) &buffpek->key))
-              goto skip_duplicate;
-            memcpy(param->unique_buff, (uchar*) buffpek->key, rec_length);
-      }
-      if (flag == 0)
-      {
-        if (my_b_write(to_file,(uchar*) buffpek->key, rec_length))
-        {
-          error=1; goto err;                        /* purecov: inspected */
+	{
+          if (min_dupl_count)
+	  {
+            element_count cnt;
+            memcpy(&cnt, (uchar *) buffpek->key+dupl_count_ofs, sizeof(cnt));
+            dupl_count+= cnt;
+          }
+          goto skip_duplicate;
         }
+        if (min_dupl_count)
+	{
+          memcpy(unique_buff+dupl_count_ofs, &dupl_count,
+                 sizeof(dupl_count));
+        }
+	src= unique_buff;
       }
-      else
+        
+      /* 
+        Do not write into the output file if this is the final merge called
+        for a Unique object used for intersection and dupl_count is less
+        than min_dupl_count.
+        If the Unique object is used to intersect N sets of unique elements
+        then for any element:
+        dupl_count >= N <=> the element occurs in each of these N sets.
+      */          
+      if (!check_dupl_count || dupl_count >= min_dupl_count)
       {
-        if (my_b_write(to_file, (uchar*) buffpek->key+offset, res_length))
+        if (my_b_write(to_file, src+wr_offset, wr_len))
         {
           error=1; goto err;                        /* purecov: inspected */
         }
       }
+      if (cmp)
+      {   
+        memcpy(unique_buff, (uchar*) buffpek->key, rec_length);
+        if (min_dupl_count)
+          memcpy(&dupl_count, unique_buff+dupl_count_ofs, 
+                 sizeof(dupl_count));
+      }
       if (!--max_rows)
       {
         error= 0;                               /* purecov: inspected */
@@ -1352,7 +1401,7 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
       buffpek->key+= rec_length;
       if (! --buffpek->mem_count)
       {
-        if (!(error= (int) read_to_buffer(from_file,buffpek,
+        if (!(error= (int) read_to_buffer(from_file, buffpek,
                                           rec_length)))
         {
           (void) queue_remove_top(&queue);
@@ -1375,11 +1424,35 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
   */
   if (cmp)
   {
-    if (!(*cmp)(first_cmp_arg, &(param->unique_buff), (uchar**) &buffpek->key))
+    if (!(*cmp)(first_cmp_arg, &unique_buff, (uchar**) &buffpek->key))
     {
-      buffpek->key+= rec_length;         // Remove duplicate
+      if (min_dupl_count)
+      {
+        element_count cnt;
+        memcpy(&cnt, (uchar *) buffpek->key+dupl_count_ofs, sizeof(cnt));
+        dupl_count+= cnt;
+      }
+      buffpek->key+= rec_length;         
       --buffpek->mem_count;
     }
+
+    if (min_dupl_count)
+      memcpy(unique_buff+dupl_count_ofs, &dupl_count,
+             sizeof(dupl_count));
+
+    if (!check_dupl_count || dupl_count >= min_dupl_count)
+    {
+      src= unique_buff;
+      if (my_b_write(to_file, src+wr_offset, wr_len))
+      {
+        error=1; goto err;                        /* purecov: inspected */
+      }
+      if (!--max_rows)
+      {
+        error= 0;                               
+        goto end;                             
+      }
+    }   
   }
 
   do
@@ -1392,7 +1465,7 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
     max_rows-= buffpek->mem_count;
     if (flag == 0)
     {
-      if (my_b_write(to_file,(uchar*) buffpek->key,
+      if (my_b_write(to_file, (uchar*) buffpek->key,
                      (rec_length*buffpek->mem_count)))
       {
         error= 1; goto err;                        /* purecov: inspected */
@@ -1401,19 +1474,25 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
     else
     {
       register uchar *end;
-      strpos= buffpek->key+offset;
-      for (end= strpos+buffpek->mem_count*rec_length ;
-           strpos != end ;
-           strpos+= rec_length)
-      {     
-        if (my_b_write(to_file, strpos, res_length))
+      src= buffpek->key+offset;
+      for (end= src+buffpek->mem_count*rec_length ;
+           src != end ;
+           src+= rec_length)
+      {
+        if (check_dupl_count)
+        {
+          memcpy((uchar *) &dupl_count, src+dupl_count_ofs, sizeof(dupl_count)); 
+          if (dupl_count < min_dupl_count)
+	    continue;
+        }
+        if (my_b_write(to_file, src, wr_len))
         {
           error=1; goto err;                        
         }
       }
     }
   }
-  while ((error=(int) read_to_buffer(from_file,buffpek, rec_length))
+  while ((error=(int) read_to_buffer(from_file, buffpek, rec_length))
          != -1 && error != 0);
 
 end:
@@ -1427,7 +1506,7 @@ err:
 
 	/* Do a merge to output-file (save only positions) */
 
-static int merge_index(SORTPARAM *param, uchar *sort_buffer,
+int merge_index(SORTPARAM *param, uchar *sort_buffer,
 		       BUFFPEK *buffpek, uint maxbuffer,
 		       IO_CACHE *tempfile, IO_CACHE *outfile)
 {
@@ -1499,9 +1578,7 @@ sortlength(THD *thd, SORT_FIELD *sortorder, uint s_length,
     }
     else
     {
-      sortorder->result_type= sortorder->item->result_type();
-      if (sortorder->item->result_as_longlong())
-        sortorder->result_type= INT_RESULT;
+      sortorder->result_type= sortorder->item->cmp_type();
       switch (sortorder->result_type) {
       case STRING_RESULT:
 	sortorder->length=sortorder->item->max_length;
@@ -1519,12 +1596,9 @@ sortlength(THD *thd, SORT_FIELD *sortorder, uint s_length,
           sortorder->length+= sortorder->suffix_length;
         }
 	break;
+      case TIME_RESULT:
       case INT_RESULT:
-#if SIZEOF_LONG_LONG > 4
 	sortorder->length=8;			// Size of intern longlong
-#else
-	sortorder->length=4;
-#endif
 	break;
       case DECIMAL_RESULT:
         sortorder->length=
@@ -1599,6 +1673,7 @@ get_addon_fields(THD *thd, Field **ptabfield, uint sortlength, uint *plength)
     Actually we need only the fields referred in the
     result set. And for some of them it makes sense to use 
     the values directly from sorted fields.
+    But beware the case when item->cmp_type() != item->result_type()
   */
   *plength= 0;
 
@@ -1733,3 +1808,4 @@ void change_double_for_sort(double nr,uchar *to)
     }
   }
 }
+
diff --git a/sql/filesort.h b/sql/filesort.h
index c1a101cc1e8..a4056415d32 100644
--- a/sql/filesort.h
+++ b/sql/filesort.h
@@ -31,6 +31,8 @@ ha_rows filesort(THD *thd, TABLE *table, st_sort_field *sortorder,
                  ha_rows max_rows, bool sort_positions,
                  ha_rows *examined_rows);
 void filesort_free_buffers(TABLE *table, bool full);
+double get_merge_many_buffs_cost(uint *buffer, uint last_n_elems,
+                                 int elem_size);
 void change_double_for_sort(double nr,uchar *to);
 
 #endif /* FILESORT_INCLUDED */
diff --git a/sql/ha_partition.cc b/sql/ha_partition.cc
index 512152f1f57..c5540976f93 100644
--- a/sql/ha_partition.cc
+++ b/sql/ha_partition.cc
@@ -166,6 +166,7 @@ ha_partition::ha_partition(handlerton *hton, TABLE_SHARE *share)
   :handler(hton, share)
 {
   DBUG_ENTER("ha_partition::ha_partition(table)");
+  init_alloc_root(&m_mem_root, 512, 512);
   init_handler_variables();
   DBUG_VOID_RETURN;
 }
@@ -187,6 +188,7 @@ ha_partition::ha_partition(handlerton *hton, partition_info *part_info)
 {
   DBUG_ENTER("ha_partition::ha_partition(part_info)");
   DBUG_ASSERT(part_info);
+  init_alloc_root(&m_mem_root, 512, 512);
   init_handler_variables();
   m_part_info= part_info;
   m_create_handler= TRUE;
@@ -213,6 +215,7 @@ ha_partition::ha_partition(handlerton *hton, TABLE_SHARE *share,
   :handler(hton, share)
 {
   DBUG_ENTER("ha_partition::ha_partition(clone)");
+  init_alloc_root(&m_mem_root, 512, 512);
   init_handler_variables();
   m_part_info= part_info_arg;
   m_create_handler= TRUE;
@@ -240,6 +243,7 @@ void ha_partition::init_handler_variables()
   m_file_buffer= NULL;
   m_name_buffer_ptr= NULL;
   m_engine_array= NULL;
+  m_connect_string= NULL;
   m_file= NULL;
   m_file_tot_parts= 0;
   m_reorged_file= NULL;
@@ -263,7 +267,6 @@ void ha_partition::init_handler_variables()
   m_extra_prepare_for_update= FALSE;
   m_extra_cache_part_id= NO_CURRENT_PART_ID;
   m_handler_status= handler_not_initialized;
-  m_low_byte_first= 1;
   m_part_field_array= NULL;
   m_ordered_rec_buffer= NULL;
   m_top_entry= NO_CURRENT_PART_ID;
@@ -319,8 +322,12 @@ ha_partition::~ha_partition()
       delete m_file[i];
   }
   my_free(m_ordered_rec_buffer);
+  m_ordered_rec_buffer= NULL;
 
   clear_handler_file();
+
+  free_root(&m_mem_root, MYF(0));
+
   DBUG_VOID_RETURN;
 }
 
@@ -365,7 +372,7 @@ ha_partition::~ha_partition()
      The flag HA_READ_ORDER will be reset for the time being to indicate no
      ordered output is available from partition handler indexes. Later a merge
      sort will be performed using the underlying handlers.
-  5) primary_key_is_clustered, has_transactions and low_byte_first is
+  5) primary_key_is_clustered and has_transactions are
      calculated here.
 
 */
@@ -401,24 +408,17 @@ bool ha_partition::initialize_partition(MEM_ROOT *mem_root)
     We create all underlying table handlers here. We do it in this special
     method to be able to report allocation errors.
 
-    Set up low_byte_first, primary_key_is_clustered and
+    Set up primary_key_is_clustered and
     has_transactions since they are called often in all kinds of places,
     other parameters are calculated on demand.
     Verify that all partitions have the same table_flags.
   */
   check_table_flags= m_file[0]->ha_table_flags();
-  m_low_byte_first= m_file[0]->low_byte_first();
   m_pkey_is_clustered= TRUE;
   file_array= m_file;
   do
   {
     file= *file_array;
-    if (m_low_byte_first != file->low_byte_first())
-    {
-      // Cannot have handlers with different endian
-      my_error(ER_MIX_HANDLER_ERROR, MYF(0));
-      DBUG_RETURN(1);
-    }
     if (!file->primary_key_is_clustered())
       m_pkey_is_clustered= FALSE;
     if (check_table_flags != file->ha_table_flags())
@@ -588,6 +588,13 @@ int ha_partition::create(const char *name, TABLE *table_arg,
   char t_name[FN_REFLEN];
   DBUG_ENTER("ha_partition::create");
 
+  if (create_info->used_fields & HA_CREATE_USED_CONNECTION)
+  {
+    my_error(ER_CONNECT_TO_FOREIGN_DATA_SOURCE, MYF(0),
+             "CONNECTION not valid for partition");
+    DBUG_RETURN(1);
+  }
+
   strmov(t_name, name);
   DBUG_ASSERT(*fn_rext((char*)name) == '\0');
   if (del_ren_cre_table(t_name, NULL, table_arg, create_info))
@@ -1172,7 +1179,8 @@ int ha_partition::handle_opt_partitions(THD *thd, HA_CHECK_OPT *check_opt,
                 error != HA_ADMIN_ALREADY_DONE &&
                 error != HA_ADMIN_TRY_ALTER)
             {
-              print_admin_msg(thd, "error", table_share->db.str, table->alias,
+              print_admin_msg(thd, "error", table_share->db.str,
+                              table->alias.c_ptr(),
                               opt_op_name[flag],
                               "Subpartition %s returned error", 
                               sub_elem->partition_name);
@@ -1198,7 +1206,8 @@ int ha_partition::handle_opt_partitions(THD *thd, HA_CHECK_OPT *check_opt,
               error != HA_ADMIN_ALREADY_DONE &&
               error != HA_ADMIN_TRY_ALTER)
           {
-            print_admin_msg(thd, "error", table_share->db.str, table->alias,
+            print_admin_msg(thd, "error", table_share->db.str,
+                            table->alias.c_ptr(),
                             opt_op_name[flag], "Partition %s returned error", 
                             part_elem->partition_name);
           }
@@ -1307,6 +1316,7 @@ int ha_partition::prepare_new_partition(TABLE *tbl,
   if ((error= set_up_table_before_create(tbl, part_name, create_info,
                                          0, p_elem)))
     goto error_create;
+  tbl->s->connect_string = p_elem->connect_string;
   if ((error= file->ha_create(part_name, tbl, create_info)))
   {
     /*
@@ -1336,7 +1346,7 @@ int ha_partition::prepare_new_partition(TABLE *tbl,
 
   DBUG_RETURN(0);
 error_external_lock:
-  (void) file->close();
+  (void) file->ha_close();
 error_open:
   (void) file->ha_delete_table(part_name);
 error_create:
@@ -1382,7 +1392,7 @@ void ha_partition::cleanup_new_partition(uint part_count)
     while ((part_count > 0) && (*file))
     {
       (*file)->ha_external_lock(thd, F_UNLCK);
-      (*file)->close();
+      (*file)->ha_close();
 
       /* Leave the (*file)->ha_delete_table(part_name) to the ddl-log */
 
@@ -1827,6 +1837,8 @@ void ha_partition::update_create_info(HA_CREATE_INFO *create_info)
     create_info->auto_increment_value= stats.auto_increment_value;
 
   create_info->data_file_name= create_info->index_file_name = NULL;
+  create_info->connect_string.str= NULL;
+  create_info->connect_string.length= 0;
   return;
 }
 
@@ -2115,6 +2127,10 @@ int ha_partition::set_up_table_before_create(TABLE *tbl,
   }
   info->index_file_name= part_elem->index_file_name;
   info->data_file_name= part_elem->data_file_name;
+  info->connect_string= part_elem->connect_string;
+  if (info->connect_string.length)
+    info->used_fields|= HA_CREATE_USED_CONNECTION;
+  tbl->s->connect_string= part_elem->connect_string;
   DBUG_RETURN(0);
 }
 
@@ -2229,8 +2245,10 @@ bool ha_partition::create_handler_file(const char *name)
   /* 4 static words (tot words, checksum, tot partitions, name length) */
   tot_len_words= 4 + tot_partition_words + tot_name_words;
   tot_len_byte= PAR_WORD_SIZE * tot_len_words;
-  if (!(file_buffer= (uchar *) my_malloc(tot_len_byte, MYF(MY_ZEROFILL))))
+  file_buffer= (uchar *) my_alloca(tot_len_byte);
+  if (!file_buffer)
     DBUG_RETURN(TRUE);
+  bzero(file_buffer, tot_len_byte);
   engine_array= (file_buffer + PAR_ENGINES_OFFSET);
   name_buffer_ptr= (char*) (engine_array + tot_partition_words * PAR_WORD_SIZE
                             + PAR_WORD_SIZE);
@@ -2290,11 +2308,28 @@ bool ha_partition::create_handler_file(const char *name)
   {
     result= mysql_file_write(file, (uchar *) file_buffer, tot_len_byte,
                              MYF(MY_WME | MY_NABP)) != 0;
+
+    /* Write connection information (for federatedx engine) */
+    part_it.rewind();
+    for (i= 0; i < num_parts && !result; i++)
+    {
+      uchar buffer[4];
+      part_elem= part_it++;
+      uint length = part_elem->connect_string.length;
+      int4store(buffer, length);
+      if (my_write(file, buffer, 4, MYF(MY_WME | MY_NABP)) ||
+          my_write(file, (uchar *) part_elem->connect_string.str, length,
+                   MYF(MY_WME | MY_NABP)))
+      {
+        result= TRUE;
+        break;
+      }
+    }
     (void) mysql_file_close(file, MYF(0));
   }
   else
     result= TRUE;
-  my_free(file_buffer);
+  my_afree((char*) file_buffer);
   DBUG_RETURN(result);
 }
 
@@ -2307,10 +2342,10 @@ void ha_partition::clear_handler_file()
 {
   if (m_engine_array)
     plugin_unlock_list(NULL, m_engine_array, m_tot_parts);
-  my_free(m_file_buffer);
-  my_free(m_engine_array);
+  free_root(&m_mem_root, MYF(MY_KEEP_PREALLOC));
   m_file_buffer= NULL;
   m_engine_array= NULL;
+  m_connect_string= NULL;
 }
 
 
@@ -2467,7 +2502,7 @@ bool ha_partition::read_par_file(const char *name)
   len_bytes= PAR_WORD_SIZE * len_words;
   if (mysql_file_seek(file, 0, MY_SEEK_SET, MYF(0)) == MY_FILEPOS_ERROR)
     goto err1;
-  if (!(file_buffer= (char*) my_malloc(len_bytes, MYF(0))))
+  if (!(file_buffer= (char*) alloc_root(&m_mem_root, len_bytes)))
     goto err1;
   if (mysql_file_read(file, (uchar *) file_buffer, len_bytes, MYF(MY_NABP)))
     goto err2;
@@ -2491,14 +2526,37 @@ bool ha_partition::read_par_file(const char *name)
   */
   if (len_words != (tot_partition_words + tot_name_words + 4))
     goto err2;
-  (void) mysql_file_close(file, MYF(0));
   m_file_buffer= file_buffer;          // Will be freed in clear_handler_file()
   m_name_buffer_ptr= tot_name_len_offset + PAR_WORD_SIZE;
 
+  if (!(m_connect_string= (LEX_STRING*)
+        alloc_root(&m_mem_root, m_tot_parts * sizeof(LEX_STRING))))
+    goto err2;
+  bzero(m_connect_string, m_tot_parts * sizeof(LEX_STRING));
+
+  /* Read connection arguments (for federatedx engine) */
+  for (i= 0; i < m_tot_parts; i++)
+  {
+    LEX_STRING connect_string;
+    uchar buffer[4];
+    if (my_read(file, buffer, 4, MYF(MY_NABP)))
+    {
+      /* No extra options; Probably not a federatedx engine */
+      break;
+    }
+    connect_string.length= uint4korr(buffer);
+    connect_string.str= (char*) alloc_root(&m_mem_root, connect_string.length+1);
+    if (my_read(file, (uchar*) connect_string.str, connect_string.length,
+                MYF(MY_NABP)))
+      break;
+    connect_string.str[connect_string.length]= 0;
+    m_connect_string[i]= connect_string;
+  }
+
+  (void) mysql_file_close(file, MYF(0));
   DBUG_RETURN(false);
 
 err2:
-  my_free(file_buffer);
 err1:
   (void) mysql_file_close(file, MYF(0));
   DBUG_RETURN(true);
@@ -2537,13 +2595,13 @@ bool ha_partition::setup_engine_array(MEM_ROOT *mem_root)
       goto err;
   }
   if (!(m_engine_array= (plugin_ref*)
-                my_malloc(m_tot_parts * sizeof(plugin_ref), MYF(MY_WME))))
+        alloc_root(&m_mem_root, m_tot_parts * sizeof(plugin_ref))))
     goto err;
 
   for (i= 0; i < m_tot_parts; i++)
     m_engine_array[i]= ha_lock_engine(NULL, engine_array[i]);
 
-  my_afree((gptr) engine_array);
+  my_afree(engine_array);
     
   if (create_handlers(mem_root))
   {
@@ -2554,7 +2612,7 @@ bool ha_partition::setup_engine_array(MEM_ROOT *mem_root)
   DBUG_RETURN(false);
 
 err:
-  my_afree((gptr) engine_array);
+  my_afree(engine_array);
   DBUG_RETURN(true);
 }
 
@@ -2732,8 +2790,10 @@ int ha_partition::open(const char *name, int mode, uint test_if_locked)
    {
       create_partition_name(name_buff, name, name_buffer_ptr, NORMAL_PART_NAME,
                             FALSE);
+      table->s->connect_string = m_connect_string[(uint)(file-m_file)];
       if ((error= (*file)->ha_open(table, name_buff, mode, test_if_locked)))
         goto err_handler;
+      bzero(&table->s->connect_string, sizeof(LEX_STRING));
       m_num_locks+= (*file)->lock_count();
       name_buffer_ptr+= strlen(name_buffer_ptr) + 1;
     } while (*(++file));
@@ -2829,7 +2889,7 @@ int ha_partition::open(const char *name, int mode, uint test_if_locked)
 err_handler:
   DEBUG_SYNC(ha_thd(), "partition_open_error");
   while (file-- != m_file)
-    (*file)->close();
+    (*file)->ha_close();
 err_alloc:
   bitmap_free(&m_bulk_insert_started);
   if (!m_is_clone_of)
@@ -2915,7 +2975,7 @@ int ha_partition::close(void)
 repeat:
   do
   {
-    (*file)->close();
+    (*file)->ha_close();
   } while (*(++file));
 
   if (first && m_added_file && m_added_file[0])
@@ -4236,6 +4296,7 @@ int ha_partition::index_init(uint inx, bool sorted)
   m_part_spec.start_part= NO_CURRENT_PART_ID;
   m_start_key.length= 0;
   m_ordered= sorted;
+  m_ordered_scan_ongoing= FALSE;
   m_curr_key_info[0]= table->key_info+inx;
   if (m_pkey_is_clustered && table->s->primary_key != MAX_KEY)
   {
@@ -4974,19 +5035,6 @@ int ha_partition::handle_unordered_scan_next_partition(uchar * buf)
       break;
     case partition_index_first:
       DBUG_PRINT("info", ("index_first on partition %d", i));
-      /*
-        MyISAM engine can fail if we call index_first() when indexes disabled
-        that happens if the table is empty.
-        Here we use file->stats.records instead of file->records() because
-        file->records() is supposed to return an EXACT count, and it can be
-        possibly slow. We don't need an exact number, an approximate one- from
-        the last ::info() call - is sufficient.
-      */
-      if (file->stats.records == 0)
-      {
-        error= HA_ERR_END_OF_FILE;
-        break;
-      }
       error= file->ha_index_first(buf);
       break;
     case partition_index_first_unordered:
@@ -5066,6 +5114,12 @@ int ha_partition::handle_ordered_index_scan(uchar *buf, bool reverse_order)
     int error;
     handler *file= m_file[i];
 
+    /*
+      Reset null bits to avoid valgrind warnings and to give a default
+      value to null fields that are not read.
+    */
+    bfill(rec_buf_ptr, table->s->null_bytes, 255);
+
     switch (m_index_scan_type) {
     case partition_index_read:
       error= file->ha_index_read_map(rec_buf_ptr,
@@ -5074,36 +5128,10 @@ int ha_partition::handle_ordered_index_scan(uchar *buf, bool reverse_order)
                                      m_start_key.flag);
       break;
     case partition_index_first:
-      /*
-        MyISAM engine can fail if we call index_first() when indexes disabled
-        that happens if the table is empty.
-        Here we use file->stats.records instead of file->records() because
-        file->records() is supposed to return an EXACT count, and it can be
-        possibly slow. We don't need an exact number, an approximate one- from
-        the last ::info() call - is sufficient.
-      */
-      if (file->stats.records == 0)
-      {
-        error= HA_ERR_END_OF_FILE;
-        break;
-      }
       error= file->ha_index_first(rec_buf_ptr);
       reverse_order= FALSE;
       break;
     case partition_index_last:
-      /*
-        MyISAM engine can fail if we call index_last() when indexes disabled
-        that happens if the table is empty.
-        Here we use file->stats.records instead of file->records() because
-        file->records() is supposed to return an EXACT count, and it can be
-        possibly slow. We don't need an exact number, an approximate one- from
-        the last ::info() call - is sufficient.
-      */
-      if (file->stats.records == 0)
-      {
-        error= HA_ERR_END_OF_FILE;
-        break;
-      }
       error= file->ha_index_last(rec_buf_ptr);
       reverse_order= TRUE;
       break;
@@ -5934,6 +5962,7 @@ int ha_partition::extra(enum ha_extra_function operation)
   /* Category 3), used by MyISAM handlers */
   case HA_EXTRA_PREPARE_FOR_RENAME:
     DBUG_RETURN(prepare_for_rename());
+    break;
   case HA_EXTRA_PREPARE_FOR_UPDATE:
     /*
       Needs to be run on the first partition in the range now, and 
diff --git a/sql/ha_partition.h b/sql/ha_partition.h
index c2bdd23a256..45ac8168513 100644
--- a/sql/ha_partition.h
+++ b/sql/ha_partition.h
@@ -63,12 +63,14 @@ private:
   uint m_open_test_lock;                // Open test_if_locked
   char *m_file_buffer;                  // Content of the .par file 
   char *m_name_buffer_ptr;		// Pointer to first partition name
+  MEM_ROOT m_mem_root;
   plugin_ref *m_engine_array;           // Array of types of the handlers
   handler **m_file;                     // Array of references to handler inst.
   uint m_file_tot_parts;                // Debug
   handler **m_new_file;                 // Array of references to new handlers
   handler **m_reorged_file;             // Reorganised partitions
   handler **m_added_file;               // Added parts kept for errors
+  LEX_STRING *m_connect_string;
   partition_info *m_part_info;          // local reference to partition
   Field **m_part_field_array;           // Part field array locally to save acc
   uchar *m_ordered_rec_buffer;          // Row and key buffer for ord. idx scan
@@ -89,7 +91,6 @@ private:
     for this since the MySQL Server sometimes allocating the handler object
     without freeing them.
   */
-  ulong m_low_byte_first;
   enum enum_handler_status
   {
     handler_not_initialized= 0,
@@ -247,7 +248,6 @@ public:
     DBUG_RETURN(0);
   }
   virtual void change_table_ptr(TABLE *table_arg, TABLE_SHARE *share);
-  bool check_if_supported_virtual_columns(void) { return TRUE;}
   virtual bool check_if_incompatible_data(HA_CREATE_INFO *create_info,
                                           uint table_changes);
 private:
@@ -864,6 +864,10 @@ public:
   */
   virtual ulong index_flags(uint inx, uint part, bool all_parts) const
   {
+    /*
+      The following code is not safe if you are using different
+      storage engines or different index types per partition.
+    */
     return m_file[0]->index_flags(inx, part, all_parts);
   }
 
@@ -890,12 +894,6 @@ public:
   virtual uint max_supported_key_part_length() const;
 
   /*
-    All handlers in a partitioned table must have the same low_byte_first
-  */
-  virtual bool low_byte_first() const
-  { return m_low_byte_first; }
-
-  /*
     The extra record buffer length is the maximum needed by all handlers.
     The minimum record length is the maximum of all involved handlers.
   */
diff --git a/sql/handler.cc b/sql/handler.cc
index 8e0812f3528..1aeb818b65d 100644
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@ -1,5 +1,5 @@
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
-   Copyright (c) 2011 Monty Program Ab
+/* Copyright (c) 2000, 2011, Oracle and/or its affiliates.
+   Copyright (c) 2009-2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -98,6 +98,9 @@ TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"",
 static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
 uint known_extensions_id= 0;
 
+static int commit_one_phase_2(THD *thd, bool all, THD_TRANS *trans,
+                              bool is_real_trans);
+
 static plugin_ref ha_default_plugin(THD *thd)
 {
   if (thd->variables.table_plugin)
@@ -359,6 +362,7 @@ int ha_init_errors(void)
   SETMSG(HA_ERR_AUTOINC_ERANGE,         ER_DEFAULT(ER_WARN_DATA_OUT_OF_RANGE));
   SETMSG(HA_ERR_TOO_MANY_CONCURRENT_TRXS, ER_DEFAULT(ER_TOO_MANY_CONCURRENT_TRXS));
   SETMSG(HA_ERR_INDEX_COL_TOO_LONG,	ER_DEFAULT(ER_INDEX_COLUMN_TOO_LONG));
+  SETMSG(HA_ERR_DISK_FULL,              ER_DEFAULT(ER_DISK_FULL));
 
   /* Register the error messages for use with my_error(). */
   return my_error_register(get_handler_errmsgs, HA_ERR_FIRST, HA_ERR_LAST);
@@ -626,6 +630,23 @@ void ha_drop_database(char* path)
 }
 
 
+static my_bool checkpoint_state_handlerton(THD *unused1, plugin_ref plugin,
+                                           void *disable)
+{
+  handlerton *hton= plugin_data(plugin, handlerton *);
+  if (hton->state == SHOW_OPTION_YES && hton->checkpoint_state)
+    hton->checkpoint_state(hton, (int) *(bool*) disable);
+  return FALSE;
+}
+
+
+void ha_checkpoint_state(bool disable)
+{
+  plugin_foreach(NULL, checkpoint_state_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, &disable);
+}
+
+
+
 static my_bool closecon_handlerton(THD *thd, plugin_ref plugin,
                                    void *unused)
 {
@@ -1107,7 +1128,7 @@ ha_check_and_coalesce_trx_read_only(THD *thd, Ha_trx_info *ha_list,
 */
 int ha_commit_trans(THD *thd, bool all)
 {
-  int error= 0, cookie= 0;
+  int error= 0, cookie;
   /*
     'all' means that this is either an explicit commit issued by
     user, or an implicit commit issued by a DDL.
@@ -1122,7 +1143,8 @@ int ha_commit_trans(THD *thd, bool all)
   */
   bool is_real_trans= all || thd->transaction.all.ha_list == 0;
   Ha_trx_info *ha_info= trans->ha_list;
-  my_xid xid= thd->transaction.xid_state.xid.get_my_xid();
+  bool need_prepare_ordered, need_commit_ordered;
+  my_xid xid;
   DBUG_ENTER("ha_commit_trans");
 
   /* Just a random warning to test warnings pushed during autocommit. */
@@ -1165,115 +1187,138 @@ int ha_commit_trans(THD *thd, bool all)
     ha_maria::implicit_commit(thd, FALSE);
 #endif
 
-  if (ha_info)
+  if (!ha_info)
   {
-    uint rw_ha_count;
-    bool rw_trans;
-    MDL_request mdl_request;
-
-    DBUG_EXECUTE_IF("crash_commit_before", DBUG_SUICIDE(););
-
-    /* Close all cursors that can not survive COMMIT */
-    if (is_real_trans)                          /* not a statement commit */
-      thd->stmt_map.close_transient_cursors();
+    /* Free resources and perform other cleanup even for 'empty' transactions. */
+    if (is_real_trans)
+      thd->transaction.cleanup();
+    DBUG_RETURN(0);
+  }
 
-    rw_ha_count= ha_check_and_coalesce_trx_read_only(thd, ha_info, all);
-    /* rw_trans is TRUE when we in a transaction changing data */
-    rw_trans= is_real_trans && (rw_ha_count > 0);
+  DBUG_EXECUTE_IF("crash_commit_before", DBUG_SUICIDE(););
 
-    if (rw_trans)
-    {
-      /*
-        Acquire a metadata lock which will ensure that COMMIT is blocked
-        by an active FLUSH TABLES WITH READ LOCK (and vice versa:
-        COMMIT in progress blocks FTWRL).
+  /* Close all cursors that can not survive COMMIT */
+  if (is_real_trans)                          /* not a statement commit */
+    thd->stmt_map.close_transient_cursors();
 
-        We allow the owner of FTWRL to COMMIT; we assume that it knows
-        what it does.
-      */
-      mdl_request.init(MDL_key::COMMIT, "", "", MDL_INTENTION_EXCLUSIVE,
-                       MDL_EXPLICIT);
+  uint rw_ha_count= ha_check_and_coalesce_trx_read_only(thd, ha_info, all);
+  /* rw_trans is TRUE when we are in a transaction changing data */
+  bool rw_trans= is_real_trans && (rw_ha_count > 0);
+  MDL_request mdl_request;
 
-      if (thd->mdl_context.acquire_lock(&mdl_request,
-                                        thd->variables.lock_wait_timeout))
-      {
-        ha_rollback_trans(thd, all);
-        DBUG_RETURN(1);
-      }
+  if (rw_trans)
+  {
+    /*
+      Acquire a metadata lock which will ensure that COMMIT is blocked
+      by an active FLUSH TABLES WITH READ LOCK (and vice versa:
+      COMMIT in progress blocks FTWRL).
 
-      DEBUG_SYNC(thd, "ha_commit_trans_after_acquire_commit_lock");
-    }
+      We allow the owner of FTWRL to COMMIT; we assume that it knows
+      what it does.
+    */
+    mdl_request.init(MDL_key::COMMIT, "", "", MDL_INTENTION_EXCLUSIVE,
+                     MDL_EXPLICIT);
 
-    if (rw_trans &&
-        opt_readonly &&
-        !(thd->security_ctx->master_access & SUPER_ACL) &&
-        !thd->slave_thread)
+    if (thd->mdl_context.acquire_lock(&mdl_request,
+                                      thd->variables.lock_wait_timeout))
     {
-      my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
       ha_rollback_trans(thd, all);
-      error= 1;
-      goto end;
+      DBUG_RETURN(1);
     }
 
-    if (!trans->no_2pc && (rw_ha_count > 1))
-    {
-      for (; ha_info && !error; ha_info= ha_info->next())
-      {
-        int err;
-        handlerton *ht= ha_info->ht();
-        /*
-          Do not call two-phase commit if this particular
-          transaction is read-only. This allows for simpler
-          implementation in engines that are always read-only.
-        */
-        if (! ha_info->is_trx_read_write())
-          continue;
-        /*
-          Sic: we know that prepare() is not NULL since otherwise
-          trans->no_2pc would have been set.
-        */
-        if ((err= ht->prepare(ht, thd, all)))
-        {
-          my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
-          error= 1;
-        }
-        status_var_increment(thd->status_var.ha_prepare_count);
-      }
-      DBUG_EXECUTE_IF("crash_commit_after_prepare", DBUG_SUICIDE(););
-      if (error || (is_real_trans && xid &&
-                    (error= !(cookie= tc_log->log_xid(thd, xid)))))
-      {
-        ha_rollback_trans(thd, all);
-        error= 1;
-        goto end;
-      }
-      DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_SUICIDE(););
-    }
-    error=ha_commit_one_phase(thd, all) ? (cookie ? 2 : 1) : 0;
-    DBUG_EXECUTE_IF("crash_commit_before_unlog", DBUG_SUICIDE(););
-    if (cookie)
-      if(tc_log->unlog(cookie, xid))
-      {
-        error= 2;
-        goto end;
-      }
-    DBUG_EXECUTE_IF("crash_commit_after", DBUG_SUICIDE(););
-    RUN_HOOK(transaction, after_commit, (thd, FALSE));
+    DEBUG_SYNC(thd, "ha_commit_trans_after_acquire_commit_lock");
+  }
+
+  if (rw_trans &&
+      opt_readonly &&
+      !(thd->security_ctx->master_access & SUPER_ACL) &&
+      !thd->slave_thread)
+  {
+    my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
+    goto err;
+  }
+
+  if (trans->no_2pc || (rw_ha_count <= 1))
+  {
+    error= ha_commit_one_phase(thd, all);
+    goto done;
+  }
+
+  need_prepare_ordered= FALSE;
+  need_commit_ordered= FALSE;
+  xid= thd->transaction.xid_state.xid.get_my_xid();
+
+  for (Ha_trx_info *hi= ha_info; hi; hi= hi->next())
+  {
+    int err;
+    handlerton *ht= hi->ht();
+    /*
+      Do not call two-phase commit if this particular
+      transaction is read-only. This allows for simpler
+      implementation in engines that are always read-only.
+    */
+    if (! hi->is_trx_read_write())
+      continue;
+    /*
+      Sic: we know that prepare() is not NULL since otherwise
+      trans->no_2pc would have been set.
+    */
+    err= ht->prepare(ht, thd, all);
+    status_var_increment(thd->status_var.ha_prepare_count);
+    if (err)
+      my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
+
+    if (err)
+      goto err;
+
+    need_prepare_ordered|= (ht->prepare_ordered != NULL);
+    need_commit_ordered|= (ht->commit_ordered != NULL);
+  }
+  DBUG_EXECUTE_IF("crash_commit_after_prepare", DBUG_SUICIDE(););
+
+  if (!is_real_trans)
+  {
+    error= commit_one_phase_2(thd, all, trans, is_real_trans);
+    goto done;
+  }
+
+  cookie= tc_log->log_and_order(thd, xid, all, need_prepare_ordered,
+                                need_commit_ordered);
+  if (!cookie)
+    goto err;
+
+  DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_SUICIDE(););
+
+  error= commit_one_phase_2(thd, all, trans, is_real_trans) ? 2 : 0;
+
+  DBUG_EXECUTE_IF("crash_commit_before_unlog", DBUG_SUICIDE(););
+  if (tc_log->unlog(cookie, xid))
+  {
+    error= 2;                                /* Error during commit */
+    goto end;
+  }
+
+done:
+  DBUG_EXECUTE_IF("crash_commit_after", DBUG_SUICIDE(););
+  RUN_HOOK(transaction, after_commit, (thd, FALSE));
+  goto end;
+
+  /* Come here if error and we need to rollback. */
+err:
+  error= 1;                                  /* Transaction was rolled back */
+  ha_rollback_trans(thd, all);
+
 end:
-    if (rw_trans && mdl_request.ticket)
-    {
-      /*
-        We do not always immediately release transactional locks
-        after ha_commit_trans() (see uses of ha_enable_transaction()),
-        thus we release the commit blocker lock as soon as it's
-        not needed.
-      */
-      thd->mdl_context.release_lock(mdl_request.ticket);
-    }
+  if (rw_trans && mdl_request.ticket)
+  {
+    /*
+      We do not always immediately release transactional locks
+      after ha_commit_trans() (see uses of ha_enable_transaction()),
+      thus we release the commit blocker lock as soon as it's
+      not needed.
+    */
+    thd->mdl_context.release_lock(mdl_request.ticket);
   }
-  /* Free resources and perform other cleanup even for 'empty' transactions. */
-  else if (is_real_trans)
-    thd->transaction.cleanup();
   DBUG_RETURN(error);
 }
 
@@ -1290,7 +1335,6 @@ end:
 
 int ha_commit_one_phase(THD *thd, bool all)
 {
-  int error=0;
   THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
   /*
     "real" is a nick name for a transaction for which a commit will
@@ -1306,9 +1350,18 @@ int ha_commit_one_phase(THD *thd, bool all)
     transaction.all.ha_list, see why in trans_register_ha()).
   */
   bool is_real_trans=all || thd->transaction.all.ha_list == 0;
-  Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
   DBUG_ENTER("ha_commit_one_phase");
+  int res= commit_one_phase_2(thd, all, trans, is_real_trans);
+  DBUG_RETURN(res);
+}
 
+
+static int
+commit_one_phase_2(THD *thd, bool all, THD_TRANS *trans, bool is_real_trans)
+{
+  int error= 0;
+  Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
+  DBUG_ENTER("commit_one_phase_2");
   if (ha_info)
   {
     for (; ha_info; ha_info= ha_info_next)
@@ -1331,7 +1384,7 @@ int ha_commit_one_phase(THD *thd, bool all)
     {
 #ifdef HAVE_QUERY_CACHE
       if (thd->transaction.changed_tables)
-        query_cache.invalidate(thd->transaction.changed_tables);
+        query_cache.invalidate(thd, thd->transaction.changed_tables);
 #endif
     }
   }
@@ -1893,7 +1946,16 @@ int ha_start_consistent_snapshot(THD *thd)
 {
   bool warn= true;
 
+  /*
+    Holding the LOCK_commit_ordered mutex ensures that we get the same
+    snapshot for all engines (including the binary log).  This allows us
+    among other things to do backups with
+    START TRANSACTION WITH CONSISTENT SNAPSHOT and
+    have a consistent binlog position.
+  */
+  mysql_mutex_lock(&LOCK_commit_ordered);
   plugin_foreach(thd, snapshot_handlerton, MYSQL_STORAGE_ENGINE_PLUGIN, &warn);
+  mysql_mutex_unlock(&LOCK_commit_ordered);
 
   /*
     Same idea as when one wants to CREATE TABLE in one engine which does not
@@ -2060,7 +2122,8 @@ int ha_delete_table(THD *thd, handlerton *table_type, const char *path,
     dummy_share.db.length= strlen(db);
     dummy_share.table_name.str= (char*) alias;
     dummy_share.table_name.length= strlen(alias);
-    dummy_table.alias= alias;
+    dummy_table.alias.set(alias, dummy_share.table_name.length,
+                          table_alias_charset);
 
     file->change_table_ptr(&dummy_table, &dummy_share);
 
@@ -2086,28 +2149,34 @@ int ha_delete_table(THD *thd, handlerton *table_type, const char *path,
 handler *handler::clone(const char *name, MEM_ROOT *mem_root)
 {
   handler *new_handler= get_new_handler(table->s, mem_root, ht);
+  if (! new_handler)
+    return NULL;
+
   /*
     Allocate handler->ref here because otherwise ha_open will allocate it
     on this->table->mem_root and we will not be able to reclaim that memory 
     when the clone handler object is destroyed.
   */
-  if (new_handler &&
-     !(new_handler->ref= (uchar*) alloc_root(mem_root,
-                                             ALIGN_SIZE(ref_length)*2)))
-    new_handler= NULL;
+
+  if (!(new_handler->ref= (uchar*) alloc_root(mem_root,
+                                              ALIGN_SIZE(ref_length)*2)))
+    return NULL;
+
   /*
     TODO: Implement a more efficient way to have more than one index open for
     the same table instance. The ha_open call is not cachable for clone.
+
+    This is not critical as the engines already have the table open
+    and should be able to use the original instance of the table.
   */
-  if (new_handler && new_handler->ha_open(table,
-                                          name,
-                                          table->db_stat,
-                                          HA_OPEN_IGNORE_IF_LOCKED))
-    new_handler= NULL;
+  if (new_handler->ha_open(table, name, table->db_stat,
+                           HA_OPEN_IGNORE_IF_LOCKED))
+    return NULL;
 
   return new_handler;
 }
 
+
 double handler::keyread_time(uint index, uint ranges, ha_rows rows)
 {
   /*
@@ -2148,7 +2217,7 @@ PSI_table_share *handler::ha_table_share_psi(const TABLE_SHARE *share) const
     Don't wait for locks if not HA_OPEN_WAIT_IF_LOCKED is set
 */
 int handler::ha_open(TABLE *table_arg, const char *name, int mode,
-                     int test_if_locked)
+                     uint test_if_locked)
 {
   int error;
   DBUG_ENTER("handler::ha_open");
@@ -2192,11 +2261,22 @@ int handler::ha_open(TABLE *table_arg, const char *name, int mode,
       dup_ref=ref+ALIGN_SIZE(ref_length);
     cached_table_flags= table_flags();
   }
-  rows_read= rows_changed= 0;
-  memset(index_rows_read, 0, sizeof(index_rows_read));
+  reset_statistics();
+  internal_tmp_table= test(test_if_locked & HA_OPEN_INTERNAL_TABLE);
   DBUG_RETURN(error);
 }
 
+int handler::ha_close()
+{
+  DBUG_ENTER("ha_close");
+  /*
+    Increment global statistics for temporary tables.
+    in_use is 0 for tables that were closed from the table cache.
+  */
+  if (table->in_use)
+    status_var_add(table->in_use->status_var.rows_tmp_read, rows_tmp_read);
+  DBUG_RETURN(close());
+}
 
 /* Initialize handler for random reading, with error handling */
 
@@ -2589,8 +2669,9 @@ int handler::update_auto_increment()
 void handler::column_bitmaps_signal()
 {
   DBUG_ENTER("column_bitmaps_signal");
-  DBUG_PRINT("info", ("read_set: 0x%lx  write_set: 0x%lx", (long) table->read_set,
-                      (long) table->write_set));
+  if (table)
+    DBUG_PRINT("info", ("read_set: 0x%lx  write_set: 0x%lx",
+                        (long) table->read_set, (long) table->write_set));
   DBUG_VOID_RETURN;
 }
 
@@ -2667,6 +2748,7 @@ void handler::get_auto_increment(ulonglong offset, ulonglong increment,
 
 void handler::ha_release_auto_increment()
 {
+  DBUG_ENTER("ha_release_auto_increment");
   release_auto_increment();
   insert_id_for_cur_row= 0;
   auto_inc_interval_for_cur_row.replace(0, 0, 0);
@@ -2680,6 +2762,7 @@ void handler::ha_release_auto_increment()
     */
     table->in_use->auto_inc_intervals_forced.empty();
   }
+  DBUG_VOID_RETURN;
 }
 
 
@@ -2721,17 +2804,11 @@ void handler::print_keydup_error(uint key_nr, const char *msg)
     - table->alias
 */
 
-#ifndef DBUG_OFF
 #define SET_FATAL_ERROR fatal_error=1
-#else
-#define SET_FATAL_ERROR
-#endif
 
 void handler::print_error(int error, myf errflag)
 {
-#ifndef DBUG_OFF
   bool fatal_error= 0;
-#endif
   DBUG_ENTER("handler::print_error");
   DBUG_PRINT("enter",("error: %d",error));
 
@@ -2746,6 +2823,11 @@ void handler::print_error(int error, myf errflag)
   case ENOENT:
     textno=ER_FILE_NOT_FOUND;
     break;
+  case ENOSPC:
+  case HA_ERR_DISK_FULL:
+    textno= ER_DISK_FULL;
+    SET_FATAL_ERROR;                            // Ensure error is logged
+    break;
   case HA_ERR_KEY_NOT_FOUND:
   case HA_ERR_NO_ACTIVE_RECORD:
   case HA_ERR_RECORD_DELETED:
@@ -2759,6 +2841,12 @@ void handler::print_error(int error, myf errflag)
     SET_FATAL_ERROR;
     textno=ER_KEY_NOT_FOUND;
     break;
+  case HA_ERR_ABORTED_BY_USER:
+  {
+    DBUG_ASSERT(table->in_use->killed);
+    table->in_use->send_kill_message();
+    DBUG_VOID_RETURN;
+  }
   case HA_ERR_WRONG_MRG_TABLE_DEF:
     textno=ER_WRONG_MRG_TABLE;
     break;
@@ -2808,7 +2896,10 @@ void handler::print_error(int error, myf errflag)
     textno=ER_DUP_UNIQUE;
     break;
   case HA_ERR_RECORD_CHANGED:
-    SET_FATAL_ERROR;
+    /*
+      This is not a fatal error when using the HANDLER interface
+      SET_FATAL_ERROR;
+    */
     textno=ER_CHECKREAD;
     break;
   case HA_ERR_CRASHED:
@@ -2930,11 +3021,12 @@ void handler::print_error(int error, myf errflag)
       {
 	const char* engine= table_type();
 	if (temporary)
-	  my_error(ER_GET_TEMPORARY_ERRMSG, MYF(0), error, str.ptr(), engine);
+	  my_error(ER_GET_TEMPORARY_ERRMSG, MYF(0), error, str.c_ptr(),
+                   engine);
 	else
         {
           SET_FATAL_ERROR;
-	  my_error(ER_GET_ERRMSG, MYF(0), error, str.ptr(), engine);
+	  my_error(ER_GET_ERRMSG, MYF(0), error, str.c_ptr(), engine);
         }
       }
       else
@@ -2942,6 +3034,15 @@ void handler::print_error(int error, myf errflag)
       DBUG_VOID_RETURN;
     }
   }
+  if (fatal_error && (debug_assert_if_crashed_table ||
+                      global_system_variables.log_warnings > 1))
+  {
+    /*
+      Log error to log before we crash or if extended warnings are requested
+    */
+    errflag|= ME_NOREFRESH;
+  }
+    
   my_error(textno, errflag, table_share->table_name.str, error);
   DBUG_VOID_RETURN;
 }
@@ -3197,7 +3298,7 @@ int handler::rename_table(const char * from, const char * to)
 
 void handler::drop_table(const char *name)
 {
-  close();
+  ha_close();
   delete_table(name);
 }
 
@@ -3498,6 +3599,9 @@ handler::ha_delete_table(const char *name)
   Drop table in the engine: public interface.
 
   @sa handler::drop_table()
+
+  The difference between this and delete_table() is that the table is open in
+  drop_table().
 */
 
 void
@@ -3702,6 +3806,7 @@ void handler::update_global_table_stats()
   TABLE_STATS * table_stats;
 
   status_var_add(table->in_use->status_var.rows_read, rows_read);
+  DBUG_ASSERT(rows_tmp_read == 0);
 
   if (!table->in_use->userstat_running)
   {
@@ -3952,6 +4057,7 @@ ha_check_if_table_exists(THD* thd, const char *db, const char *name,
 void st_ha_check_opt::init()
 {
   flags= sql_flags= 0;
+  start_time= my_time(0);
 }
 
 
@@ -4487,11 +4593,11 @@ int handler::index_read_idx_map(uchar * buf, uint index, const uchar * key,
   int error, error1;
   LINT_INIT(error1);
 
-  error= index_init(index, 0);
+  error= ha_index_init(index, 0);
   if (!error)
   {
     error= index_read_map(buf, key, keypart_map, find_flag);
-    error1= index_end();
+    error1= ha_index_end();
   }
   return error ?  error : error1;
 }
@@ -4666,7 +4772,8 @@ static bool check_table_binlog_row_based(THD *thd, TABLE *table)
 
 /** @brief
    Write table maps for all (manually or automatically) locked tables
-   to the binary log.
+   to the binary log. Also, if binlog_annotate_rows_events is ON,
+   write Annotate_rows event before the first table map.
 
    SYNOPSIS
      write_locked_table_maps()
@@ -4698,6 +4805,9 @@ static int write_locked_table_maps(THD *thd)
     MYSQL_LOCK *locks[2];
     locks[0]= thd->extra_lock;
     locks[1]= thd->lock;
+    my_bool with_annotate= thd->variables.binlog_annotate_rows_events &&
+                           thd->query() && thd->query_length();
+
     for (uint i= 0 ; i < sizeof(locks)/sizeof(*locks) ; ++i )
     {
       MYSQL_LOCK const *const lock= locks[i];
@@ -4729,7 +4839,8 @@ static int write_locked_table_maps(THD *thd)
           */
           bool const has_trans= thd->lex->sql_command == SQLCOM_CREATE_TABLE ||
                                 table->file->has_transactions();
-          int const error= thd->binlog_write_table_map(table, has_trans);
+          int const error= thd->binlog_write_table_map(table, has_trans,
+                                                       &with_annotate);
           /*
             If an error occurs, it is the responsibility of the caller to
             roll back the transaction.
diff --git a/sql/handler.h b/sql/handler.h
index c64859bafc3..b27c897364b 100644
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -1,7 +1,7 @@
 #ifndef HANDLER_INCLUDED
 #define HANDLER_INCLUDED
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
-   Copyright (c) 2010-2011 Monty Program Ab
+/* Copyright (c) 2000, 2011, Oracle and/or its affiliates.
+   Copyright (c) 2009-2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License
@@ -170,8 +170,9 @@
 
 /* Has automatic checksums and uses the new checksum format */
 #define HA_HAS_NEW_CHECKSUM    (LL(1) << 38)
-
-#define HA_MRR_CANT_SORT       (LL(1) << 39)
+#define HA_CAN_VIRTUAL_COLUMNS (LL(1) << 39)
+#define HA_MRR_CANT_SORT       (LL(1) << 40)
+#define HA_RECORD_MUST_BE_CLEAN_ON_WRITE (LL(1) << 41)
 
 /*
   Set of all binlog flags. Currently only contain the capabilities
@@ -193,8 +194,11 @@
 */
 #define HA_KEY_SCAN_NOT_ROR     128 
 #define HA_DO_INDEX_COND_PUSHDOWN  256 /* Supports Index Condition Pushdown */
-
-
+/*
+  Data is clustered on this key. This means that when you read the key
+  you also get the row data without any additional disk reads.
+*/
+#define HA_CLUSTERED_INDEX      512
 
 /*
   bits in alter_table_flags:
@@ -687,6 +691,11 @@ struct handler_log_file_data {
 
   See ha_example.cc for an example.
 */
+
+struct ha_table_option_struct;
+struct ha_field_option_struct;
+struct ha_index_option_struct;
+
 enum ha_option_type { HA_OPTION_TYPE_ULL,    /* unsigned long long */
                       HA_OPTION_TYPE_STRING, /* char * */
                       HA_OPTION_TYPE_ENUM,   /* uint */
@@ -859,12 +868,113 @@ struct handlerton
      NOTE 'all' is also false in auto-commit mode where 'end of statement'
      and 'real commit' mean the same event.
    */
-   int  (*commit)(handlerton *hton, THD *thd, bool all);
+   int (*commit)(handlerton *hton, THD *thd, bool all);
+   /*
+     The commit_ordered() method is called prior to the commit() method, after
+     the transaction manager has decided to commit (not rollback) the
+     transaction. Unlike commit(), commit_ordered() is called only when the
+     full transaction is committed, not for each commit of statement
+     transaction in a multi-statement transaction.
+
+     Note that, like prepare(), commit_ordered() is only called when 2-phase
+     commit takes place. I.e. when no binary log and only a single engine
+     participates in a transaction, one commit() is called, no
+     commit_ordered(). So engines must be prepared for this.
+
+     The calls to commit_ordered() in multiple parallel transactions are
+     guaranteed to happen in the same order in every participating
+     handler. This can be used to ensure the same commit order among multiple
+     handlers (eg. in table handler and binlog). So if transaction T1 calls
+     into commit_ordered() of handler A before T2, then T1 will also call
+     commit_ordered() of handler B before T2.
+
+     Engines that implement this method should during this call make the
+     transaction visible to other transactions, thereby making the order of
+     transaction commits be defined by the order of commit_ordered() calls.
+
+     The intention is that commit_ordered() should do the minimal amount of
+     work that needs to happen in consistent commit order among handlers. To
+     preserve ordering, calls need to be serialised on a global mutex, so
+     doing any time-consuming or blocking operations in commit_ordered() will
+     limit scalability.
+
+     Handlers can rely on commit_ordered() calls to be serialised (no two
+     calls can run in parallel, so no extra locking on the handler part is
+     required to ensure this).
+
+     Note that commit_ordered() can be called from a different thread than the
+     one handling the transaction! So it can not do anything that depends on
+     thread local storage, in particular it can not call my_error() and
+     friends (instead it can store the error code and delay the call of
+     my_error() to the commit() method).
+
+     Similarly, since commit_ordered() returns void, any return error code
+     must be saved and returned from the commit() method instead.
+
+     The commit_ordered method is optional, and can be left unset if not
+     needed in a particular handler (then there will be no ordering guarantees
+     wrt. other engines and binary log).
+   */
+   void (*commit_ordered)(handlerton *hton, THD *thd, bool all);
    int  (*rollback)(handlerton *hton, THD *thd, bool all);
    int  (*prepare)(handlerton *hton, THD *thd, bool all);
+   /*
+     The prepare_ordered method is optional. If set, it will be called after
+     successful prepare() in all handlers participating in 2-phase
+     commit. Like commit_ordered(), it is called only when the full
+     transaction is committed, not for each commit of statement transaction.
+
+     The calls to prepare_ordered() among multiple parallel transactions are
+     ordered consistently with calls to commit_ordered(). This means that
+     calls to prepare_ordered() effectively define the commit order, and that
+     each handler will see the same sequence of transactions calling into
+     prepare_ordered() and commit_ordered().
+
+     Thus, prepare_ordered() can be used to define commit order for handlers
+     that need to do this in the prepare step (like binlog). It can also be
+     used to release transaction's locks early in an order consistent with the
+     order transactions will be eventually committed.
+
+     Like commit_ordered(), prepare_ordered() calls are serialised to maintain
+     ordering, so the intention is that they should execute fast, with only
+     the minimal amount of work needed to define commit order. Handlers can
+     rely on this serialisation, and do not need to do any extra locking to
+     avoid two prepare_ordered() calls running in parallel.
+
+     Like commit_ordered(), prepare_ordered() is not guaranteed to be called
+     in the context of the thread handling the rest of the transaction. So it
+     cannot invoke code that relies on thread local storage, in particular it
+     cannot call my_error().
+
+     prepare_ordered() cannot cause a rollback by returning an error, all
+     possible errors must be handled in prepare() (the prepare_ordered()
+     method returns void). In case of some fatal error, a record of the error
+     must be made internally by the engine and returned from commit() later.
+
+     Note that for user-level XA SQL commands, no consistent ordering among
+     prepare_ordered() and commit_ordered() is guaranteed (as that would
+     require blocking all other commits for an indefinite time).
+
+     When 2-phase commit is not used (eg. only one engine (and no binlog) in
+     transaction), neither prepare() nor prepare_ordered() is called.
+   */
+   void (*prepare_ordered)(handlerton *hton, THD *thd, bool all);
    int  (*recover)(handlerton *hton, XID *xid_list, uint len);
    int  (*commit_by_xid)(handlerton *hton, XID *xid);
    int  (*rollback_by_xid)(handlerton *hton, XID *xid);
+  /*
+    Disable or enable checkpointing internal to the storage engine. This is
+    used for FLUSH TABLES WITH READ LOCK AND DISABLE CHECKPOINT to ensure that
+    the engine will never start any recovery from a time between
+    FLUSH TABLES ... ; UNLOCK TABLES.
+
+    While checkpointing is disabled, the engine should pause any background
+    write activity (such as tablespace checkpointing) that requires consistency
+    between different files (such as transaction log and tablespace files) for
+    crash recovery to succeed. The idea is to use this to make safe
+    multi-volume LVM snapshot backups.
+  */
+   int  (*checkpoint_state)(handlerton *hton, bool disabled);
    void *(*create_cursor_read_view)(handlerton *hton, THD *thd);
    void (*set_cursor_read_view)(handlerton *hton, THD *thd, void *read_view);
    void (*close_cursor_read_view)(handlerton *hton, THD *thd, void *read_view);
@@ -1151,9 +1261,9 @@ typedef struct st_ha_create_information
   enum ha_choice page_checksum;         ///< If we have page_checksums
   engine_option_value *option_list;     ///< list of table create options
   /* the following three are only for ALTER TABLE, check_if_incompatible_data() */
-  void *option_struct;           ///< structure with parsed table options
-  void **fileds_option_struct;   ///< array of field option structures
-  void **indexes_option_struct;  ///< array of index option structures
+  ha_table_option_struct *option_struct;           ///< structure with parsed table options
+  ha_field_option_struct **fields_option_struct;   ///< array of field option structures
+  ha_index_option_struct **indexes_option_struct;  ///< array of index option structures
 } HA_CREATE_INFO;
 
 
@@ -1228,6 +1338,7 @@ typedef struct st_ha_check_opt
   st_ha_check_opt() {}                        /* Remove gcc warning */
   uint flags;       /* isam layer flags (e.g. for myisamchk) */
   uint sql_flags;   /* sql layer flags - for something myisamchk cannot do */
+  time_t start_time;   /* When check/repair starts */
   KEY_CACHE *key_cache; /* new key cache when changing key cache */
   void init();
 } HA_CHECK_OPT;
@@ -1242,6 +1353,23 @@ typedef void *range_seq_t;
 typedef struct st_range_seq_if
 {
   /*
+    Get key information
+ 
+    SYNOPSIS
+      get_key_info()
+        init_params  The seq_init_param parameter 
+        length       OUT length of the keys in this range sequence
+        map          OUT key_part_map of the keys in this range sequence
+
+    DESCRIPTION
+      This function is set only when using HA_MRR_FIXED_KEY mode. In that mode, 
+      all ranges are single-point equality ranges that use the same set of key
+      parts. This function allows the MRR implementation to get the length of
+      a key, and which keyparts it uses.
+  */
+  void (*get_key_info)(void *init_params, uint *length, key_part_map *map);
+
+  /*
     Initialize the traversal of range sequence
     
     SYNOPSIS
@@ -1265,10 +1393,10 @@ typedef struct st_range_seq_if
         range  OUT Information about the next range
     
     RETURN
-      0 - Ok, the range structure filled with info about the next range
-      1 - No more ranges
+      FALSE - Ok, the range structure filled with info about the next range
+      TRUE  - No more ranges
   */
-  uint (*next) (range_seq_t seq, KEY_MULTI_RANGE *range);
+  bool (*next) (range_seq_t seq, KEY_MULTI_RANGE *range);
 
   /*
     Check whether range_info orders to skip the next record
@@ -1285,7 +1413,7 @@ typedef struct st_range_seq_if
           out from the stream of records returned by multi_range_read_next()
       0 - The record shall be left in the stream
   */ 
-  bool (*skip_record) (range_seq_t seq, char *range_info, uchar *rowid);
+  bool (*skip_record) (range_seq_t seq, range_id_t range_info, uchar *rowid);
 
   /*
     Check if the record combination matches the index condition
@@ -1298,9 +1426,11 @@ typedef struct st_range_seq_if
       0 - The record combination satisfies the index condition
       1 - Otherwise
   */ 
-  bool (*skip_index_tuple) (range_seq_t seq, char *range_info);
+  bool (*skip_index_tuple) (range_seq_t seq, range_id_t range_info);
 } RANGE_SEQ_IF;
 
+typedef bool (*SKIP_INDEX_TUPLE_FUNC) (range_seq_t seq, range_id_t range_info);
+
 class COST_VECT
 { 
 public:
@@ -1346,10 +1476,14 @@ public:
   }
   void add_io(double add_io_cnt, double add_avg_cost)
   {
-    double io_count_sum= io_count + add_io_cnt;
-    avg_io_cost= (io_count * avg_io_cost + 
-                  add_io_cnt * add_avg_cost) / io_count_sum;
-    io_count= io_count_sum;
+    /* In edge cases add_io_cnt may be zero */
+    if (add_io_cnt > 0)
+    {
+      double io_count_sum= io_count + add_io_cnt;
+      avg_io_cost= (io_count * avg_io_cost + 
+                    add_io_cnt * add_avg_cost) / io_count_sum;
+      io_count= io_count_sum;
+    }
   }
 
   /*
@@ -1368,9 +1502,9 @@ void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted,
                          COST_VECT *cost);
 
 /*
-  The below two are not used (and not handled) in this milestone of this WL
-  entry because there seems to be no use for them at this stage of
-  implementation.
+  Indicates that all scanned ranges will be singlepoint (aka equality) ranges.
+  The ranges may not use the full key but all of them will use the same number
+  of key parts.
 */
 #define HA_MRR_SINGLE_POINT 1
 #define HA_MRR_FIXED_KEY  2
@@ -1412,7 +1546,42 @@ void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted,
 */
 #define HA_MRR_NO_NULL_ENDPOINTS 128
 
+/*
+  The MRR user has materialized range keys somewhere in the user's buffer.
+  This can be used for optimization of the procedure that sorts these keys
+  since in this case key values don't have to be copied into the MRR buffer.
+
+  In other words, it is guaranteed that after RANGE_SEQ_IF::next() call the 
+  pointer in range->start_key.key will point to a key value that will remain 
+  there until the end of the MRR scan.
+*/
+#define HA_MRR_MATERIALIZED_KEYS 256
+
+/*
+  The following bits are reserved for use by MRR implementation. The intended
+  use scenario:
+
+  * sql layer calls handler->multi_range_read_info[_const]() 
+    - MRR implementation figures out what kind of scan it will perform, saves
+      the result in *mrr_mode parameter.
+  * sql layer remembers what was returned in *mrr_mode
+
+  * the optimizer picks the query plan (which may or may not include the MRR 
+    scan that was estimated by the multi_range_read_info[_const] call)
+
+  * if the query is an EXPLAIN statement, sql layer will call 
+    handler->multi_range_read_explain_info(mrr_mode) to get a text description
+    of the picked MRR scan; the description will be a part of EXPLAIN output.
+*/
+#define HA_MRR_IMPLEMENTATION_FLAG1 512
+#define HA_MRR_IMPLEMENTATION_FLAG2 1024
+#define HA_MRR_IMPLEMENTATION_FLAG3 2048
+#define HA_MRR_IMPLEMENTATION_FLAG4 4096
+#define HA_MRR_IMPLEMENTATION_FLAG5 8192
+#define HA_MRR_IMPLEMENTATION_FLAG6 16384
 
+#define HA_MRR_IMPLEMENTATION_FLAGS \
+  (512 | 1024 | 2048 | 4096 | 8192 | 16384)
 
 /*
   This is a buffer area that the handler can use to store rows.
@@ -1542,6 +1711,7 @@ public:
   KEY_PART_INFO *range_key_part;
   int key_compare_result_on_equal;
   bool eq_range;
+  bool internal_tmp_table;                      /* If internal tmp table */
 
   uint errkey;				/* Last dup key */
   uint key_used_on_scan;
@@ -1583,6 +1753,7 @@ public:
   */
   /* Statistics  variables */
   ulonglong rows_read;
+  ulonglong rows_tmp_read;
   ulonglong rows_changed;
   /* One bigger than needed to avoid to test if key == MAX_KEY */
   ulonglong index_rows_read[MAX_KEY+1];
@@ -1642,23 +1813,27 @@ public:
   }
   /* ha_ methods: pubilc wrappers for private virtual API */
 
-  int ha_open(TABLE *table, const char *name, int mode, int test_if_locked);
+  int ha_open(TABLE *table, const char *name, int mode, uint test_if_locked);
   int ha_index_init(uint idx, bool sorted)
   {
     int result;
     DBUG_ENTER("ha_index_init");
     DBUG_ASSERT(inited==NONE);
     if (!(result= index_init(idx, sorted)))
-      inited=INDEX;
-    end_range= NULL;
+    {
+      inited=       INDEX;
+      active_index= idx;
+      end_range= NULL;
+    }
     DBUG_RETURN(result);
   }
   int ha_index_end()
   {
     DBUG_ENTER("ha_index_end");
     DBUG_ASSERT(inited==INDEX);
-    inited=NONE;
-    end_range= NULL;
+    inited=       NONE;
+    active_index= MAX_KEY;
+    end_range=    NULL;
     DBUG_RETURN(index_end());
   }
   /* This is called after index_init() if we need to do a index scan */
@@ -1765,7 +1940,7 @@ public:
   uint get_dup_key(int error);
   void reset_statistics()
   {
-    rows_read= rows_changed= 0;
+    rows_read= rows_changed= rows_tmp_read= 0;
     bzero(index_rows_read, sizeof(index_rows_read));
   }
   virtual void change_table_ptr(TABLE *table_arg, TABLE_SHARE *share)
@@ -1844,8 +2019,13 @@ public:
     as there may be several calls to this routine.
   */
   virtual void column_bitmaps_signal();
-  uint get_index(void) const { return active_index; }
-  virtual int close(void)=0;
+  /*
+    We have to check for inited as some engines, like innodb, set
+    active_index during table scan.
+  */
+  uint get_index(void) const
+  { return inited == INDEX ? active_index : MAX_KEY; }
+  int ha_close(void);
 
   /**
     @retval  0   Bulk update used by handler
@@ -1921,10 +2101,18 @@ protected:
   virtual int index_last(uchar * buf)
    { return  HA_ERR_WRONG_COMMAND; }
   virtual int index_next_same(uchar *buf, const uchar *key, uint keylen);
+  virtual int close(void)=0;
+  inline void update_rows_read()
+  {
+    if (likely(!internal_tmp_table))
+      rows_read++;
+    else
+      rows_tmp_read++;
+  }
   inline void update_index_statistics()
   {
     index_rows_read[active_index]++;
-    rows_read++;
+    update_rows_read();
   }
 public:
 
@@ -1940,16 +2128,47 @@ public:
   inline int ha_index_first(uchar * buf);
   inline int ha_index_last(uchar * buf);
   inline int ha_index_next_same(uchar *buf, const uchar *key, uint keylen);
+  /*
+    TODO: should we make non-virtual ha_func_name wrappers for those
+    functions, too?
+  */
   virtual ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
                                               void *seq_init_param, 
                                               uint n_ranges, uint *bufsz,
-                                              uint *flags, COST_VECT *cost);
+                                              uint *mrr_mode, COST_VECT *cost);
   virtual ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
-                                        uint *bufsz, uint *flags, COST_VECT *cost);
+                                        uint key_parts, uint *bufsz, 
+                                        uint *mrr_mode, COST_VECT *cost);
   virtual int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
-                                    uint n_ranges, uint mode,
+                                    uint n_ranges, uint mrr_mode, 
                                     HANDLER_BUFFER *buf);
-  virtual int multi_range_read_next(char **range_info);
+  virtual int multi_range_read_next(range_id_t *range_info);
+  /*
+    Return string representation of the MRR plan.
+
+    This is intended to be used for EXPLAIN, via the following scenario:
+    1. SQL layer calls handler->multi_range_read_info().
+    1.1. Storage engine figures out whether it will use some non-default
+         MRR strategy, sets appropriate bits in *mrr_mode, and returns
+         control to SQL layer
+    2. SQL layer remembers the returned mrr_mode
+    3. SQL layer compares various options and chooses the final query plan. As
+       a part of that, it makes a choice of whether to use the MRR strategy
+       picked in 1.1
+    4. EXPLAIN code converts the query plan to its text representation. If MRR
+       strategy is part of the plan, it calls
+       multi_range_read_explain_info(mrr_mode) to get a text representation of
+       the picked MRR strategy.
+
+    @param mrr_mode   Mode which was returned by multi_range_read_info[_const]
+    @param str        INOUT string to be printed for EXPLAIN
+    @param size       Size of the string buffer. The function is free to put the
+                      string into the [str, str+size) memory range.
+  */
+  virtual int multi_range_read_explain_info(uint mrr_mode, char *str, 
+                                            size_t size)
+  { return 0; }
+
   virtual int read_range_first(const key_range *start_key,
                                const key_range *end_key,
                                bool eq_range, bool sorted);
@@ -2090,6 +2309,7 @@ public:
   { return(NULL);}  /* gets tablespace name from handler */
   /** used in ALTER TABLE; 1 if changing storage engine is allowed */
   virtual bool can_switch_engines() { return 1; }
+  virtual int can_continue_handler_scan() { return 0; }
   /**
     Get the list of foreign keys in this table.
 
@@ -2204,7 +2424,6 @@ public:
   virtual uint max_supported_key_part_length() const { return 255; }
   virtual uint min_record_length(uint options) const { return 1; }
 
-  virtual bool low_byte_first() const { return 1; }
   virtual uint checksum() const { return 0; }
   virtual bool is_crashed() const  { return 0; }
   virtual bool auto_repair() const { return 0; }
@@ -2284,9 +2503,28 @@ public:
 
 
  /*
-   @retval TRUE   Primary key (if there is one) is clustered
-                  key covering all fields
-   @retval FALSE  otherwise
+   Check if the primary key (if there is one) is a clustered and a
+   reference key. This means:
+
+   - Data is stored together with the primary key (no secondary lookup
+     needed to find the row data). The optimizer uses this to find out
+     the cost of fetching data.
+   - The primary key is part of each secondary key and is used
+     to find the row data in the primary index when reading through
+     secondary indexes.
+   - When doing a HA_KEYREAD_ONLY we also get all the primary key parts
+     into the row. This is a critical property used by index_merge.
+
+   All the above is usually true for engines that store the row
+   data in the primary key index (e.g. in a b-tree), and use the primary
+   key value as a position().  InnoDB is an example of such an engine.
+
+   For such a clustered primary key, the following should also hold:
+   index_flags() should contain HA_CLUSTERED_INDEX
+   table_flags() should contain HA_TABLE_SCAN_ON_INDEX
+
+   @retval TRUE   yes
+   @retval FALSE  No.
  */
  virtual bool primary_key_is_clustered() { return FALSE; }
  virtual int cmp_ref(const uchar *ref1, const uchar *ref2)
@@ -2358,7 +2596,8 @@ public:
   */
 
   virtual bool check_if_supported_virtual_columns(void) { return FALSE;}
-
+  
+  TABLE* get_table() { return table; }
 protected:
   /* deprecated, don't use in new engines */
   inline void ha_statistic_increment(ulong SSV::*offset) const { }
@@ -2432,8 +2671,9 @@ private:
   */
 
   virtual int open(const char *name, int mode, uint test_if_locked)=0;
-  virtual int index_init(uint idx, bool sorted) { active_index= idx; return 0; }
-  virtual int index_end() { active_index= MAX_KEY; return 0; }
+  /* Note: ha_index_read_idx_map() may bypass index_init() */
+  virtual int index_init(uint idx, bool sorted) { return 0; }
+  virtual int index_end() { return 0; }
   /**
     rnd_init() can be called two times without rnd_end() in between
     (it only makes sense if scan=1).
@@ -2599,11 +2839,12 @@ private:
   virtual int rename_partitions(const char *path)
   { return HA_ERR_WRONG_COMMAND; }
   friend class ha_partition;
-  friend class DsMrr_impl;
 public:
   /* XXX to be removed, see ha_partition::partition_ht() */
   virtual handlerton *partition_ht() const
   { return ht; }
+  inline int ha_write_tmp_row(uchar *buf);
+  inline int ha_update_tmp_row(const uchar * old_data, uchar * new_data);
 };
 
 #include "multi_range_read.h"
@@ -2663,6 +2904,7 @@ int ha_panic(enum ha_panic_function flag);
 void ha_close_connection(THD* thd);
 bool ha_flush_logs(handlerton *db_type);
 void ha_drop_database(char* path);
+void ha_checkpoint_state(bool disable);
 int ha_create_table(THD *thd, const char *path,
                     const char *db, const char *table_name,
                     HA_CREATE_INFO *create_info,
diff --git a/sql/hostname.cc b/sql/hostname.cc
index d34df68587c..763c4647532 100644
--- a/sql/hostname.cc
+++ b/sql/hostname.cc
@@ -1,4 +1,5 @@
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2011, Oracle and/or its affiliates.
+   Copyright (c) 2011, Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
diff --git a/sql/item.cc b/sql/item.cc
index 48449f9033d..0d4b25f4440 100644
--- a/sql/item.cc
+++ b/sql/item.cc
@@ -47,6 +47,16 @@ const String my_null_string("NULL", 4, default_charset_info);
 static int save_field_in_field(Field *from, bool *null_value,
                                Field *to, bool no_conversions);
 
+
+/**
+  Compare two Items for List<Item>::add_unique()
+*/
+
+bool cmp_items(Item *a, Item *b)
+{
+  return a->eq(b, FALSE);
+}
+
 /****************************************************************************/
 
 /* Hybrid_type_traits {_real} */
@@ -213,10 +223,12 @@ bool Item::val_bool()
   case STRING_RESULT:
     return val_real() != 0.0;
   case ROW_RESULT:
-  default:
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);
     return 0;                                   // Wrong (but safe)
   }
+  return 0;                                   // Wrong (but safe)
 }
 
 
@@ -254,7 +266,7 @@ String *Item::val_string_from_real(String *str)
   double nr= val_real();
   if (null_value)
     return 0;					/* purecov: inspected */
-  str->set_real(nr,decimals, &my_charset_bin);
+  str->set_real(nr,decimals, &my_charset_numeric);
   return str;
 }
 
@@ -264,7 +276,7 @@ String *Item::val_string_from_int(String *str)
   longlong nr= val_int();
   if (null_value)
     return 0;
-  str->set_int(nr, unsigned_flag, &my_charset_bin);
+  str->set_int(nr, unsigned_flag, &my_charset_numeric);
   return str;
 }
 
@@ -280,6 +292,21 @@ String *Item::val_string_from_decimal(String *str)
 }
 
 
+String *Item::val_string_from_date(String *str)
+{
+  MYSQL_TIME ltime;
+  if (get_date(&ltime, TIME_FUZZY_DATE) ||
+      str->alloc(MAX_DATE_STRING_REP_LENGTH))
+  {
+    null_value= 1;
+    return (String *) 0;
+  }
+  str->length(my_TIME_to_str(&ltime, const_cast<char*>(str->ptr()), decimals));
+  str->set_charset(&my_charset_numeric);
+  return str;
+}
+
+
 my_decimal *Item::val_decimal_from_real(my_decimal *decimal_value)
 {
   double nr= val_real();
@@ -377,17 +404,20 @@ int Item::save_time_in_field(Field *field)
   if (get_time(&ltime))
     return set_field_to_null_with_conversions(field, 0);
   field->set_notnull();
-  return field->store_time(&ltime, MYSQL_TIMESTAMP_TIME);
+  return field->store_time_dec(&ltime, decimals);
 }
 
 
 int Item::save_date_in_field(Field *field)
 {
   MYSQL_TIME ltime;
-  if (get_date(&ltime, TIME_FUZZY_DATE))
+  if (get_date(&ltime, TIME_FUZZY_DATE |
+                       (current_thd->variables.sql_mode &
+                          (MODE_NO_ZERO_IN_DATE | MODE_NO_ZERO_DATE |
+                             MODE_INVALID_DATES))))
     return set_field_to_null_with_conversions(field, 0);
   field->set_notnull();
-  return field->store_time(&ltime, MYSQL_TIMESTAMP_DATETIME);
+  return field->store_time_dec(&ltime, decimals);
 }
 
 
@@ -428,10 +458,13 @@ Item::Item():
   collation(&my_charset_bin, DERIVATION_COERCIBLE)
 {
   marker= 0;
-  maybe_null=null_value=with_sum_func=unsigned_flag=0;
+  maybe_null=null_value=with_sum_func=with_field=unsigned_flag=0;
+  in_rollup= 0;
   decimals= 0; max_length= 0;
   with_subselect= 0;
-  cmp_context= (Item_result)-1;
+  cmp_context= IMPOSSIBLE_RESULT;
+   /* Initially this item is not attached to any JOIN_TAB. */
+  join_tab_idx= MAX_TABLES;
 
   /* Put item in free list so that we can free all items at end */
   THD *thd= current_thd;
@@ -460,6 +493,7 @@ Item::Item():
   tables.
 */
 Item::Item(THD *thd, Item *item):
+  join_tab_idx(item->join_tab_idx),
   is_expensive_cache(-1),
   rsize(0),
   str_value(item->str_value),
@@ -470,9 +504,11 @@ Item::Item(THD *thd, Item *item):
   marker(item->marker),
   decimals(item->decimals),
   maybe_null(item->maybe_null),
+  in_rollup(item->in_rollup),
   null_value(item->null_value),
   unsigned_flag(item->unsigned_flag),
   with_sum_func(item->with_sum_func),
+  with_field(item->with_field),
   fixed(item->fixed),
   is_autogenerated_name(item->is_autogenerated_name),
   with_subselect(item->with_subselect),
@@ -512,11 +548,40 @@ void Item::print_item_w_name(String *str, enum_query_type query_type)
 }
 
 
+void Item::print_value(String *str)
+{
+  char buff[MAX_FIELD_WIDTH];
+  String *ptr, tmp(buff,sizeof(buff),str->charset());
+  ptr= val_str(&tmp);
+  if (!ptr)
+    str->append("NULL");
+  else
+  {
+    switch (result_type()) {
+    case STRING_RESULT:
+      append_unescaped(str, ptr->ptr(), ptr->length());
+      break;
+    case DECIMAL_RESULT:
+    case REAL_RESULT:
+    case INT_RESULT:
+      str->append(*ptr);
+      break;
+    case ROW_RESULT:
+    case TIME_RESULT:
+    case IMPOSSIBLE_RESULT:
+      DBUG_ASSERT(0);
+    }
+  }
+}
+
+
 void Item::cleanup()
 {
   DBUG_ENTER("Item::cleanup");
+  DBUG_PRINT("enter", ("this: %p", this));
   fixed=0;
   marker= 0;
+  join_tab_idx= MAX_TABLES;
   if (orig_name)
     name= orig_name;
   DBUG_VOID_RETURN;
@@ -554,6 +619,45 @@ void Item::rename(char *new_name)
   name= new_name;
 }
 
+Item_result Item::cmp_type() const
+{
+  switch (field_type()) {
+  case MYSQL_TYPE_DECIMAL:
+  case MYSQL_TYPE_NEWDECIMAL:
+                           return DECIMAL_RESULT;
+  case MYSQL_TYPE_TINY:
+  case MYSQL_TYPE_SHORT:
+  case MYSQL_TYPE_LONG:
+  case MYSQL_TYPE_LONGLONG:
+  case MYSQL_TYPE_INT24:
+  case MYSQL_TYPE_YEAR:
+  case MYSQL_TYPE_BIT:
+                           return INT_RESULT;
+  case MYSQL_TYPE_FLOAT:
+  case MYSQL_TYPE_DOUBLE:
+                           return REAL_RESULT;
+  case MYSQL_TYPE_NULL:
+  case MYSQL_TYPE_VARCHAR:
+  case MYSQL_TYPE_TINY_BLOB:
+  case MYSQL_TYPE_MEDIUM_BLOB:
+  case MYSQL_TYPE_LONG_BLOB:
+  case MYSQL_TYPE_BLOB:
+  case MYSQL_TYPE_VAR_STRING:
+  case MYSQL_TYPE_STRING:
+  case MYSQL_TYPE_ENUM:
+  case MYSQL_TYPE_SET:
+  case MYSQL_TYPE_GEOMETRY:
+                           return STRING_RESULT;
+  case MYSQL_TYPE_TIMESTAMP:
+  case MYSQL_TYPE_DATE:
+  case MYSQL_TYPE_TIME:
+  case MYSQL_TYPE_DATETIME:
+  case MYSQL_TYPE_NEWDATE:
+                           return TIME_RESULT;
+  };
+  DBUG_ASSERT(0);
+  return IMPOSSIBLE_RESULT;
+}
 
 /**
   Traverse item tree possibly transforming it (replacing items).
@@ -607,14 +711,14 @@ Item* Item::transform(Item_transformer transformer, uchar *arg)
   A pointer to created wrapper item if successful, NULL - otherwise
 */
 
-Item* Item::set_expr_cache(THD *thd, List<Item *> &depends_on)
+Item* Item::set_expr_cache(THD *thd)
 {
   DBUG_ENTER("Item::set_expr_cache");
   Item_cache_wrapper *wrapper;
   if ((wrapper= new Item_cache_wrapper(this)) &&
       !wrapper->fix_fields(thd, (Item**)&wrapper))
   {
-    if (wrapper->set_cache(thd, depends_on))
+    if (wrapper->set_cache(thd))
       DBUG_RETURN(NULL);
     DBUG_RETURN(wrapper);
   }
@@ -691,13 +795,22 @@ void Item_ident::cleanup()
 bool Item_ident::remove_dependence_processor(uchar * arg)
 {
   DBUG_ENTER("Item_ident::remove_dependence_processor");
-  if (depended_from == (st_select_lex *) arg)
+  if (get_depended_from() == (st_select_lex *) arg)
     depended_from= 0;
   context= &((st_select_lex *) arg)->context;
   DBUG_RETURN(0);
 }
 
 
+bool Item_ident::collect_outer_ref_processor(uchar *param)
+{
+  Collect_deps_prm *prm= (Collect_deps_prm *)param;
+  if (depended_from && depended_from->nest_level < prm->nest_level)
+    prm->parameters->add_unique(this, &cmp_items);
+  return FALSE;
+}
+
+
 /**
   Store the pointer to this item field into a list if not already there.
 
@@ -806,6 +919,23 @@ bool Item_field::register_field_in_bitmap(uchar *arg)
   return 0;
 }
 
+
+/*
+  Mark field in write_map
+
+  NOTES
+    This is used by UPDATE to register underlying fields of used view fields.
+*/
+
+bool Item_field::register_field_in_write_map(uchar *arg)
+{
+  TABLE *table= (TABLE *) arg;
+  if (field->table == table || !table)
+    bitmap_set_bit(field->table->write_set, field->field_index);
+  return 0;
+}
+
+
 bool Item::check_cols(uint c)
 {
   if (c != 1)
@@ -828,7 +958,8 @@ void Item::set_name(const char *str, uint length, CHARSET_INFO *cs)
   }
   if (cs->ctype)
   {
-    uint orig_len= length;
+    const char *str_start= str;
+
     /*
       This will probably need a better implementation in the future:
       a function in CHARSET_INFO structure.
@@ -838,16 +969,20 @@ void Item::set_name(const char *str, uint length, CHARSET_INFO *cs)
       length--;
       str++;
     }
-    if (orig_len != length && !is_autogenerated_name)
+    if (str != str_start && !is_autogenerated_name)
     {
+      char buff[SAFE_NAME_LEN];
+      strmake(buff, str_start,
+              min(sizeof(buff)-1, length + (int) (str-str_start)));
+
       if (length == 0)
         push_warning_printf(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
                             ER_NAME_BECOMES_EMPTY, ER(ER_NAME_BECOMES_EMPTY),
-                            str + length - orig_len);
+                            buff);
       else
         push_warning_printf(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
                             ER_REMOVED_SPACES, ER(ER_REMOVED_SPACES),
-                            str + length - orig_len);
+                            buff);
     }
   }
   if (!my_charset_same(cs, system_charset_info))
@@ -1043,12 +1178,52 @@ bool Item_string::eq(const Item *item, bool binary_cmp) const
 
 /**
   Get the value of the function as a MYSQL_TIME structure.
-  As a extra convenience the time structure is reset on error!
+  As an extra convenience the time structure is reset on error or NULL values!
 */
 
 bool Item::get_date(MYSQL_TIME *ltime,uint fuzzydate)
 {
-  if (result_type() == STRING_RESULT)
+  if (field_type() == MYSQL_TYPE_TIME)
+    fuzzydate|= TIME_TIME_ONLY;
+
+  switch (result_type()) {
+  case INT_RESULT:
+  {
+    longlong value= val_int();
+    if (field_type() == MYSQL_TYPE_YEAR)
+    {
+      if (max_length == 2)
+      {
+        if (value < 70)
+          value+= 2000;
+        else if (value <= 1900)
+          value+= 1900;
+      }
+      value*= 10000; /* make it YYYYMMDD */
+    }
+    if (null_value || int_to_datetime_with_warn(value, ltime, fuzzydate,
+                                                field_name_or_null()))
+      goto err;
+    break;
+  }
+  case REAL_RESULT:
+  {
+    double value= val_real();
+    if (null_value || double_to_datetime_with_warn(value, ltime, fuzzydate,
+                                                   field_name_or_null()))
+      goto err;
+    break;
+  }
+  case DECIMAL_RESULT:
+  {
+    my_decimal value, *res;
+    if (!(res= val_decimal(&value)) ||
+        decimal_to_datetime_with_warn(res, ltime, fuzzydate,
+                                      field_name_or_null()))
+      goto err;
+    break;
+  }
+  case STRING_RESULT:
   {
     char buff[40];
     String tmp(buff,sizeof(buff), &my_charset_bin),*res;
@@ -1056,25 +1231,12 @@ bool Item::get_date(MYSQL_TIME *ltime,uint fuzzydate)
         str_to_datetime_with_warn(res->charset(), res->ptr(), res->length(),
                                   ltime, fuzzydate) <= MYSQL_TIMESTAMP_ERROR)
       goto err;
+    break;
   }
-  else
-  {
-    int was_cut;
-    longlong value= val_int();
-
-    if (null_value)
-      goto err;
-
-    if (number_to_datetime(value, ltime, fuzzydate, &was_cut) == LL(-1))
-    {
-      char buff[22], *end;
-      end= longlong10_to_str(value, buff, -10);
-      make_truncated_value_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-                                   buff, (int) (end-buff), MYSQL_TIMESTAMP_NONE,
-                                   NullS);
-      goto err;
-    }
+  default:
+    DBUG_ASSERT(0);
   }
+
   return 0;
 
 err:
@@ -1082,23 +1244,20 @@ err:
   return 1;
 }
 
-/**
-  Get time of first argument.\
-
-  As a extra convenience the time structure is reset on error!
-*/
-
-bool Item::get_time(MYSQL_TIME *ltime)
+bool Item::get_seconds(ulonglong *sec, ulong *sec_part)
 {
-  char buff[40];
-  String tmp(buff,sizeof(buff),&my_charset_bin),*res;
-  if (!(res=val_str_ascii(&tmp)) ||
-      str_to_time_with_warn(res->charset(), res->ptr(), res->length(), ltime))
-  {
-    bzero((char*) ltime,sizeof(*ltime));
-    return 1;
+  if (result_type() == INT_RESULT)
+  { // optimize for an important special case
+    longlong val= val_int();
+    bool neg= val < 0 && !unsigned_flag;
+    *sec= neg ? -val : val;
+    *sec_part= 0;
+    return neg;
   }
-  return 0;
+  my_decimal tmp, *dec= val_decimal(&tmp);
+  if (!dec)
+    return 0;
+  return my_decimal2seconds(dec, sec, sec_part);
 }
 
 CHARSET_INFO *Item::default_charset()
@@ -1124,6 +1283,7 @@ int Item::save_in_field_no_warnings(Field *field, bool no_conversions)
   my_bitmap_map *old_map= dbug_tmp_use_all_columns(table, table->write_set);
   ulonglong sql_mode= thd->variables.sql_mode;
   thd->variables.sql_mode&= ~(MODE_NO_ZERO_IN_DATE | MODE_NO_ZERO_DATE);
+  thd->variables.sql_mode|= MODE_INVALID_DATES;
   thd->count_cuted_fields= CHECK_FIELD_IGNORE;
 
   res= save_in_field(field, no_conversions);
@@ -1566,6 +1726,11 @@ void Item::split_sum_func2(THD *thd, Item **ref_pointer_array,
     */
     Item_aggregate_ref *item_ref;
     uint el= fields.elements;
+    /*
+      If this is an item_ref, get the original item.
+      This is a safety measure in case this is called for something that is
+      already a reference.
+    */
     Item *real_itm= real_item();
 
     ref_pointer_array[el]= real_itm;
@@ -1995,6 +2160,7 @@ Item_field::Item_field(Field *f)
     if this item is to be reused
   */
   orig_table_name= orig_field_name= "";
+  with_field= 1;
 }
 
 
@@ -2043,6 +2209,7 @@ Item_field::Item_field(THD *thd, Name_resolution_context *context_arg,
     name= (char*) orig_field_name;
   }
   set_field(f);
+  with_field= 1;
 }
 
 
@@ -2057,6 +2224,7 @@ Item_field::Item_field(Name_resolution_context *context_arg,
   collation.set(DERIVATION_IMPLICIT);
   if (select && select->parsing_place != IN_HAVING)
       select->select_n_where_fields++;
+  with_field= 1;
 }
 
 /**
@@ -2073,6 +2241,7 @@ Item_field::Item_field(THD *thd, Item_field *item)
    any_privileges(item->any_privileges)
 {
   collation.set(DERIVATION_IMPLICIT);
+  with_field= 1;
 }
 
 
@@ -2310,24 +2479,14 @@ bool Item_field::get_date(MYSQL_TIME *ltime,uint fuzzydate)
 
 bool Item_field::get_date_result(MYSQL_TIME *ltime,uint fuzzydate)
 {
-  if ((null_value=result_field->is_null()) ||
-      result_field->get_date(ltime,fuzzydate))
+  if (result_field->is_null() || result_field->get_date(ltime,fuzzydate))
   {
     bzero((char*) ltime,sizeof(*ltime));
-    return 1;
+    return (null_value= 1);
   }
-  return 0;
+  return (null_value= 0);
 }
 
-bool Item_field::get_time(MYSQL_TIME *ltime)
-{
-  if ((null_value=field->is_null()) || field->get_time(ltime))
-  {
-    bzero((char*) ltime,sizeof(*ltime));
-    return 1;
-  }
-  return 0;
-}
 
 void Item_field::save_result(Field *to)
 {
@@ -2377,10 +2536,12 @@ bool Item_field::val_bool_result()
   case STRING_RESULT:
     return result_field->val_real() != 0.0;
   case ROW_RESULT:
-  default:
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);
     return 0;                                   // Shut up compiler
   }
+  return 0;
 }
 
 
@@ -2424,13 +2585,17 @@ table_map Item_field::used_tables() const
 {
   if (field->table->const_table)
     return 0;					// const item
-  return (depended_from ? OUTER_REF_TABLE_BIT : field->table->map);
+  return (get_depended_from() ? OUTER_REF_TABLE_BIT : field->table->map);
 }
 
+table_map Item_field::all_used_tables() const
+{
+  return (get_depended_from() ? OUTER_REF_TABLE_BIT : field->table->map);
+}
 
 void Item_field::fix_after_pullout(st_select_lex *new_parent, Item **ref)
 {
-  if (new_parent == depended_from)
+  if (new_parent == get_depended_from())
     depended_from= NULL;
   Name_resolution_context *ctx= new Name_resolution_context();
   ctx->outer_context= NULL; // We don't build a complete name resolver
@@ -2742,19 +2907,20 @@ void Item_string::print(String *str, enum_query_type query_type)
 
 
 double 
-double_from_string_with_check (CHARSET_INFO *cs, const char *cptr, char *end)
+double_from_string_with_check(CHARSET_INFO *cs, const char *cptr,
+                              const char *end)
 {
   int error;
-  char *org_end;
+  char *end_of_num= (char*) end;
   double tmp;
 
-  org_end= end;
-  tmp= my_strntod(cs, (char*) cptr, end - cptr, &end, &error);
-  if (error || (end != org_end && !check_if_only_end_space(cs, end, org_end)))
+  tmp= my_strntod(cs, (char*) cptr, end - cptr, &end_of_num, &error);
+  if (error || (end != end_of_num &&
+                !check_if_only_end_space(cs, end_of_num, end)))
   {
-    ErrConvString err(cptr, cs);
+    ErrConvString err(cptr, end - cptr, cs);
     /*
-      We can use str_value.ptr() here as Item_string is gurantee to put an
+      We can use err.ptr() here as ErrConvString is guaranteed to put an
       end \0 here.
     */
     push_warning_printf(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
@@ -2769,28 +2935,31 @@ double_from_string_with_check (CHARSET_INFO *cs, const char *cptr, char *end)
 double Item_string::val_real()
 {
   DBUG_ASSERT(fixed == 1);
-  return double_from_string_with_check (str_value.charset(), str_value.ptr(), 
-                                        (char *) str_value.ptr() + str_value.length());
+  return double_from_string_with_check(str_value.charset(),
+                                       str_value.ptr(), 
+                                       str_value.ptr() +
+                                       str_value.length());
 }
 
 
 longlong 
-longlong_from_string_with_check (CHARSET_INFO *cs, const char *cptr, char *end)
+longlong_from_string_with_check(CHARSET_INFO *cs, const char *cptr,
+                                const char *end)
 {
   int err;
   longlong tmp;
-  char *org_end= end;
+  char *end_of_num= (char*) end;
 
-  tmp= (*(cs->cset->strtoll10))(cs, cptr, &end, &err);
+  tmp= (*(cs->cset->strtoll10))(cs, cptr, &end_of_num, &err);
   /*
     TODO: Give error if we wanted a signed integer and we got an unsigned
     one
   */
   if (!current_thd->no_errors &&
       (err > 0 ||
-       (end != org_end && !check_if_only_end_space(cs, end, org_end))))
+       (end != end_of_num && !check_if_only_end_space(cs, end_of_num, end))))
   {
-    ErrConvString err(cptr, cs);
+    ErrConvString err(cptr, end - cptr, cs);
     push_warning_printf(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
                         ER_TRUNCATED_WRONG_VALUE,
                         ER(ER_TRUNCATED_WRONG_VALUE), "INTEGER",
@@ -2808,7 +2977,7 @@ longlong Item_string::val_int()
 {
   DBUG_ASSERT(fixed == 1);
   return longlong_from_string_with_check(str_value.charset(), str_value.ptr(),
-                             (char *) str_value.ptr()+ str_value.length());
+                                         str_value.ptr()+ str_value.length());
 }
 
 
@@ -3000,19 +3169,19 @@ void Item_param::set_time(MYSQL_TIME *tm, timestamp_type time_type,
   if (value.time.year > 9999 || value.time.month > 12 ||
       value.time.day > 31 ||
       (time_type != MYSQL_TIMESTAMP_TIME && value.time.hour > 23) ||
-      value.time.minute > 59 || value.time.second > 59)
+      value.time.minute > 59 || value.time.second > 59 ||
+      value.time.second_part > TIME_MAX_SECOND_PART)
   {
-    char buff[MAX_DATE_STRING_REP_LENGTH];
-    uint length= my_TIME_to_str(&value.time, buff);
+    ErrConvTime str(&value.time);
     make_truncated_value_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-                                 buff, length, time_type, 0);
+                                 &str, time_type, 0);
     set_zero_time(&value.time, MYSQL_TIMESTAMP_ERROR);
   }
 
   state= TIME_VALUE;
   maybe_null= 0;
   max_length= max_length_arg;
-  decimals= 0;
+  decimals= tm->second_part > 0 ? TIME_SECOND_PART_DIGITS : 0;
   DBUG_VOID_RETURN;
 }
 
@@ -3099,10 +3268,12 @@ bool Item_param::set_from_user_var(THD *thd, const user_var_entry *entry)
     case REAL_RESULT:
       set_double(*(double*)entry->value);
       item_type= Item::REAL_ITEM;
+      param_type= MYSQL_TYPE_DOUBLE;
       break;
     case INT_RESULT:
       set_int(*(longlong*)entry->value, MY_INT64_NUM_DECIMAL_DIGITS);
       item_type= Item::INT_ITEM;
+      param_type= MYSQL_TYPE_LONGLONG;
       break;
     case STRING_RESULT:
     {
@@ -3125,6 +3296,7 @@ bool Item_param::set_from_user_var(THD *thd, const user_var_entry *entry)
         charset of connection, so we have to set it later.
       */
       item_type= Item::STRING_ITEM;
+      param_type= MYSQL_TYPE_VARCHAR;
 
       if (set_str((const char *)entry->value, entry->length))
         DBUG_RETURN(1);
@@ -3140,9 +3312,12 @@ bool Item_param::set_from_user_var(THD *thd, const user_var_entry *entry)
         my_decimal_precision_to_length_no_truncation(ent_value->precision(),
                                                      decimals, unsigned_flag);
       item_type= Item::DECIMAL_ITEM;
+      param_type= MYSQL_TYPE_NEWDECIMAL;
       break;
     }
-    default:
+    case ROW_RESULT:
+    case TIME_RESULT:
+    case IMPOSSIBLE_RESULT:
       DBUG_ASSERT(0);
       set_null();
     }
@@ -3204,7 +3379,7 @@ int Item_param::save_in_field(Field *field, bool no_conversions)
   case DECIMAL_VALUE:
     return field->store_decimal(&decimal_value);
   case TIME_VALUE:
-    field->store_time(&value.time, value.time.time_type);
+    field->store_time_dec(&value.time, decimals);
     return 0;
   case STRING_VALUE:
   case LONG_DATA_VALUE:
@@ -3220,21 +3395,6 @@ int Item_param::save_in_field(Field *field, bool no_conversions)
 }
 
 
-bool Item_param::get_time(MYSQL_TIME *res)
-{
-  if (state == TIME_VALUE)
-  {
-    *res= value.time;
-    return 0;
-  }
-  /*
-    If parameter value isn't supplied assertion will fire in val_str()
-    which is called from Item::get_time().
-  */
-  return Item::get_time(res);
-}
-
-
 bool Item_param::get_date(MYSQL_TIME *res, uint fuzzydate)
 {
   if (state == TIME_VALUE)
@@ -3364,7 +3524,8 @@ String *Item_param::val_str(String* str)
   {
     if (str->reserve(MAX_DATE_STRING_REP_LENGTH))
       break;
-    str->length((uint) my_TIME_to_str(&value.time, (char*) str->ptr()));
+    str->length((uint) my_TIME_to_str(&value.time, (char*) str->ptr(),
+                decimals));
     str->set_charset(&my_charset_bin);
     return str;
   }
@@ -3416,7 +3577,7 @@ const String *Item_param::query_val_str(String* str) const
       buf= str->c_ptr_quick();
       ptr= buf;
       *ptr++= '\'';
-      ptr+= (uint) my_TIME_to_str(&value.time, ptr);
+      ptr+= (uint) my_TIME_to_str(&value.time, ptr, decimals);
       *ptr++= '\'';
       str->length((uint32) (ptr - buf));
       break;
@@ -3759,6 +3920,7 @@ void Item_param::make_field(Send_field *field)
 /****************************************************************************
   Item_copy
 ****************************************************************************/
+
 Item_copy *Item_copy::create (Item *item)
 {
   switch (item->result_type())
@@ -3772,7 +3934,9 @@ Item_copy *Item_copy::create (Item *item)
         new Item_copy_uint (item) : new Item_copy_int (item);
     case DECIMAL_RESULT:
       return new Item_copy_decimal (item);
-    default:
+    case TIME_RESULT:
+    case ROW_RESULT:
+  case IMPOSSIBLE_RESULT:
       DBUG_ASSERT (0);
   }
   /* should not happen */
@@ -3845,8 +4009,7 @@ void Item_copy_int::copy()
   null_value=item->null_value;
 }
 
-static int save_int_value_in_field (Field *field, longlong nr, 
-                                    bool null_value, bool unsigned_flag);
+static int save_int_value_in_field (Field *, longlong, bool, bool);
 
 int Item_copy_int::save_in_field(Field *field, bool no_conversions)
 {
@@ -4379,6 +4542,34 @@ resolve_ref_in_select_and_group(THD *thd, Item_ident *ref, SELECT_LEX *select)
 }
 
 
+/**
+  @brief
+  Whether a table belongs to an outer select.
+
+  @param table table to check
+  @param select current select
+
+  @details
+  Find the select the table belongs to by ascending the derived tables chain.
+*/
+
+static
+bool is_outer_table(TABLE_LIST *table, SELECT_LEX *select)
+{
+  DBUG_ASSERT(table->select_lex != select);
+  TABLE_LIST *tl;
+
+  for (tl= select->master_unit()->derived;
+       tl && tl->is_merged_derived();
+       select= tl->select_lex, tl= select->master_unit()->derived)
+  {
+    if (tl->select_lex == table->select_lex)
+      return FALSE;
+  }
+  return TRUE;
+}
+
+
 /**
   Resolve the name of an outer select column reference.
 
@@ -4547,9 +4738,6 @@ Item_field::fix_outer_field(THD *thd, Field **from_field, Item **reference)
                             ((ref_type == REF_ITEM || ref_type == FIELD_ITEM) ?
                              (Item_ident*) (*reference) :
                              0));
-          context->select_lex->
-              register_dependency_item(last_checked_context->select_lex,
-                                       reference);
           /*
             A reference to a view field had been found and we
             substituted it instead of this Item (find_field_in_tables
@@ -4650,9 +4838,6 @@ Item_field::fix_outer_field(THD *thd, Field **from_field, Item **reference)
     mark_as_dependent(thd, last_checked_context->select_lex,
                       context->select_lex, rf,
                       rf);
-    context->select_lex->
-              register_dependency_item(last_checked_context->select_lex,
-                                       reference);
 
     return 0;
   }
@@ -4661,9 +4846,6 @@ Item_field::fix_outer_field(THD *thd, Field **from_field, Item **reference)
     mark_as_dependent(thd, last_checked_context->select_lex,
                       context->select_lex,
                       this, (Item_ident*)*reference);
-    context->select_lex->
-              register_dependency_item(last_checked_context->select_lex,
-                                       reference);
     if (last_checked_context->select_lex->having_fix_field)
     {
       Item_ref *rf;
@@ -4827,7 +5009,8 @@ bool Item_field::fix_fields(THD *thd, Item **reference)
 
     if (!outer_fixed && cached_table && cached_table->select_lex &&
         context->select_lex &&
-        cached_table->select_lex != context->select_lex)
+        cached_table->select_lex != context->select_lex &&
+        is_outer_table(cached_table, context->select_lex))
     {
       int ret;
       if ((ret= fix_outer_field(thd, &from_field, reference)) < 0)
@@ -5022,13 +5205,14 @@ Item_equal *Item_field::find_item_equal(COND_EQUAL *cond_equal)
 
 
 /**
-  Check whether a field can be substituted by an equal item.
+  Check whether a field item can be substituted for an equal item
 
-  The function checks whether a substitution of the field
-  occurrence for an equal item is valid.
+  @details
+  The function checks whether a substitution of a field item for
+  an equal item is valid.
 
-  @param arg   *arg != NULL <-> the field is in the context where
-               substitution for an equal item is valid
+  @param arg   *arg != NULL <-> the field is in the context
+               where substitution for an equal item is valid
 
   @note
     The following statement is not always true:
@@ -5053,7 +5237,10 @@ Item_equal *Item_field::find_item_equal(COND_EQUAL *cond_equal)
 
 bool Item_field::subst_argument_checker(uchar **arg)
 {
-  return (result_type() != STRING_RESULT) || (*arg);
+  return *arg &&
+         (*arg == (uchar *) Item::ANY_SUBST ||
+          result_type() != STRING_RESULT || 
+          (field->flags & BINARY_FLAG));
 }
 
 
@@ -5131,12 +5318,7 @@ Item *Item_field::equal_fields_propagator(uchar *arg)
     item= this;
   else if (field && (field->flags & ZEROFILL_FLAG) && IS_NUM(field->type()))
   {
-    /*
-      We don't need to zero-fill timestamp columns here because they will be 
-      first converted to a string (in date/time format) and compared as such if
-      compared with another string.
-    */
-    if (item && field->type() != FIELD_TYPE_TIMESTAMP && cmp_context != INT_RESULT)
+    if (item && (cmp_context == STRING_RESULT || cmp_context == IMPOSSIBLE_RESULT))
       convert_zerofill_number_to_string(&item, (Field_num *)field);
     else
       item= this;
@@ -5163,7 +5345,8 @@ bool Item_field::set_no_const_sub(uchar *arg)
   Replace an Item_field for an equal Item_field that evaluated earlier
   (if any).
 
-  The function returns a pointer to an item that is taken from
+  If this->item_equal points to some item and coincides with arg then
+  the function returns a pointer to an item that is taken from
   the very beginning of the item_equal list which the Item_field
   object refers to (belongs to) unless item_equal contains  a constant
   item. In this case the function returns this constant item, 
@@ -5171,12 +5354,12 @@ bool Item_field::set_no_const_sub(uchar *arg)
   If the Item_field object does not refer any Item_equal object
   'this' is returned .
 
-  @param arg   a dummy parameter, is not used here
+  @param arg   NULL or points to some item of the Item_equal type
 
 
   @note
     This function is supposed to be called as a callback parameter in calls
-    of the thransformer method.
+    of the transformer method.
 
   @return
     - pointer to a replacement Item_field if there is a better equal item or
@@ -5186,7 +5369,7 @@ bool Item_field::set_no_const_sub(uchar *arg)
 
 Item *Item_field::replace_equal_field(uchar *arg)
 {
-  if (item_equal)
+  if (item_equal && item_equal == (Item_equal *) arg)
   {
     Item *const_item= item_equal->get_const();
     if (const_item)
@@ -5195,8 +5378,10 @@ Item *Item_field::replace_equal_field(uchar *arg)
         return this;
       return const_item;
     }
-    Item_field *subst= item_equal->get_first(this);
-    if (subst && field->table != subst->field->table && !field->eq(subst->field))
+    Item_field *subst= (Item_field *)(item_equal->get_first(this));
+    if (subst)
+      subst= (Item_field *) (subst->real_item());
+    if (subst && !field->eq(subst->field))
       return subst;
   }
   return this;
@@ -5254,10 +5439,12 @@ enum_field_types Item::field_type() const
   case DECIMAL_RESULT: return MYSQL_TYPE_NEWDECIMAL;
   case REAL_RESULT:    return MYSQL_TYPE_DOUBLE;
   case ROW_RESULT:
-  default:
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);
     return MYSQL_TYPE_VARCHAR;
   }
+  return MYSQL_TYPE_VARCHAR;
 }
 
 
@@ -5434,16 +5621,19 @@ Field *Item::tmp_table_field_from_field_type(TABLE *table, bool fixed_length)
     break;
   case MYSQL_TYPE_NEWDATE:
   case MYSQL_TYPE_DATE:
-    field= new Field_newdate(maybe_null, name, &my_charset_bin);
+    field= new Field_newdate(0, null_ptr, 0, Field::NONE, name, &my_charset_bin);
     break;
   case MYSQL_TYPE_TIME:
-    field= new Field_time(maybe_null, name, &my_charset_bin);
+    field= new_Field_time(0, null_ptr, 0, Field::NONE, name,
+                              decimals, &my_charset_bin);
     break;
   case MYSQL_TYPE_TIMESTAMP:
-    field= new Field_timestamp(maybe_null, name, &my_charset_bin);
+    field= new_Field_timestamp(0, null_ptr, 0,
+                               Field::NONE, name, 0, decimals, &my_charset_bin);
     break;
   case MYSQL_TYPE_DATETIME:
-    field= new Field_datetime(maybe_null, name, &my_charset_bin);
+    field= new_Field_datetime(0, null_ptr, 0, Field::NONE, name,
+                              decimals, &my_charset_bin);
     break;
   case MYSQL_TYPE_YEAR:
     field= new Field_year((uchar*) 0, max_length, null_ptr, 0, Field::NONE,
@@ -5615,7 +5805,7 @@ int Item_null::save_safe_in_field(Field *field)
   Item uses str_value to store something, it should
   reimplement it's ::save_in_field() as Item_string, for example, does.
 
-  Note: all Item_XXX::val_str(str) methods must NOT rely on the fact that
+  Note: all Item_XXX::val_str(str) methods must NOT assume that
   str != str_value. For example, see fix for bug #44743.
 */
 
@@ -5641,15 +5831,6 @@ int Item::save_in_field(Field *field, bool no_conversions)
     error=field->store(result->ptr(),result->length(),cs);
     str_value.set_quick(0, 0, cs);
   }
-  else if (result_type() == REAL_RESULT &&
-           field->result_type() == STRING_RESULT)
-  {
-    double nr= val_real();
-    if (null_value)
-      return set_field_to_null_with_conversions(field, no_conversions);
-    field->set_notnull();
-    error= field->store(nr);
-  }
   else if (result_type() == REAL_RESULT)
   {
     double nr= val_real();
@@ -5687,12 +5868,6 @@ int Item_string::save_in_field(Field *field, bool no_conversions)
 }
 
 
-int Item_uint::save_in_field(Field *field, bool no_conversions)
-{
-  /* Item_int::save_in_field handles both signed and unsigned. */
-  return Item_int::save_in_field(field, no_conversions);
-}
-
 static int save_int_value_in_field (Field *field, longlong nr, 
                                     bool null_value, bool unsigned_flag)
 {
@@ -5709,6 +5884,22 @@ int Item_int::save_in_field(Field *field, bool no_conversions)
 }
 
 
+void Item_datetime::set(longlong packed)
+{
+  unpack_time(packed, &ltime);
+}
+
+int Item_datetime::save_in_field(Field *field, bool no_conversions)
+{
+  field->set_notnull();
+  return field->store_time_dec(&ltime, decimals);
+}
+
+longlong Item_datetime::val_int()
+{
+  return TIME_to_ulonglong(&ltime);
+}
+
 int Item_decimal::save_in_field(Field *field, bool no_conversions)
 {
   field->set_notnull();
@@ -5726,7 +5917,9 @@ bool Item_int::eq(const Item *arg, bool binary_cmp) const
       a basic constant.
     */
     Item *item= (Item*) arg;
-    return item->val_int() == value && item->unsigned_flag == unsigned_flag;
+    return (item->val_int() == value &&
+            ((longlong) value >= 0 ||
+             (item->unsigned_flag == unsigned_flag)));
   }
   return FALSE;
 }
@@ -6087,7 +6280,10 @@ bool Item::send(Protocol *protocol, String *buffer)
   {
     String *res;
     if ((res=val_str(buffer)))
+    {
+      DBUG_ASSERT(!null_value);
       result= protocol->store(res->ptr(),res->length(),res->charset());
+    }
     else
     {
       DBUG_ASSERT(null_value);
@@ -6154,7 +6350,7 @@ bool Item::send(Protocol *protocol, String *buffer)
       if (f_type == MYSQL_TYPE_DATE)
 	return protocol->store_date(&tm);
       else
-	result= protocol->store(&tm);
+	result= protocol->store(&tm, decimals);
     }
     break;
   }
@@ -6163,7 +6359,7 @@ bool Item::send(Protocol *protocol, String *buffer)
     MYSQL_TIME tm;
     get_time(&tm);
     if (!null_value)
-      result= protocol->store_time(&tm);
+      result= protocol->store_time(&tm, decimals);
     break;
   }
   }
@@ -6306,17 +6502,7 @@ void Item_field::print(String *str, enum_query_type query_type)
 {
   if (field && field->table->const_table)
   {
-    char buff[MAX_FIELD_WIDTH];
-    String tmp(buff,sizeof(buff),str->charset());
-    field->val_str(&tmp);
-    if (field->is_null())
-      str->append("NULL");
-    else
-    {
-      str->append('\'');
-      str->append(tmp);
-      str->append('\'');
-    }
+    print_value(str);
     return;
   }
   Item_ident::print(str, query_type);
@@ -6328,7 +6514,7 @@ Item_ref::Item_ref(Name_resolution_context *context_arg,
                    const char *field_name_arg,
                    bool alias_name_used_arg)
   :Item_ident(context_arg, NullS, table_name_arg, field_name_arg),
-   result_field(0), ref(item)
+   result_field(0), ref(item), reference_trough_name(0)
 {
   alias_name_used= alias_name_used_arg;
   /*
@@ -6354,8 +6540,9 @@ public:
     st_select_lex *sel;
     for (sel= current_select; sel; sel= sel->outer_select())
     {
+      List_iterator<TABLE_LIST> li(sel->leaf_tables);
       TABLE_LIST *tbl;
-      for (tbl= sel->leaf_tables; tbl; tbl= tbl->next_leaf)
+      while ((tbl= li++))
       {
         if (tbl->table == item->field->table)
         {
@@ -6371,7 +6558,7 @@ public:
 Item_ref::Item_ref(TABLE_LIST *view_arg, Item **item,
                    const char *field_name_arg, bool alias_name_used_arg)
   :Item_ident(view_arg, field_name_arg),
-   result_field(NULL), ref(item)
+   result_field(NULL), ref(item), reference_trough_name(0)
 {
   alias_name_used= alias_name_used_arg;
   /*
@@ -6454,6 +6641,7 @@ bool Item_ref::fix_fields(THD *thd, Item **reference)
 
   if (!ref || ref == not_found_item)
   {
+    DBUG_ASSERT(reference_trough_name != 0);
     if (!(ref= resolve_ref_in_select_and_group(thd, this,
                                                context->select_lex)))
       goto error;             /* Some error occurred (e.g. ambiguous names). */
@@ -6555,9 +6743,6 @@ bool Item_ref::fix_fields(THD *thd, Item **reference)
                                 refer_type == FIELD_ITEM) ?
                                (Item_ident*) (*reference) :
                                0));
-           context->select_lex->
-              register_dependency_item(last_checked_context->select_lex,
-                                       reference);
             /*
               view reference found, we substituted it instead of this
               Item, so can quit
@@ -6608,9 +6793,6 @@ bool Item_ref::fix_fields(THD *thd, Item **reference)
         thd->change_item_tree(reference, fld);
         mark_as_dependent(thd, last_checked_context->select_lex,
                           thd->lex->current_select, fld, fld);
-        context->select_lex->
-              register_dependency_item(last_checked_context->select_lex,
-                                       reference);
         /*
           A reference is resolved to a nest level that's outer or the same as
           the nest level of the enclosing set function : adjust the value of
@@ -6634,9 +6816,6 @@ bool Item_ref::fix_fields(THD *thd, Item **reference)
       DBUG_ASSERT(*ref && (*ref)->fixed);
       mark_as_dependent(thd, last_checked_context->select_lex,
                         context->select_lex, this, this);
-      context->select_lex->
-              register_dependency_item(last_checked_context->select_lex,
-                                       reference);
       /*
         A reference is resolved to a nest level that's outer or the same as
         the nest level of the enclosing set function : adjust the value of
@@ -6649,13 +6828,8 @@ bool Item_ref::fix_fields(THD *thd, Item **reference)
                       last_checked_context->select_lex->nest_level);
     }
   }
-  else
+  else if (ref_type() != VIEW_REF)
   {
-    if (depended_from && reference)
-    {
-      DBUG_ASSERT(context->select_lex != depended_from);
-      context->select_lex->register_dependency_item(depended_from, reference);
-    }
     /*
       It could be that we're referring to something that's in ancestor selects.
       We must make an appropriate mark_as_dependent() call for each such
@@ -6712,6 +6886,7 @@ void Item_ref::set_properties()
     split_sum_func() doesn't try to change the reference.
   */
   with_sum_func= (*ref)->with_sum_func;
+  with_field= (*ref)->with_field;
   unsigned_flag= (*ref)->unsigned_flag;
   fixed= 1;
   if (alias_name_used)
@@ -6728,10 +6903,100 @@ void Item_ref::cleanup()
   DBUG_ENTER("Item_ref::cleanup");
   Item_ident::cleanup();
   result_field= 0;
+  if (reference_trough_name)
+  {
+    /* We have to reset the reference as it may have been freed */
+    ref= 0;
+  }
   DBUG_VOID_RETURN;
 }
 
 
+/**
+  Transform an Item_ref object with a transformer callback function.
+
+  The function first applies the transform method to the item
+  referenced by this Item_ref object. If this returns a new item the
+  old item is substituted for a new one. After this the transformer
+  is applied to the Item_ref object.
+
+  @param transformer   the transformer callback function to be applied to
+                       the nodes of the tree of the object
+  @param argument      parameter to be passed to the transformer
+
+  @return Item returned as the result of transformation of the Item_ref object
+    @retval !NULL The transformation was successful
+    @retval NULL  Out of memory error
+*/
+
+Item* Item_ref::transform(Item_transformer transformer, uchar *arg)
+{
+  DBUG_ASSERT(!current_thd->stmt_arena->is_stmt_prepare());
+  DBUG_ASSERT((*ref) != NULL);
+
+  /* Transform the object we are referencing. */
+  Item *new_item= (*ref)->transform(transformer, arg);
+  if (!new_item)
+    return NULL;
+
+  /*
+    THD::change_item_tree() should be called only if the tree was
+    really transformed, i.e. when a new item has been created.
+    Otherwise we'll be allocating a lot of unnecessary memory for
+    change records at each execution.
+  */
+  if (*ref != new_item)
+    current_thd->change_item_tree(ref, new_item);
+
+  /* Transform the item ref object. */
+  return (this->*transformer)(arg);
+}
+
+
+/**
+  Compile an Item_ref object with an analyzer and a transformer
+  callback function.
+
+  First the function applies the analyzer to the Item_ref object. Then
+  if the analyzer succeeds we first apply the compile method to the
+  object the Item_ref object is referencing. If this returns a new
+  item the old item is substituted for a new one.  After this the
+  transformer is applied to the Item_ref object itself.
+  The compile function is not called if the analyzer returns NULL
+  in the parameter arg_p. 
+
+  @param analyzer      the analyzer callback function to be applied to the
+                       nodes of the tree of the object
+  @param[in,out] arg_p parameter to be passed to the analyzer
+  @param transformer   the transformer callback function to be applied to the
+                       nodes of the tree of the object
+  @param arg_t         parameter to be passed to the transformer
+
+  @return Item returned as the result of transformation of the Item_ref object
+*/
+
+Item* Item_ref::compile(Item_analyzer analyzer, uchar **arg_p,
+                        Item_transformer transformer, uchar *arg_t)
+{
+  /* Analyze this Item object. */
+  if (!(this->*analyzer)(arg_p))
+    return NULL;
+
+  /* Compile the Item we are referencing. */
+  DBUG_ASSERT((*ref) != NULL);
+  if (*arg_p)
+  {
+    uchar *arg_v= *arg_p;
+    Item *new_item= (*ref)->compile(analyzer, &arg_v, transformer, arg_t);
+    if (new_item && *ref != new_item)
+      current_thd->change_item_tree(ref, new_item);
+  }
+
+  /* Transform this Item object. */
+  return (this->*transformer)(arg_t);
+}
+
+
 void Item_ref::print(String *str, enum_query_type query_type)
 {
   if (ref)
@@ -6838,7 +7103,8 @@ bool Item_ref::val_bool_result()
     case STRING_RESULT:
       return result_field->val_real() != 0.0;
     case ROW_RESULT:
-    default:
+    case TIME_RESULT:
+    case IMPOSSIBLE_RESULT:
       DBUG_ASSERT(0);
     }
   }
@@ -6926,7 +7192,19 @@ my_decimal *Item_ref::val_decimal(my_decimal *decimal_value)
 int Item_ref::save_in_field(Field *to, bool no_conversions)
 {
   int res;
-  DBUG_ASSERT(!result_field);
+  if (result_field)
+  {
+    if (result_field->is_null())
+    {
+      null_value= 1;
+      res= set_field_to_null_with_conversions(to, no_conversions);
+      return res;
+    }
+    to->set_notnull();
+    res= field_conv(to, result_field);
+    null_value= 0;
+    return res;
+  }
   res= (*ref)->save_in_field(to, no_conversions);
   null_value= (*ref)->null_value;
   return res;
@@ -7043,8 +7321,7 @@ bool Item_direct_ref::get_date(MYSQL_TIME *ltime,uint fuzzydate)
 
 Item_cache_wrapper::~Item_cache_wrapper()
 {
-  delete expr_cache;
-  /* expr_value is Item so it will be destroyed from list of Items */
+  DBUG_ASSERT(expr_cache == 0);
 }
 
 Item_cache_wrapper::Item_cache_wrapper(Item *item_arg)
@@ -7056,9 +7333,11 @@ Item_cache_wrapper::Item_cache_wrapper(Item *item_arg)
   decimals=   orig_item->decimals;
   collation.set(orig_item->collation);
   with_sum_func= orig_item->with_sum_func;
+  with_field= orig_item->with_field;
   unsigned_flag= orig_item->unsigned_flag;
   name= item_arg->name;
   name_length= item_arg->name_length;
+  with_subselect=  orig_item->with_subselect;
 
   if ((expr_value= Item_cache::get_cache(orig_item)))
     expr_value->setup(orig_item);
@@ -7067,11 +7346,28 @@ Item_cache_wrapper::Item_cache_wrapper(Item *item_arg)
 }
 
 
+/**
+  Initialize the cache if it is needed
+*/
+
+void Item_cache_wrapper::init_on_demand()
+{
+    if (!expr_cache->is_inited())
+    {
+      orig_item->get_cache_parameters(parameters);
+      expr_cache->init();
+    }
+}
+
+
 void Item_cache_wrapper::print(String *str, enum_query_type query_type)
 {
   str->append(func_name());
   if (expr_cache)
+  {
+    init_on_demand();
     expr_cache->print(str, query_type);
+  }
   else
     str->append(STRING_WITH_LEN("<<DISABLED>>"));
   str->append('(');
@@ -7107,10 +7403,14 @@ bool Item_cache_wrapper::send(Protocol *protocol, String *buffer)
 
 void Item_cache_wrapper::cleanup()
 {
+  DBUG_ENTER("Item_cache_wrapper::cleanup");
+  Item_result_field::cleanup();
   delete expr_cache;
   expr_cache= 0;
-  // expr_value is Item so it will be destroyed from list of Items
+  /* expr_value is Item so it will be destroyed from list of Items */
   expr_value= 0;
+  parameters.empty();
+  DBUG_VOID_RETURN;
 }
 
 
@@ -7130,10 +7430,11 @@ void Item_cache_wrapper::cleanup()
   @retval TRUE  Error
 */
 
-bool Item_cache_wrapper::set_cache(THD *thd, List<Item*> &depends_on)
+bool Item_cache_wrapper::set_cache(THD *thd)
 {
   DBUG_ENTER("Item_cache_wrapper::set_cache");
-  expr_cache= new Expression_cache_tmptable(thd, depends_on, expr_value);
+  DBUG_ASSERT(expr_cache == 0);
+  expr_cache= new Expression_cache_tmptable(thd, parameters, expr_value);
   DBUG_RETURN(expr_cache == NULL);
 }
 
@@ -7158,6 +7459,7 @@ Item *Item_cache_wrapper::check_cache()
   {
     Expression_cache_tmptable::result res;
     Item *cached_value;
+    init_on_demand();
     res= expr_cache->check_value(&cached_value);
     if (res == Expression_cache_tmptable::HIT)
       DBUG_RETURN(cached_value);
@@ -7388,25 +7690,6 @@ bool Item_cache_wrapper::get_date(MYSQL_TIME *ltime, uint fuzzydate)
 }
 
 
-/**
-  Get the time value of the possibly cached item
-*/
-
-bool Item_cache_wrapper::get_time(MYSQL_TIME *ltime)
-{
-  Item *cached_value;
-  DBUG_ENTER("Item_cache_wrapper::get_time");
-  if (!expr_cache)
-    DBUG_RETURN((null_value= orig_item->get_time(ltime)));
-
-  if ((cached_value= check_cache()))
-    DBUG_RETURN((null_value= cached_value->get_time(ltime)));
-
-  cache();
-  DBUG_RETURN((null_value= expr_value->get_time(ltime)));
-}
-
-
 int Item_cache_wrapper::save_in_field(Field *to, bool no_conversions)
 {
   int res;
@@ -7495,7 +7778,7 @@ bool Item_outer_ref::fix_fields(THD *thd, Item **reference)
 
 void Item_outer_ref::fix_after_pullout(st_select_lex *new_parent, Item **ref)
 {
-  if (depended_from == new_parent)
+  if (get_depended_from() == new_parent)
   {
     *ref= outer_ref;
     (*ref)->fix_after_pullout(new_parent, ref);
@@ -7505,7 +7788,7 @@ void Item_outer_ref::fix_after_pullout(st_select_lex *new_parent, Item **ref)
 void Item_ref::fix_after_pullout(st_select_lex *new_parent, Item **refptr)
 {
   (*ref)->fix_after_pullout(new_parent, ref);
-  if (depended_from == new_parent)
+  if (get_depended_from() == new_parent)
     depended_from= NULL;
 }
 
@@ -7577,6 +7860,138 @@ bool Item_direct_view_ref::eq(const Item *item, bool binary_cmp) const
   return FALSE;
 }
 
+
+Item_equal *Item_direct_view_ref::find_item_equal(COND_EQUAL *cond_equal)
+{
+  Item* field_item= real_item();
+  if (field_item->type() != FIELD_ITEM)
+    return NULL;
+  return ((Item_field *) field_item)->find_item_equal(cond_equal);  
+}
+
+
+/**
+  Check whether a reference to field item can be substituted for an equal item
+
+  @details
+  The function checks whether a substitution of a reference to field item for
+  an equal item is valid.
+
+  @param arg   *arg != NULL <-> the reference is in the context
+               where substitution for an equal item is valid
+
+  @note
+    See also the note for Item_field::subst_argument_checker
+
+  @retval
+    TRUE   substitution is valid
+  @retval
+    FALSE  otherwise
+*/
+bool Item_direct_view_ref::subst_argument_checker(uchar **arg)
+{
+  bool res= FALSE;
+  if (*arg)
+  { 
+    Item *item= real_item();
+    if (item->type() == FIELD_ITEM &&
+        (*arg == (uchar *) Item::ANY_SUBST || 
+         result_type() != STRING_RESULT ||
+         (((Item_field *) item)->field->flags & BINARY_FLAG)))
+      res= TRUE;
+  }
+  /* Block any substitution into the wrapped object */
+  if (*arg)
+    *arg= NULL; 
+  return res; 
+}
+
+
+/**
+  Set a pointer to the multiple equality the view field reference belongs to
+  (if any).
+
+  @details
+  The function looks for a multiple equality containing this item of the type
+  Item_direct_view_ref among those referenced by arg.
+  In the case such equality exists the function does the following.
+  If the found multiple equality contains a constant, then the item
+  is substituted for this constant, otherwise the function sets a pointer
+  to the multiple equality in the item.
+
+  @param arg    reference to list of multiple equalities where
+                the item (this object) is to be looked for
+
+  @note
+    This function is supposed to be called as a callback parameter in calls
+    of the compile method.
+
+  @note 
+    The function calls Item_field::equal_fields_propagator for the field item
+    this->real_item() to do the job. Then it takes the pointer to equal_item
+    from this field item and assigns it to this->item_equal.
+
+  @return
+    - pointer to the replacing constant item, if the field item was substituted
+    - pointer to this view reference item ('this'), otherwise.
+*/
+
+Item *Item_direct_view_ref::equal_fields_propagator(uchar *arg)
+{
+  Item *field_item= real_item();
+  if (field_item->type() != FIELD_ITEM)
+    return this;
+  Item *item= field_item->equal_fields_propagator(arg);
+  set_item_equal(field_item->get_item_equal());
+  field_item->set_item_equal(NULL);
+  if (item != field_item)
+    return item;
+  return this;
+}
+
+
+/**
+  Replace an Item_direct_view_ref with an equal Item_field evaluated earlier
+  (if any).
+
+  @details
+  If this->item_equal points to some item and coincides with arg then
+  the function returns a pointer to a field item that is referred to by the 
+  first element of the item_equal list which the Item_direct_view_ref
+  object belongs to unless item_equal contains a constant item. In this
+  case the function returns this constant item (if the substitution does
+  not require conversion).
+  If the Item_direct_view_ref object does not refer to any Item_equal object
+  'this' is returned.
+
+  @param arg   NULL or points to some item of the Item_equal type
+
+  @note
+    This function is supposed to be called as a callback parameter in calls
+    of the transformer method.
+
+  @note 
+    The function calls Item_field::replace_equal_field for the field item
+    this->real_item() to do the job.
+
+  @return
+    - pointer to a replacement Item_field if there is a better equal item or
+      a pointer to a constant equal item;
+    - this - otherwise.
+*/
+
+Item *Item_direct_view_ref::replace_equal_field(uchar *arg)
+{
+  Item *field_item= real_item();
+  if (field_item->type() != FIELD_ITEM)
+    return this;
+  field_item->set_item_equal(item_equal);
+  Item *item= field_item->replace_equal_field(arg);
+  field_item->set_item_equal(0);
+  return item != field_item ? item : this;
+}
+
+
 bool Item_default_value::eq(const Item *item, bool binary_cmp) const
 {
   return item->type() == DEFAULT_VALUE_ITEM && 
@@ -7949,6 +8364,8 @@ Item_result item_cmp_type(Item_result a,Item_result b)
     return INT_RESULT;
   else if (a == ROW_RESULT || b == ROW_RESULT)
     return ROW_RESULT;
+  else if (a == TIME_RESULT || b == TIME_RESULT)
+    return TIME_RESULT;
   if ((a == INT_RESULT || a == DECIMAL_RESULT) &&
       (b == INT_RESULT || b == DECIMAL_RESULT))
     return DECIMAL_RESULT;
@@ -7962,11 +8379,20 @@ void resolve_const_item(THD *thd, Item **ref, Item *comp_item)
   Item *new_item= NULL;
   if (item->basic_const_item())
     return;                                     // Can't be better
-  Item_result res_type=item_cmp_type(comp_item->result_type(),
-				     item->result_type());
+  Item_result res_type=item_cmp_type(comp_item->cmp_type(), item->cmp_type());
   char *name=item->name;			// Alloced by sql_alloc
 
   switch (res_type) {
+  case TIME_RESULT:
+  {
+    bool is_null;
+    Item **ref_copy= ref;
+    /* the following call creates a constant and puts it in new_item */
+    get_datetime_value(thd, &ref_copy, &new_item, comp_item, &is_null);
+    if (is_null)
+      new_item= new Item_null(name);
+    break;
+  }
   case STRING_RESULT:
   {
     char buff[MAX_FIELD_WIDTH];
@@ -8041,8 +8467,9 @@ void resolve_const_item(THD *thd, Item **ref, Item *comp_item)
                (Item*) new Item_decimal(name, result, length, decimals));
     break;
   }
-  default:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);
+    break;
   }
   if (new_item)
     thd->change_item_tree(ref, new_item);
@@ -8062,6 +8489,9 @@ void resolve_const_item(THD *thd, Item **ref, Item *comp_item)
   @note We only use this on the range optimizer/partition pruning,
         because in some cases we can't store the value in the field
         without some precision/character loss.
+
+  @todo rewrite it to use Arg_comparator (currently it's a simplified and
+        incomplete version of it)
 */
 
 int stored_field_cmp_to_item(THD *thd, Field *field, Item *item)
@@ -8093,9 +8523,7 @@ int stored_field_cmp_to_item(THD *thd, Field *field, Item *item)
 
       if (field_type == MYSQL_TYPE_DATE)
         type= MYSQL_TIMESTAMP_DATE;
-
-      if (field_type == MYSQL_TYPE_DATETIME ||
-          field_type == MYSQL_TYPE_TIMESTAMP)
+      else
         type= MYSQL_TIMESTAMP_DATETIME;
         
       const char *field_name= field->field_name;
@@ -8119,6 +8547,25 @@ int stored_field_cmp_to_item(THD *thd, Field *field, Item *item)
     field_val= field->val_decimal(&field_buf);
     return my_decimal_cmp(item_val, field_val);
   }
+  /*
+    We have to check field->cmp_type() instead of res_type,
+    as result_type() - and thus res_type - can never be TIME_RESULT (yet).
+  */
+  if (field->cmp_type() == TIME_RESULT)
+  {
+    MYSQL_TIME field_time, item_time;
+    if (field->type() == MYSQL_TYPE_TIME)
+    {
+      field->get_time(&field_time);
+      item->get_time(&item_time);
+    }
+    else
+    {
+      field->get_date(&field_time, TIME_FUZZY_DATE | TIME_INVALID_DATES);
+      item->get_date(&item_time, TIME_FUZZY_DATE | TIME_INVALID_DATES);
+    }
+    return my_time_compare(&field_time, &item_time);
+  }
   double result= item->val_real();
   if (item->null_value)
     return 0;
@@ -8155,19 +8602,16 @@ Item_cache* Item_cache::get_cache(const Item *item, const Item_result type)
   case DECIMAL_RESULT:
     return new Item_cache_decimal();
   case STRING_RESULT:
-    /* Not all functions that return DATE/TIME are actually DATE/TIME funcs. */
-    if ((item->is_datetime() ||
-         item->field_type() == MYSQL_TYPE_TIME) &&
-        (const_cast<Item*>(item))->result_as_longlong())
-      return new Item_cache_datetime(item->field_type());
     return new Item_cache_str(item);
   case ROW_RESULT:
     return new Item_cache_row();
-  default:
-    // should never be in real life
+  case TIME_RESULT:
+    return new Item_cache_temporal(item->field_type());
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);
-    return 0;
+    break;
   }
+  return 0;                                     // Impossible
 }
 
 void Item_cache::store(Item *item)
@@ -8180,6 +8624,11 @@ void Item_cache::store(Item *item)
 
 void Item_cache::print(String *str, enum_query_type query_type)
 {
+  if (value_cached)
+  {
+    print_value(str);
+    return;
+  }
   str->append(STRING_WITH_LEN("<cache>("));
   if (example)
     example->print(str, query_type);
@@ -8200,16 +8649,6 @@ bool  Item_cache_int::cache_value()
 }
 
 
-void Item_cache_int::store_longlong(Item *item, longlong val_arg)
-{
-  /* An explicit values is given, save it. */
-  value_cached= TRUE;
-  value= val_arg;
-  null_value= item->null_value;
-  unsigned_flag= item->unsigned_flag;
-}
-
-
 String *Item_cache_int::val_str(String *str)
 {
   DBUG_ASSERT(fixed == 1);
@@ -8245,171 +8684,104 @@ longlong Item_cache_int::val_int()
   return value;
 }
 
-bool  Item_cache_datetime::cache_value_int()
+int Item_cache_int::save_in_field(Field *field, bool no_conversions)
 {
-  if (!example)
-    return false;
-
-  value_cached= true;
-  // Mark cached string value obsolete
-  str_value_cached= false;
-
-  MYSQL_TIME ltime;
-  const bool eval_error= 
-    (field_type() == MYSQL_TYPE_TIME) ?
-    example->get_time(&ltime) :
-    example->get_date(&ltime, TIME_FUZZY_DATE);
-
-  if (eval_error)
-    int_value= 0;
-  else
-  {
-    switch(field_type())
-    {
-    case MYSQL_TYPE_DATETIME:
-    case MYSQL_TYPE_TIMESTAMP:
-      int_value= TIME_to_ulonglong_datetime(&ltime);
-      break;
-    case MYSQL_TYPE_TIME:
-      int_value= TIME_to_ulonglong_time(&ltime);
-      break;
-    default:
-      int_value= TIME_to_ulonglong_date(&ltime);
-      break;
-    }
-    if (ltime.neg)
-      int_value= -int_value;
-  }
+  int error;
+  if (!has_value())
+    return set_field_to_null_with_conversions(field, no_conversions);
 
-  null_value= example->null_value;
-  unsigned_flag= example->unsigned_flag;
+  field->set_notnull();
+  error= field->store(value, unsigned_flag);
 
-  return true;
+  return error ? error : field->table->in_use->is_error() ? 1 : 0;
 }
 
 
-bool  Item_cache_datetime::cache_value()
+Item_cache_temporal::Item_cache_temporal(enum_field_types field_type_arg):
+  Item_cache_int(field_type_arg)
 {
-  if (!example)
-    return FALSE;
-
-  if (cmp_context == INT_RESULT)
-    return cache_value_int();
-
-  str_value_cached= TRUE;
-  // Mark cached int value obsolete
-  value_cached= FALSE;
-  /* Assume here that the underlying item will do correct conversion.*/
-  String *res= example->str_result(&str_value);
-  if (res && res != &str_value)
-    str_value.copy(*res);
-  null_value= example->null_value;
-  unsigned_flag= example->unsigned_flag;
-  return TRUE;
+  if (mysql_type_to_time_type(cached_field_type) == MYSQL_TIMESTAMP_ERROR)
+    cached_field_type= MYSQL_TYPE_DATETIME;
 }
 
 
-void Item_cache_datetime::store(Item *item, longlong val_arg)
+String *Item_cache_temporal::val_str(String *str)
 {
-  /* An explicit values is given, save it. */
-  value_cached= TRUE;
-  int_value= val_arg;
-  null_value= item->null_value;
-  unsigned_flag= item->unsigned_flag;
+  DBUG_ASSERT(fixed == 1);
+  if (!has_value())
+  {
+    null_value= true;
+    return NULL;
+  }
+  return val_string_from_date(str);
 }
 
 
-void Item_cache_datetime::store(Item *item)
+bool  Item_cache_temporal::cache_value()
 {
-  Item_cache::store(item);
-  str_value_cached= FALSE;
+  if (!example)
+    return false;
+
+  value_cached= true;
+ 
+  MYSQL_TIME ltime;
+  if (example->get_date(&ltime, TIME_FUZZY_DATE))
+    value=0;
+  else
+    value= pack_time(&ltime);
+  null_value= example->null_value;
+  return true;
 }
 
-String *Item_cache_datetime::val_str(String *str)
+
+bool Item_cache_temporal::get_date(MYSQL_TIME *ltime, uint fuzzydate)
 {
-  DBUG_ASSERT(fixed == 1);
+  ErrConvInteger str(value);
 
-  if ((value_cached || str_value_cached) && null_value)
-    return NULL;
+  if (!has_value())
+  {
+    bzero((char*) ltime,sizeof(*ltime));
+    return 1;
+  }
 
-  if (!str_value_cached)
+  unpack_time(value, ltime);
+  ltime->time_type= mysql_type_to_time_type(field_type());
+  if (ltime->time_type == MYSQL_TIMESTAMP_TIME)
   {
-    /*
-      When it's possible the Item_cache_datetime uses INT datetime
-      representation due to speed reasons. But still, it always has the STRING
-      result type and thus it can be asked to return a string value. 
-      It is possible that at this time cached item doesn't contain correct
-      string value, thus we have to convert cached int value to string and
-      return it.
-    */
-    if (value_cached)
-    {
-      MYSQL_TIME ltime;
-      /* Return NULL in case of OOM/conversion error. */
-      null_value= TRUE;
-      if (str_value.alloc(MAX_DATE_STRING_REP_LENGTH))
-        return NULL;
-      if (cached_field_type == MYSQL_TYPE_TIME)
-      {
-        longlong time= int_value;
-        set_zero_time(&ltime, MYSQL_TIMESTAMP_TIME);
-        if (time < 0)
-        {
-          time= -time;
-          ltime.neg= TRUE;
-        }
-        DBUG_ASSERT(time <= TIME_MAX_VALUE);
-        ltime.second= time % 100;
-        time/= 100;
-        ltime.minute= time % 100;
-        time/= 100;
-        ltime.hour= time;
-      }
-      else
-      {
-        int was_cut;
-        longlong res;
-        res= number_to_datetime(int_value, &ltime, TIME_FUZZY_DATE, &was_cut);
-        if (res == -1)
-          return NULL;
-      }
-      str_value.length(my_TIME_to_str(&ltime,
-                                      const_cast<char*>(str_value.ptr())));
-      str_value_cached= TRUE;
-      null_value= FALSE;
-    }
-    else if (!cache_value())
-      return NULL;
+    ltime->hour+= (ltime->month*32+ltime->day)*24;
+    ltime->month= ltime->day= 0;
   }
-  return null_value ? NULL : &str_value;
+  return 0;
+ 
 }
 
 
-my_decimal *Item_cache_datetime::val_decimal(my_decimal *decimal_val)
+int Item_cache_temporal::save_in_field(Field *field, bool no_conversions)
 {
-  DBUG_ASSERT(fixed == 1);
+  int error;
   if (!has_value())
-    return NULL;
-  int2my_decimal(E_DEC_FATAL_ERROR, int_value, unsigned_flag, decimal_val);
-  return decimal_val;
-}
+    return set_field_to_null_with_conversions(field, no_conversions);
 
-double Item_cache_datetime::val_real()
-{
-  DBUG_ASSERT(fixed == 1);
-  if ((!value_cached && !cache_value_int()) || null_value)
-    return 0.0;
-  return (double) int_value;
+  field->set_notnull();
+ 
+  MYSQL_TIME ltime;
+  unpack_time(value, &ltime);
+  ltime.time_type= mysql_type_to_time_type(field_type());
+  error= field->store_time_dec(&ltime, decimals);
+ 
+  return error ? error : field->table->in_use->is_error() ? 1 : 0;
 }
 
-longlong Item_cache_datetime::val_int()
+
+void Item_cache_temporal::store_packed(longlong val_arg)
 {
-  DBUG_ASSERT(fixed == 1);
-  if ((!value_cached && !cache_value_int()) || null_value)
-    return 0;
-  return int_value;
+  /* An explicit value is given, save it. */
+  value_cached= true;
+  value= val_arg;
+  null_value= false;
 }
 
+
 bool Item_cache_real::cache_value()
 {
   if (!example)
@@ -8587,7 +8959,7 @@ my_decimal *Item_cache_str::val_decimal(my_decimal *decimal_val)
 int Item_cache_str::save_in_field(Field *field, bool no_conversions)
 {
   if (!has_value())
-    return 0;
+    return set_field_to_null_with_conversions(field, no_conversions);
   int res= Item_cache::save_in_field(field, no_conversions);
   return (is_varbinary && field->type() == MYSQL_TYPE_STRING &&
           value->length() < field->field_length) ? 1 : res;
@@ -8741,12 +9113,14 @@ Item_result Item_type_holder::result_type() const
 
 enum_field_types Item_type_holder::get_real_type(Item *item)
 {
+  if (item->type() == REF_ITEM)
+    item= item->real_item();
   switch(item->type())
   {
   case FIELD_ITEM:
   {
     /*
-      Item_fields::field_type ask Field_type() but sometimes field return
+      Item_field::field_type ask Field_type() but sometimes field return
       a different type, like for enum/set, so we need to ask real type.
     */
     Field *field= ((Item_field *) item)->field;
@@ -8788,7 +9162,8 @@ enum_field_types Item_type_holder::get_real_type(Item *item)
       case DECIMAL_RESULT:
         return MYSQL_TYPE_NEWDECIMAL;
       case ROW_RESULT:
-      default:
+      case TIME_RESULT:
+      case IMPOSSIBLE_RESULT:
         DBUG_ASSERT(0);
         return MYSQL_TYPE_VAR_STRING;
       }
@@ -9115,6 +9490,49 @@ void view_error_processor(THD *thd, void *data)
   ((TABLE_LIST *)data)->hide_view_error(thd);
 }
 
+
+st_select_lex *Item_ident::get_depended_from() const
+{
+  st_select_lex *dep;
+  if ((dep= depended_from))
+    for ( ; dep->merged_into; dep= dep->merged_into) ;
+  return dep;
+}
+
+
+table_map Item_ref::used_tables() const		
+{
+  return get_depended_from() ? OUTER_REF_TABLE_BIT : (*ref)->used_tables(); 
+}
+
+
+void Item_ref::update_used_tables() 
+{ 
+  if (!get_depended_from())
+    (*ref)->update_used_tables(); 
+}
+
+
+table_map Item_direct_view_ref::used_tables() const		
+{
+  return get_depended_from() ? 
+         OUTER_REF_TABLE_BIT :
+         (view->merged ? (*ref)->used_tables() : view->table->map); 
+}
+
+
+/*
+  we add RAND_TABLE_BIT to prevent moving this item from HAVING to WHERE
+*/
+table_map Item_ref_null_helper::used_tables() const
+{
+  return (get_depended_from() ?
+          OUTER_REF_TABLE_BIT :
+          (*ref)->used_tables() | RAND_TABLE_BIT);
+}
+
+
+
 /*****************************************************************************
 ** Instantiate templates
 *****************************************************************************/
diff --git a/sql/item.h b/sql/item.h
index b13438b248f..cda2ab73c4a 100644
--- a/sql/item.h
+++ b/sql/item.h
@@ -1,7 +1,8 @@
 #ifndef SQL_ITEM_INCLUDED
 #define SQL_ITEM_INCLUDED
 
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2011, Oracle and/or its affiliates.
+   Copyright (c) 2009-2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -28,6 +29,10 @@
 #include "thr_malloc.h"                         /* sql_calloc */
 #include "field.h"                              /* Derivation */
 
+C_MODE_START
+#include <ma_dyncol.h>
+C_MODE_END
+
 static inline
 bool trace_unsupported_func(const char *where, const char *processor_name)
 {
@@ -506,6 +511,17 @@ public:
 };
 
 
+struct st_dyncall_create_def
+{
+  Item  *num, *value;
+  CHARSET_INFO *cs;
+  uint len, frac;
+  DYNAMIC_COLUMN_TYPE type;
+};
+
+typedef struct st_dyncall_create_def DYNCALL_CREATE_DEF;
+
+
 typedef bool (Item::*Item_processor) (uchar *arg);
 /*
   Analyzer function
@@ -522,10 +538,24 @@ typedef bool (Item::*Item_analyzer) (uchar **argp);
 typedef Item* (Item::*Item_transformer) (uchar *arg);
 typedef void (*Cond_traverser) (const Item *item, void *arg);
 
+class Item_equal;
+class COND_EQUAL;
+
 
 class Item {
   Item(const Item &);			/* Prevent use of these */
   void operator=(Item &);
+  /**
+    The index in the JOIN::join_tab array of the JOIN_TAB this Item is attached
+    to. Items are attached (or 'pushed') to JOIN_TABs during optimization by the
+    make_cond_for_table procedure. During query execution, this item is
+    evaluated when the join loop reaches the corresponding JOIN_TAB.
+
+    If the value of join_tab_idx >= MAX_TABLES, this means that there is no
+    corresponding JOIN_TAB.
+  */
+  uint join_tab_idx;
+
 public:
   static void *operator new(size_t size) throw ()
   { return sql_alloc(size); }
@@ -572,16 +602,23 @@ public:
   Item *next;
   uint32 max_length;                    /* Maximum length, in bytes */
   /*
-    TODO: convert name and name_length fields into String to keep them in sync
-    (see bug #11829681/60295 etc).
+    TODO: convert name and name_length fields into LEX_STRING to keep them in
+    sync (see bug #11829681/60295 etc). Then also remove some strlen(name)
+    calls.
   */
   uint name_length;                     /* Length of name */
   int8 marker;
   uint8 decimals;
   bool maybe_null;			/* If item may be null */
+  bool in_rollup;                       /* If used in GROUP BY list
+                                           of a query with ROLLUP */ 
   bool null_value;			/* if item is null */
   bool unsigned_flag;
-  bool with_sum_func;
+  bool with_sum_func;                   /* True if item contains a sum func */
+  /**
+    True if any item except Item_sum_func contains a field. Set during parsing.
+  */
+  bool with_field;
   bool fixed;                           /* If item fixed with fix_fields */
   bool is_autogenerated_name;           /* indicate was name of this Item
                                            autogenerated or set by user */
@@ -621,10 +658,17 @@ public:
   virtual void fix_after_pullout(st_select_lex *new_parent, Item **ref) {};
 
   /*
-    should be used in case where we are sure that we do not need
+    This method should be used in case where we are sure that we do not need
     complete fix_fields() procedure.
+    Usually this method is used by the optimizer when it has to create a new
+    item out of other already fixed items. For example, if the optimizer has
+    to create a new Item_func for an inferred equality whose left and right
+    parts are already fixed items. In some cases the optimizer cannot use
+    directly fixed items as the arguments of the created functional item, 
+    but rather uses intermediate type conversion items. Then the method is
+    supposed to be applied recursively.  
   */
-  inline void quick_fix_field() { fixed= 1; }
+  virtual inline void quick_fix_field() { fixed= 1; }
   /* Function returns 1 on overflow and -1 on fatal errors */
   int save_in_field_no_warnings(Field *field, bool no_conversions);
   virtual int save_in_field(Field *field, bool no_conversions);
@@ -634,11 +678,20 @@ public:
   { return save_in_field(field, 1); }
   virtual bool send(Protocol *protocol, String *str);
   virtual bool eq(const Item *, bool binary_cmp) const;
+  /* result_type() of an item specifies how the value should be returned */
   virtual Item_result result_type() const { return REAL_RESULT; }
-  virtual Item_result cast_to_int_type() const { return result_type(); }
+  /* ... while cmp_type() specifies how it should be compared */
+  virtual Item_result cmp_type() const;
+  virtual Item_result cast_to_int_type() const { return cmp_type(); }
   virtual enum_field_types string_field_type() const;
   virtual enum_field_types field_type() const;
   virtual enum Type type() const =0;
+  /*
+    real_type() is the type of the base item.  This is the same as type() for
+    most items, except Item_ref() and Item_cache_wrapper() where it
+    shows the type of the underlying item.
+  */
+  virtual enum Type real_type() const { return type(); }
   
   /*
     Return information about function monotonicity. See comment for
@@ -862,6 +915,7 @@ public:
   String *val_string_from_real(String *str);
   String *val_string_from_int(String *str);
   String *val_string_from_decimal(String *str);
+  String *val_string_from_date(String *str);
   my_decimal *val_decimal_from_real(my_decimal *decimal_value);
   my_decimal *val_decimal_from_int(my_decimal *decimal_value);
   my_decimal *val_decimal_from_string(my_decimal *decimal_value);
@@ -878,6 +932,8 @@ public:
   /* This is also used to create fields in CREATE ... SELECT: */
   virtual Field *tmp_table_field(TABLE *t_arg) { return 0; }
   virtual const char *full_name() const { return name ? name : "???"; }
+  const char *field_name_or_null()
+  { return real_item()->type() == Item::FIELD_ITEM ? name : NULL; }
 
   /*
     *result* family of methods is analog of *val* family (see above) but
@@ -892,13 +948,18 @@ public:
   { return val_decimal(val); }
   virtual bool val_bool_result() { return val_bool(); }
   virtual bool is_null_result() { return is_null(); }
-
+  /*
+    Returns 1 if result type and collation for val_str() can change between
+    calls
+  */
+  virtual bool dynamic_result() { return 0; }
   /* 
     Bitmap of tables used by item
     (note: if you need to check dependencies on individual columns, check out
      class Field_enumerator)
   */
   virtual table_map used_tables() const { return (table_map) 0L; }
+  virtual table_map all_used_tables() const { return used_tables(); }
   /*
     Return table map of tables that can't be NULL tables (tables that are
     used in a context where if they would contain a NULL row generated
@@ -953,6 +1014,7 @@ public:
   }
 
   void print_item_w_name(String *, enum_query_type query_type);
+  void print_value(String *);
   virtual void update_used_tables() {}
   virtual void split_sum_func(THD *thd, Item **ref_pointer_array,
                               List<Item> &fields) {}
@@ -960,7 +1022,9 @@ public:
   void split_sum_func2(THD *thd, Item **ref_pointer_array, List<Item> &fields,
                        Item **ref, bool skip_registered);
   virtual bool get_date(MYSQL_TIME *ltime,uint fuzzydate);
-  virtual bool get_time(MYSQL_TIME *ltime);
+  bool get_time(MYSQL_TIME *ltime)
+  { return get_date(ltime, TIME_TIME_ONLY | TIME_FUZZY_DATE); }
+  bool get_seconds(ulonglong *sec, ulong *sec_part);
   virtual bool get_date_result(MYSQL_TIME *ltime,uint fuzzydate)
   { return get_date(ltime,fuzzydate); }
   /*
@@ -1075,10 +1139,14 @@ public:
   virtual bool reset_query_id_processor(uchar *query_id_arg) { return 0; }
   virtual bool is_expensive_processor(uchar *arg) { return 0; }
   virtual bool register_field_in_read_map(uchar *arg) { return 0; }
-  virtual bool cache_const_expr_analyzer(uchar **arg);
-  virtual Item* cache_const_expr_transformer(uchar *arg);
+  virtual bool register_field_in_write_map(uchar *arg) { return 0; }
   virtual bool enumerate_field_refs_processor(uchar *arg) { return 0; }
   virtual bool mark_as_eliminated_processor(uchar *arg) { return 0; }
+  virtual bool eliminate_subselect_processor(uchar *arg) { return 0; }
+  virtual bool set_fake_select_as_master_processor(uchar *arg) { return 0; }
+  virtual bool view_used_tables_processor(uchar *arg) { return 0; }
+  virtual bool eval_not_null_tables(uchar *opt_arg) { return 0; }
+  virtual bool clear_sum_processor(uchar *opt_arg) { return 0; }
 
   /* To call bool function for all arguments */
   struct bool_func_call_args
@@ -1094,12 +1162,16 @@ public:
       (this->*(info->bool_function))();
     return FALSE;
   }
+
   /*
     The next function differs from the previous one that a bitmap to be updated
     is passed as uchar *arg.
   */
   virtual bool register_field_in_bitmap(uchar *arg) { return 0; }
 
+  bool cache_const_expr_analyzer(uchar **arg);
+  Item* cache_const_expr_transformer(uchar *arg);
+
   /*
     Check if a partition function is allowed
     SYNOPSIS
@@ -1167,12 +1239,23 @@ public:
     return FALSE;
   }
 
+  /*
+    The enumeration Subst_constraint is currently used only in implementations
+    of the virtual function subst_argument_checker.
+  */ 
+  enum Subst_constraint 
+  { 
+    NO_SUBST= 0,         /* No substitution for a field is allowed   */
+    ANY_SUBST,           /* Any substitution for a field is allowed  */ 
+    IDENTITY_SUBST       /* Substitution for a field is allowed if any two
+                            different values of the field type are not equal */
+  };
+
   virtual bool subst_argument_checker(uchar **arg)
-  {
-    if (*arg)
-      *arg= NULL;
-    return TRUE;
+  { 
+    return (*arg != NULL); 
   }
+
   /*
     @brief
     Processor used to check acceptability of an item in the defining
@@ -1203,6 +1286,15 @@ public:
   {
     return FALSE;
   }
+  struct Collect_deps_prm
+  {
+    int nest_level;
+    List<Item> *parameters;
+  };
+  /**
+    Collect outer references
+  */
+  virtual bool collect_outer_ref_processor(uchar *arg) {return FALSE; }
 
   /**
     Find a function of a given type
@@ -1270,47 +1362,17 @@ public:
   {
     return 0;
   }
-  /*
-    result_as_longlong() must return TRUE for Items representing DATE/TIME
-    functions and DATE/TIME table fields.
-    Those Items have result_type()==STRING_RESULT (and not INT_RESULT), but
-    their values should be compared as integers (because the integer
-    representation is more precise than the string one).
-  */
-  virtual bool result_as_longlong() { return FALSE; }
-  inline bool is_datetime() const
-  {
-    switch (field_type())
-    {
-      case MYSQL_TYPE_DATE:
-      case MYSQL_TYPE_DATETIME:
-      case MYSQL_TYPE_TIMESTAMP:
-        return TRUE;
-      default:
-        break;
-    }
-    return FALSE;
-  }
   /**
     Check whether this and the given item has compatible comparison context.
     Used by the equality propagation. See Item_field::equal_fields_propagator.
 
     @return
-      TRUE  if the context is the same or if fields could be
-            compared as DATETIME values by the Arg_comparator.
+      TRUE  if the context is the same
       FALSE otherwise.
   */
   inline bool has_compatible_context(Item *item) const
   {
-    /* Same context. */
-    if (cmp_context == (Item_result)-1 || item->cmp_context == cmp_context)
-      return TRUE;
-    /* DATETIME comparison context. */
-    if (is_datetime())
-      return item->is_datetime() || item->cmp_context == STRING_RESULT;
-    if (item->is_datetime())
-      return is_datetime() || cmp_context == STRING_RESULT;
-    return FALSE;
+    return cmp_context == IMPOSSIBLE_RESULT || item->cmp_context == cmp_context;
   }
   /*
     Test whether an expression is expensive to compute. Used during
@@ -1362,19 +1424,53 @@ public:
     else
       max_length= (uint32) max_result_length;
   }
-  void fix_length_and_charset_datetime(uint32 max_char_length_arg)
-  {
-    collation.set(&my_charset_numeric, DERIVATION_NUMERIC, MY_REPERTOIRE_ASCII);
-    fix_char_length(max_char_length_arg);
-  }
   /*
     Return TRUE if the item points to a column of an outer-joined table.
   */
   virtual bool is_outer_field() const { DBUG_ASSERT(fixed); return FALSE; }
-  Item* set_expr_cache(THD *thd, List<Item*> &depends_on);
+  Item* set_expr_cache(THD *thd);
   virtual Item *get_cached_item() { return NULL; }
+
+  virtual Item_equal *get_item_equal() { return NULL; }
+  virtual void set_item_equal(Item_equal *item_eq) {};
+  virtual Item_equal *find_item_equal(COND_EQUAL *cond_equal) { return NULL; }
+  /**
+    Set the join tab index to the minimal (left-most) JOIN_TAB to which this
+    Item is attached. The number is an index in depth_first_tab() traversal
+    order.
+  */
+  virtual void set_join_tab_idx(uint join_tab_idx_arg)
+  {
+    if (join_tab_idx_arg < join_tab_idx)
+      join_tab_idx= join_tab_idx_arg;
+  }
+  virtual uint get_join_tab_idx() { return join_tab_idx; }
+
+  table_map view_used_tables(TABLE_LIST *view)
+  {
+    view->view_used_tables= 0;
+    walk(&Item::view_used_tables_processor, 0, (uchar *) view);
+    return view->view_used_tables;
+  }
+
+  /**
+    Collect and add to the list cache parameters for this Item.
+
+    @note Currently implemented only for subqueries and in_optimizer;
+    if it is needed for functions in general, then this method should
+    be defined for Item_func.
+  */
+  virtual void get_cache_parameters(List<Item> &parameters) { };
 };
 
+
+/**
+  Compare two Items for List<Item>::add_unique()
+*/
+
+bool cmp_items(Item *a, Item *b);
+
+
 /*
   Class to be used to enumerate all field references in an item tree. This
   includes references to outside but not fields of the tables within a
@@ -1808,10 +1904,15 @@ public:
   Item_ident(TABLE_LIST *view_arg, const char *field_name_arg);
   const char *full_name() const;
   void cleanup();
+  st_select_lex *get_depended_from() const;
   bool remove_dependence_processor(uchar * arg);
   virtual void print(String *str, enum_query_type query_type);
   virtual bool change_context_processor(uchar *cntx)
     { context= (Name_resolution_context *)cntx; return FALSE; }
+  /**
+    Collect outer references
+  */
+  virtual bool collect_outer_ref_processor(uchar *arg);
   friend bool insert_fields(THD *thd, Name_resolution_context *context,
                             const char *db_name,
                             const char *table_name, List_iterator<Item> *it,
@@ -1842,9 +1943,6 @@ public:
 };
 
 
-class Item_equal;
-class COND_EQUAL;
-
 class Item_field :public Item_ident
 {
 protected:
@@ -1900,13 +1998,14 @@ public:
   int save_in_field(Field *field,bool no_conversions);
   void save_org_in_field(Field *field);
   table_map used_tables() const;
+  table_map all_used_tables() const; 
   enum Item_result result_type () const
   {
     return field->result_type();
   }
   Item_result cast_to_int_type() const
   {
-    return field->cast_to_int_type();
+    return field->cmp_type();
   }
   enum_field_types field_type() const
   {
@@ -1921,7 +2020,6 @@ public:
   Field *tmp_table_field(TABLE *t_arg) { return result_field; }
   bool get_date(MYSQL_TIME *ltime,uint fuzzydate);
   bool get_date_result(MYSQL_TIME *ltime,uint fuzzydate);
-  bool get_time(MYSQL_TIME *ltime);
   bool is_null() { return field->is_null(); }
   void update_null_value();
   Item *get_tmp_table_item(THD *thd);
@@ -1929,16 +2027,15 @@ public:
   bool add_field_to_set_processor(uchar * arg);
   bool find_item_in_field_list_processor(uchar *arg);
   bool register_field_in_read_map(uchar *arg);
+  bool register_field_in_write_map(uchar *arg);
   bool register_field_in_bitmap(uchar *arg);
   bool check_partition_func_processor(uchar *int_arg) {return FALSE;}
   bool vcol_in_partition_func_processor(uchar *bool_arg);
   bool check_vcol_func_processor(uchar *arg) { return FALSE;}
   bool enumerate_field_refs_processor(uchar *arg);
   void cleanup();
-  bool result_as_longlong()
-  {
-    return field->can_be_compared_as_longlong();
-  }
+  Item_equal *get_item_equal() { return item_equal; }
+  void set_item_equal(Item_equal *item_eq) { item_equal= item_eq; }
   Item_equal *find_item_equal(COND_EQUAL *cond_equal);
   bool subst_argument_checker(uchar **arg);
   Item *equal_fields_propagator(uchar *arg);
@@ -2096,6 +2193,7 @@ public:
   Item_param(uint pos_in_query_arg);
 
   enum Item_result result_type () const { return item_result_type; }
+  enum Item_result cast_to_int_type() const { return item_result_type; }
   enum Type type() const { return item_type; }
   enum_field_types field_type() const { return param_type; }
 
@@ -2103,7 +2201,6 @@ public:
   longlong val_int();
   my_decimal *val_decimal(my_decimal*);
   String *val_str(String*);
-  bool get_time(MYSQL_TIME *tm);
   bool get_date(MYSQL_TIME *tm, uint fuzzydate);
   int  save_in_field(Field *field, bool no_conversions);
 
@@ -2223,19 +2320,31 @@ class Item_uint :public Item_int
 {
 public:
   Item_uint(const char *str_arg, uint length);
-  Item_uint(ulonglong i) :Item_int((ulonglong) i, 10) {}
+  Item_uint(ulonglong i) :Item_int(i, 10) {}
   Item_uint(const char *str_arg, longlong i, uint length);
   double val_real()
     { DBUG_ASSERT(fixed == 1); return ulonglong2double((ulonglong)value); }
   String *val_str(String*);
   Item *clone_item() { return new Item_uint(name, value, max_length); }
-  int save_in_field(Field *field, bool no_conversions);
   virtual void print(String *str, enum_query_type query_type);
   Item_num *neg ();
   uint decimal_precision() const { return max_length; }
 };
 
 
+class Item_datetime :public Item_int
+{
+protected:
+  MYSQL_TIME ltime;
+public:
+  Item_datetime() :Item_int(0) { unsigned_flag=0; }
+  int save_in_field(Field *field, bool no_conversions);
+  longlong val_int();
+  double val_real() { return (double)val_int(); }
+  void set(longlong packed);
+};
+
+
 /* decimal (fixed point) constant */
 class Item_decimal :public Item_num
 {
@@ -2479,14 +2588,16 @@ private:
 
 
 longlong 
-longlong_from_string_with_check (CHARSET_INFO *cs, const char *cptr, char *end);
+longlong_from_string_with_check(CHARSET_INFO *cs, const char *cptr,
+                                const char *end);
 double 
-double_from_string_with_check (CHARSET_INFO *cs, const char *cptr, char *end);
+double_from_string_with_check(CHARSET_INFO *cs, const char *cptr,
+                              const char *end);
 
 class Item_static_string_func :public Item_string
 {
   const char *func_name;
- public:
+public:
   Item_static_string_func(const char *name_par, const char *str, uint length,
                           CHARSET_INFO *cs,
                           Derivation dv= DERIVATION_COERCIBLE)
@@ -2526,10 +2637,11 @@ class Item_return_date_time :public Item_partition_func_safe_string
 {
   enum_field_types date_time_field_type;
 public:
-  Item_return_date_time(const char *name_arg, enum_field_types field_type_arg)
-    :Item_partition_func_safe_string(name_arg, 0, &my_charset_bin),
+  Item_return_date_time(const char *name_arg, uint length_arg,
+                        enum_field_types field_type_arg)
+    :Item_partition_func_safe_string(name_arg, length_arg, &my_charset_bin),
      date_time_field_type(field_type_arg)
-  { }
+  { decimals= 0; }
   enum_field_types field_type() const { return date_time_field_type; }
 };
 
@@ -2660,11 +2772,12 @@ public:
   enum Ref_Type { REF, DIRECT_REF, VIEW_REF, OUTER_REF, AGGREGATE_REF };
   Field *result_field;			 /* Save result here */
   Item **ref;
+  bool reference_trough_name;
   Item_ref(Name_resolution_context *context_arg,
            const char *db_arg, const char *table_name_arg,
            const char *field_name_arg)
     :Item_ident(context_arg, db_arg, table_name_arg, field_name_arg),
-     result_field(0), ref(0) {}
+    result_field(0), ref(0), reference_trough_name(1) {}
   /*
     This constructor is used in two scenarios:
     A) *item = NULL
@@ -2689,6 +2802,8 @@ public:
   Item_ref(THD *thd, Item_ref *item)
     :Item_ident(thd, item), result_field(item->result_field), ref(item->ref) {}
   enum Type type() const		{ return REF_ITEM; }
+  enum Type real_type() const           { return ref ? (*ref)->type() :
+                                          REF_ITEM; }
   bool eq(const Item *item, bool binary_cmp) const
   { 
     Item *it= ((Item *) item)->real_item();
@@ -2720,20 +2835,16 @@ public:
   Field *get_tmp_table_field()
   { return result_field ? result_field : (*ref)->get_tmp_table_field(); }
   Item *get_tmp_table_item(THD *thd);
-  table_map used_tables() const		
-  {
-    return depended_from ? OUTER_REF_TABLE_BIT : (*ref)->used_tables(); 
-  }
-  void update_used_tables() 
-  { 
-    if (!depended_from) 
-      (*ref)->update_used_tables(); 
-  }
+  table_map used_tables() const;		
+  void update_used_tables(); 
   bool const_item() const 
   {
     return (*ref)->const_item();
   }
-  table_map not_null_tables() const { return (*ref)->not_null_tables(); }
+  table_map not_null_tables() const 
+  { 
+    return depended_from ? 0 : (*ref)->not_null_tables();
+  }
   void set_result_field(Field *field)	{ result_field= field; }
   bool is_result_field() { return 1; }
   void save_in_result_field(bool no_conversions)
@@ -2752,6 +2863,9 @@ public:
     else
       return FALSE;
   }
+  Item* transform(Item_transformer, uchar *arg);
+  Item* compile(Item_analyzer analyzer, uchar **arg_p,
+                Item_transformer transformer, uchar *arg_t);
   bool enumerate_field_refs_processor(uchar *arg)
   { return (*ref)->enumerate_field_refs_processor(arg); }
   void no_rows_in_result()
@@ -2763,10 +2877,6 @@ public:
     (*ref)->restore_to_before_no_rows_in_result();
   }
   virtual void print(String *str, enum_query_type query_type);
-  bool result_as_longlong()
-  {
-    return (*ref)->result_as_longlong();
-  }
   void cleanup();
   Item_field *filed_for_view_update()
     { return (*ref)->filed_for_view_update(); }
@@ -2803,12 +2913,7 @@ public:
   {
     return trace_unsupported_by_check_vcol_func_processor("ref");
   }
-  bool get_time(MYSQL_TIME *ltime)
-  {
-    DBUG_ASSERT(fixed);
-    return (*ref)->get_time(ltime);
-  }
-  virtual bool basic_const_item() const { return ref && (*ref)->basic_const_item(); }
+  bool basic_const_item() const { return ref && (*ref)->basic_const_item(); }
   bool is_outer_field() const
   {
     DBUG_ASSERT(fixed);
@@ -2852,6 +2957,40 @@ public:
   virtual Ref_Type ref_type() { return DIRECT_REF; }
 };
 
+
+/**
+  This class is the same as Item_direct_ref but created to wrap Item_ident
+  before fix_fields() call
+*/
+
+class Item_direct_ref_to_ident :public Item_direct_ref
+{
+  Item_ident *ident;
+public:
+  Item_direct_ref_to_ident(Item_ident *item)
+    :Item_direct_ref(item->context, (Item**)&item, item->table_name, item->field_name,
+                     FALSE)
+  {
+    ident= item;
+    ref= (Item**)&ident;
+  }
+
+  bool fix_fields(THD *thd, Item **it)
+  {
+    DBUG_ASSERT(ident->type() == FIELD_ITEM || ident->type() == REF_ITEM);
+    if ((!ident->fixed && ident->fix_fields(thd, ref)) ||
+        ident->check_cols(1))
+      return TRUE;
+    set_properties();
+    return FALSE;
+  }
+
+  virtual void print(String *str, enum_query_type query_type)
+  { ident->print(str, query_type); }
+
+};
+
+
 class Item_cache;
 class Expression_cache;
 
@@ -2874,8 +3013,11 @@ private:
   */
   Item_cache *expr_value;
 
+  List<Item> parameters;
+
   Item *check_cache();
-  inline void cache();
+  void cache();
+  void init_on_demand();
 
 public:
   Item_cache_wrapper(Item *item_arg);
@@ -2883,9 +3025,9 @@ public:
 
   const char *func_name() const { return "<expr_cache>"; }
   enum Type type() const { return EXPR_CACHE_ITEM; }
-  virtual Item *get_cached_item() { return orig_item; }
+  enum Type real_type() const { return orig_item->type(); }
 
-  bool set_cache(THD *thd, List<Item*> &depends_on);
+  bool set_cache(THD *thd);
 
   bool fix_fields(THD *thd, Item **it);
   void fix_length_and_dec() {}
@@ -2900,7 +3042,6 @@ public:
   bool val_bool();
   bool is_null();
   bool get_date(MYSQL_TIME *ltime, uint fuzzydate);
-  bool get_time(MYSQL_TIME *ltime);
   bool send(Protocol *protocol, String *buffer);
   void save_org_in_field(Field *field)
   {
@@ -2940,7 +3081,6 @@ public:
   }
   bool enumerate_field_refs_processor(uchar *arg)
   { return orig_item->enumerate_field_refs_processor(arg); }
-  bool result_as_longlong() { return orig_item->result_as_longlong(); }
   Item_field *filed_for_view_update()
   { return orig_item->filed_for_view_update(); }
 
@@ -2964,6 +3104,9 @@ public:
     if (result_type() == ROW_RESULT)
       orig_item->bring_value();
   }
+  virtual bool is_expensive() { return orig_item->is_expensive(); }
+  bool is_expensive_processor(uchar *arg)
+  { return orig_item->is_expensive_processor(arg); }
   bool check_vcol_func_processor(uchar *arg)
   {
     return trace_unsupported_by_check_vcol_func_processor("cache");
@@ -2977,17 +3120,21 @@ public:
 */
 class Item_direct_view_ref :public Item_direct_ref
 {
+  Item_equal *item_equal;
+  TABLE_LIST *view;
 public:
   Item_direct_view_ref(Name_resolution_context *context_arg, Item **item,
-                  const char *table_name_arg,
-                  const char *field_name_arg)
-    :Item_direct_ref(context_arg, item, table_name_arg, field_name_arg) {}
+                       const char *table_name_arg,
+                       const char *field_name_arg,
+                       TABLE_LIST *view_arg)
+    :Item_direct_ref(context_arg, item, table_name_arg, field_name_arg),
+    item_equal(0), view(view_arg) {}
   /* Constructor need to process subselect with temporary tables (see Item) */
   Item_direct_view_ref(THD *thd, Item_direct_ref *item)
-    :Item_direct_ref(thd, item) {}
+    :Item_direct_ref(thd, item), item_equal(0) {}
   Item_direct_view_ref(TABLE_LIST *view_arg, Item **item,
                        const char *field_name_arg)
-    :Item_direct_ref(view_arg, item, field_name_arg)
+    :Item_direct_ref(view_arg, item, field_name_arg), item_equal(0)
   {}
 
   bool fix_fields(THD *, Item **);
@@ -2999,6 +3146,25 @@ public:
     return item;
   }
   virtual Ref_Type ref_type() { return VIEW_REF; }
+  Item_equal *get_item_equal() { return item_equal; }
+  void set_item_equal(Item_equal *item_eq) { item_equal= item_eq; }
+  Item_equal *find_item_equal(COND_EQUAL *cond_equal);
+  bool subst_argument_checker(uchar **arg);
+  Item *equal_fields_propagator(uchar *arg);
+  Item *replace_equal_field(uchar *arg);
+  table_map used_tables() const;	
+  bool walk(Item_processor processor, bool walk_subquery, uchar *arg)
+  { 
+    return (*ref)->walk(processor, walk_subquery, arg) ||
+           (this->*processor)(arg);
+  }
+   bool view_used_tables_processor(uchar *arg) 
+  {
+    TABLE_LIST *view_arg= (TABLE_LIST *) arg;
+    if (view_arg == view)
+      view_arg->view_used_tables|= (*ref)->used_tables();
+    return 0;
+  }
 };
 
 
@@ -3089,15 +3255,7 @@ public:
   bool val_bool();
   bool get_date(MYSQL_TIME *ltime, uint fuzzydate);
   virtual void print(String *str, enum_query_type query_type);
-  /*
-    we add RAND_TABLE_BIT to prevent moving this item from HAVING to WHERE
-  */
-  table_map used_tables() const
-  {
-    return (depended_from ?
-            OUTER_REF_TABLE_BIT :
-            (*ref)->used_tables() | RAND_TABLE_BIT);
-  }
+  table_map used_tables() const;
 };
 
 /*
@@ -3358,6 +3516,17 @@ public:
      value.
 */
 
+/*
+  Cached_item_XXX objects are not exactly caches. They do the following:
+
+  Each Cached_item_XXX object has
+   - its source item
+   - saved value of the source item
+   - cmp() method that compares the saved value with the current value of the
+     source item, and if they were not equal saves item's value into the saved
+     value.
+*/
+
 class Cached_item :public Sql_alloc
 {
 public:
@@ -3476,7 +3645,8 @@ public:
   {
     return Item_field::save_in_field(field_arg, no_conversions);
   }
-  /* 
+  enum Type type() const { return INSERT_VALUE_ITEM; }
+  /*
    We use RAND_TABLE_BIT to prevent Item_insert_value from
    being treated as a constant and precalculated before execution
   */
@@ -3582,6 +3752,13 @@ private:
 };
 
 
+/**
+  @todo
+  Implement the is_null() method for this class. Currently calling is_null()
+  on any Item_cache object resolves to Item::is_null(), which returns FALSE
+  for any value.
+*/
+
 class Item_cache: public Item_basic_constant
 {
 protected:
@@ -3665,24 +3842,12 @@ public:
     return (value_cached || cache_value()) && !null_value;
   }
 
-  /** 
-    If this item caches a field value, return pointer to underlying field.
-
-    @return Pointer to field, or NULL if this is not a cache for a field value.
-  */
-  Field* field() { return cached_field; }
-
   virtual void store(Item *item);
   virtual bool cache_value()= 0;
   bool basic_const_item() const
   { return test(example && example->basic_const_item());}
   virtual void clear() { null_value= TRUE; value_cached= FALSE; }
-  Item_result result_type() const
-  {
-    if (!example)
-      return INT_RESULT;
-    return Field::result_merge_type(example->field_type());
-  }
+  bool is_null() { return null_value; }
 };
 
 
@@ -3691,20 +3856,41 @@ class Item_cache_int: public Item_cache
 protected:
   longlong value;
 public:
-  Item_cache_int(): Item_cache(),
+  Item_cache_int(): Item_cache(MYSQL_TYPE_LONGLONG),
     value(0) {}
   Item_cache_int(enum_field_types field_type_arg):
     Item_cache(field_type_arg), value(0) {}
 
-  virtual void store(Item *item){ Item_cache::store(item); }
-  void store_longlong(Item *item, longlong val_arg);
   double val_real();
   longlong val_int();
   String* val_str(String *str);
   my_decimal *val_decimal(my_decimal *);
   enum Item_result result_type() const { return INT_RESULT; }
-  bool result_as_longlong() { return TRUE; }
   bool cache_value();
+  int save_in_field(Field *field, bool no_conversions);
+};
+
+
+class Item_cache_temporal: public Item_cache_int
+{
+public:
+  Item_cache_temporal(enum_field_types field_type_arg);
+  String* val_str(String *str);
+  bool cache_value();
+  bool get_date(MYSQL_TIME *ltime, uint fuzzydate);
+  int save_in_field(Field *field, bool no_conversions);
+  void store_packed(longlong val_arg);
+  /*
+    Having a clone_item method tells optimizer that this object
+    is a constant and need not be optimized further.
+    Important when storing packed datetime values.
+  */
+  Item *clone_item()
+  {
+    Item_cache_temporal *item= new Item_cache_temporal(cached_field_type);
+    item->store_packed(value);
+    return item;
+  }
 };
 
 
@@ -3712,7 +3898,7 @@ class Item_cache_real: public Item_cache
 {
   double value;
 public:
-  Item_cache_real(): Item_cache(),
+  Item_cache_real(): Item_cache(MYSQL_TYPE_DOUBLE),
     value(0) {}
 
   double val_real();
@@ -3729,7 +3915,7 @@ class Item_cache_decimal: public Item_cache
 protected:
   my_decimal decimal_value;
 public:
-  Item_cache_decimal(): Item_cache() {}
+  Item_cache_decimal(): Item_cache(MYSQL_TYPE_NEWDECIMAL) {}
 
   double val_real();
   longlong val_int();
@@ -3835,40 +4021,6 @@ public:
 };
 
 
-class Item_cache_datetime: public Item_cache
-{
-protected:
-  String str_value;
-  longlong int_value;
-  bool str_value_cached;
-public:
-  Item_cache_datetime(enum_field_types field_type_arg):
-    Item_cache(field_type_arg), int_value(0), str_value_cached(0)
-  {
-    cmp_context= STRING_RESULT;
-  }
-
-  void store(Item *item, longlong val_arg);
-  void store(Item *item);
-  double val_real();
-  longlong val_int();
-  String* val_str(String *str);
-  my_decimal *val_decimal(my_decimal *);
-  enum Item_result result_type() const { return STRING_RESULT; }
-  bool result_as_longlong() { return TRUE; }
-  /*
-    In order to avoid INT <-> STRING conversion of a DATETIME value
-    two cache_value functions are introduced. One (cache_value) caches STRING
-    value, another (cache_value_int) - INT value. Thus this cache item
-    completely relies on the ability of the underlying item to do the
-    correct conversion.
-  */
-  bool cache_value_int();
-  bool cache_value();
-  void clear() { Item_cache::clear(); str_value_cached= FALSE; }
-};
-
-
 /*
   Item_type_holder used to store type. name, length of Item for UNIONS &
   derived tables.
@@ -3969,6 +4121,22 @@ public:
 
 
 /**
+  Item iterator over List_iterator for Items
+*/
+
+class Item_iterator_list: public Item_iterator
+{
+  List_iterator<Item> list;
+public:
+  Item_iterator_list(List_iterator<Item> &arg_list):
+    list(arg_list) {}
+  void open() { list.rewind(); }
+  Item *next() { return (list++); }
+  void close() {}
+};
+
+
+/**
   Item iterator over Item interface for rows
 */
 
diff --git a/sql/item_buff.cc b/sql/item_buff.cc
index b0dbadcfda2..8c4224404d0 100644
--- a/sql/item_buff.cc
+++ b/sql/item_buff.cc
@@ -128,14 +128,20 @@ bool Cached_item_int::cmp(void)
 
 bool Cached_item_field::cmp(void)
 {
-  bool tmp= field->cmp(buff) != 0;		// This is not a blob!
-  if (tmp)
-    field->get_image(buff,length,field->charset());
+  bool tmp= FALSE;                              // Value is identical
+  /* Note that field can't be a blob here ! */
   if (null_value != field->is_null())
   {
     null_value= !null_value;
-    tmp=TRUE;
+    tmp= TRUE;                                  // Value has changed
   }
+
+  /*
+    If value is not null and value changed (from null to not null or
+    because of value change), then copy the new value to buffer.
+    */
+  if (! null_value && (tmp || (tmp= (field->cmp(buff) != 0))))
+    field->get_image(buff,length,field->charset());
   return tmp;
 }
 
diff --git a/sql/item_cmpfunc.cc b/sql/item_cmpfunc.cc
index b421dddf815..3e7cc24dfd8 100644
--- a/sql/item_cmpfunc.cc
+++ b/sql/item_cmpfunc.cc
@@ -1,4 +1,5 @@
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2011, Oracle and/or its affiliates.
+   Copyright (c) 2009-2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -30,11 +31,9 @@
 #include "sql_select.h"
 #include "sql_parse.h"                          // check_stack_overrun
 #include "sql_time.h"                  // make_truncated_value_warning
+#include "sql_base.h"                  // dynamic_column_error_message
 
-static bool convert_constant_item(THD *, Item_field *, Item **);
-static longlong
-get_year_value(THD *thd, Item ***item_arg, Item **cache_arg,
-               Item *warn_item, bool *is_null);
+static bool convert_const_to_int(THD *, Item_field *, Item **);
 
 static Item_result item_store_type(Item_result a, Item *item,
                                    my_bool unsigned_flag)
@@ -78,6 +77,30 @@ static void agg_result_type(Item_result *type, Item **items, uint nitems)
 }
 
 
+/**
+  find a temporal type (item) that others will be converted to
+  for the purpose of comparison.
+
+  this is the type that will be used in warnings like
+  "Incorrect <<TYPE>> value".
+*/
+Item *find_date_time_item(Item **args, uint nargs, uint col)
+{
+  Item *date_arg= 0, **arg, **arg_end;
+  for (arg= args, arg_end= args + nargs; arg != arg_end ; arg++)
+  {
+    Item *item= arg[0]->element_index(col);
+    if (item->cmp_type() != TIME_RESULT)
+      continue;
+    if (item->field_type() == MYSQL_TYPE_DATETIME)
+      return item;
+    if (!date_arg)
+      date_arg= item;
+  }
+  return date_arg;
+}
+
+
 /*
   Compare row signature of two expressions
 
@@ -140,10 +163,10 @@ static int cmp_row_type(Item* item1, Item* item2)
 static int agg_cmp_type(Item_result *type, Item **items, uint nitems)
 {
   uint i;
-  type[0]= items[0]->result_type();
+  type[0]= items[0]->cmp_type();
   for (i= 1 ; i < nitems ; i++)
   {
-    type[0]= item_cmp_type(type[0], items[i]->result_type());
+    type[0]= item_cmp_type(type[0], items[i]->cmp_type());
     /*
       When aggregating types of two row expressions we have to check
       that they have the same cardinality and that each component
@@ -209,7 +232,7 @@ static uint collect_cmp_types(Item **items, uint nitems, bool skip_nulls= FALSE)
 {
   uint i;
   uint found_types;
-  Item_result left_result= items[0]->result_type();
+  Item_result left_result= items[0]->cmp_type();
   DBUG_ASSERT(nitems > 1);
   found_types= 0;
   for (i= 1; i < nitems ; i++)
@@ -217,11 +240,11 @@ static uint collect_cmp_types(Item **items, uint nitems, bool skip_nulls= FALSE)
     if (skip_nulls && items[i]->type() == Item::NULL_ITEM)
       continue; // Skip NULL constant items
     if ((left_result == ROW_RESULT || 
-         items[i]->result_type() == ROW_RESULT) &&
+         items[i]->cmp_type() == ROW_RESULT) &&
         cmp_row_type(items[0], items[i]))
       return 0;
     found_types|= 1<< (uint)item_cmp_type(left_result,
-                                           items[i]->result_type());
+                                           items[i]->cmp_type());
   }
   /*
    Even if all right-hand items are NULLs and we are skipping them all, we need
@@ -247,36 +270,61 @@ Item_bool_func2* Eq_creator::create(Item *a, Item *b) const
   return new Item_func_eq(a, b);
 }
 
+Item_bool_func2* Eq_creator::create_swap(Item *a, Item *b) const
+{
+  return new Item_func_eq(b, a);
+}
 
 Item_bool_func2* Ne_creator::create(Item *a, Item *b) const
 {
   return new Item_func_ne(a, b);
 }
 
+Item_bool_func2* Ne_creator::create_swap(Item *a, Item *b) const
+{
+  return new Item_func_ne(b, a);
+}
 
 Item_bool_func2* Gt_creator::create(Item *a, Item *b) const
 {
   return new Item_func_gt(a, b);
 }
 
+Item_bool_func2* Gt_creator::create_swap(Item *a, Item *b) const
+{
+  return new Item_func_lt(b, a);
+}
 
 Item_bool_func2* Lt_creator::create(Item *a, Item *b) const
 {
   return new Item_func_lt(a, b);
 }
 
+Item_bool_func2* Lt_creator::create_swap(Item *a, Item *b) const
+{
+  return new Item_func_gt(b, a);
+}
 
 Item_bool_func2* Ge_creator::create(Item *a, Item *b) const
 {
   return new Item_func_ge(a, b);
 }
 
+Item_bool_func2* Ge_creator::create_swap(Item *a, Item *b) const
+{
+  return new Item_func_le(b, a);
+}
 
 Item_bool_func2* Le_creator::create(Item *a, Item *b) const
 {
   return new Item_func_le(a, b);
 }
 
+Item_bool_func2* Le_creator::create_swap(Item *a, Item *b) const
+{
+  return new Item_func_ge(b, a);
+}
+
 /*
   Test functions
   Most of these  returns 0LL if false and 1LL if true and
@@ -397,13 +445,25 @@ longlong Item_func_nop_all::val_int()
     1  Item was replaced with an integer version of the item
 */
 
-static bool convert_constant_item(THD *thd, Item_field *field_item,
+static bool convert_const_to_int(THD *thd, Item_field *field_item,
                                   Item **item)
 {
   Field *field= field_item->field;
   int result= 0;
 
-  if ((*item)->const_item())
+  /*
+    We don't need to convert an integer to an integer,
+    pretend it's already converted.
+
+    But we still convert it if it is compared with a Field_year,
+    as YEAR(2) may change the value of an integer when converting it
+    to an integer (say, 0 to 70).
+  */
+  if ((*item)->cmp_type() == INT_RESULT &&
+      field_item->field_type() != MYSQL_TYPE_YEAR)
+    return 1;
+
+  if ((*item)->const_item() && !(*item)->is_expensive())
   {
     TABLE *table= field->table;
     ulonglong orig_sql_mode= thd->variables.sql_mode;
@@ -414,7 +474,8 @@ static bool convert_constant_item(THD *thd, Item_field *field_item,
     LINT_INIT(old_maps[0]);
     LINT_INIT(old_maps[1]);
 
-    if (table)
+    /* table->read_set may not be set if we come here from a CREATE TABLE */
+    if (table && table->read_set)
       dbug_tmp_use_all_columns(table, old_maps, 
                                table->read_set, table->write_set);
     /* For comparison purposes allow invalid dates like 2000-01-32 */
@@ -423,17 +484,15 @@ static bool convert_constant_item(THD *thd, Item_field *field_item,
     thd->count_cuted_fields= CHECK_FIELD_IGNORE;
 
     /*
-      Store the value of the field/constant if it references an outer field
-      because the call to save_in_field below overrides that value.
-      Don't save field value if no data has been read yet.
-      Outer constant values are always saved.
+      Store the value of the field/constant because the call to save_in_field
+      below overrides that value. Don't save field value if no data has been
+      read yet.
     */
-    bool save_field_value= (field_item->depended_from &&
-                            (field_item->const_item() ||
-                             !(field->table->status & STATUS_NO_RECORD)));
+    bool save_field_value= (field_item->const_item() ||
+                            !(field->table->status & STATUS_NO_RECORD));
     if (save_field_value)
       orig_field_val= field->val_int();
-    if (!(*item)->is_null() && !(*item)->save_in_field(field, 1))
+    if (!(*item)->save_in_field(field, 1) && !field->is_null())
     {
       Item *tmp= new Item_int_with_ref(field->val_int(), *item,
                                        test(field->flags & UNSIGNED_FLAG));
@@ -450,7 +509,7 @@ static bool convert_constant_item(THD *thd, Item_field *field_item,
     }
     thd->variables.sql_mode= orig_sql_mode;
     thd->count_cuted_fields= orig_count_cuted_fields;
-    if (table)
+    if (table && table->read_set)
       dbug_tmp_restore_column_maps(table->read_set, table->write_set, old_maps);
   }
   return result;
@@ -481,7 +540,6 @@ void Item_bool_func2::fix_length_and_dec()
     to the collation of A.
   */
 
-  
   DTCollation coll;
   if (args[0]->result_type() == STRING_RESULT &&
       args[1]->result_type() == STRING_RESULT &&
@@ -490,48 +548,27 @@ void Item_bool_func2::fix_length_and_dec()
     
   args[0]->cmp_context= args[1]->cmp_context=
     item_cmp_type(args[0]->result_type(), args[1]->result_type());
-  // Make a special case of compare with fields to get nicer DATE comparisons
 
-  if (functype() == LIKE_FUNC)  // Disable conversion in case of LIKE function.
-  {
-    set_cmp_func();
-    return;
-  }
+  /*
+    Make a special case of compare with fields to get nicer comparisons
+    of numbers with constant string.
+    This directly contradicts the manual (number and a string should
+    be compared as doubles), but seems to provide more
+    "intuitive" behavior in some cases (but less intuitive in others).
 
+    But disable conversion in case of LIKE function.
+  */
   thd= current_thd;
-  if (!thd->lex->is_ps_or_view_context_analysis())
+  if (functype() != LIKE_FUNC && !thd->lex->is_ps_or_view_context_analysis())
   {
-    if (args[0]->real_item()->type() == FIELD_ITEM)
+    int field;
+    if (args[field= 0]->real_item()->type() == FIELD_ITEM ||
+        args[field= 1]->real_item()->type() == FIELD_ITEM)
     {
-      Item_field *field_item= (Item_field*) (args[0]->real_item());
-      if (field_item->field->can_be_compared_as_longlong() &&
-          !(field_item->is_datetime() &&
-            args[1]->result_type() == STRING_RESULT))
-      {
-        if (convert_constant_item(thd, field_item, &args[1]))
-        {
-          cmp.set_cmp_func(this, tmp_arg, tmp_arg+1,
-                           INT_RESULT);		// Works for all types.
-          args[0]->cmp_context= args[1]->cmp_context= INT_RESULT;
-          return;
-        }
-      }
-    }
-    if (args[1]->real_item()->type() == FIELD_ITEM)
-    {
-      Item_field *field_item= (Item_field*) (args[1]->real_item());
-      if (field_item->field->can_be_compared_as_longlong() &&
-          !(field_item->is_datetime() &&
-            args[0]->result_type() == STRING_RESULT))
-      {
-        if (convert_constant_item(thd, field_item, &args[0]))
-        {
-          cmp.set_cmp_func(this, tmp_arg, tmp_arg+1,
-                           INT_RESULT); // Works for all types.
-          args[0]->cmp_context= args[1]->cmp_context= INT_RESULT;
-          return;
-        }
-      }
+      Item_field *field_item= (Item_field*) (args[field]->real_item());
+      if (field_item->cmp_type() == INT_RESULT &&
+          convert_const_to_int(thd, field_item, &args[!field]))
+        args[0]->cmp_context= args[1]->cmp_context= INT_RESULT;
     }
   }
   set_cmp_func();
@@ -545,6 +582,9 @@ int Arg_comparator::set_compare_func(Item_result_field *item, Item_result type)
                          [is_owner_equal_func()];
 
   switch (type) {
+  case TIME_RESULT:
+    cmp_collation.collation= &my_charset_numeric;
+    break;
   case ROW_RESULT:
   {
     uint n= (*a)->cols();
@@ -638,8 +678,9 @@ int Arg_comparator::set_compare_func(Item_result_field *item, Item_result type)
     }
     break;
   }
-  default:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);
+    break;
   }
   return 0;
 }
@@ -653,12 +694,14 @@ int Arg_comparator::set_compare_func(Item_result_field *item, Item_result type)
   @param[in]   warn_name  Field name for issuing the warning
   @param[out]  l_time     The MYSQL_TIME objects is initialized.
 
-  Parses a date provided in the string str into a MYSQL_TIME object. If the
-  string contains an incorrect date or doesn't correspond to a date at all
-  then a warning is issued. The warn_type and the warn_name arguments are used
-  as the name and the type of the field when issuing the warning. If any input
-  was discarded (trailing or non-timestamp-y characters), return value will be
-  TRUE.
+  Parses a date provided in the string str into a MYSQL_TIME object.
+  The date is used for comparison, that is fuzzy dates are allowed
+  independently of sql_mode.
+  If the string contains an incorrect date or doesn't correspond to a date at
+  all then a warning is issued. The warn_type and the warn_name arguments are
+  used as the name and the type of the field when issuing the warning. If any
+  input was discarded (trailing or non-timestamp-y characters), return value
+  will be TRUE.
 
   @return Status flag
   @retval FALSE Success.
@@ -671,16 +714,17 @@ bool get_mysql_time_from_str(THD *thd, String *str, timestamp_type warn_type,
   bool value;
   int error;
   enum_mysql_timestamp_type timestamp_type;
+  int flags= TIME_FUZZY_DATE | MODE_INVALID_DATES;
+  ErrConvString err(str);
+
+  if (warn_type == MYSQL_TIMESTAMP_TIME)
+    flags|= TIME_TIME_ONLY;
 
   timestamp_type= 
-    str_to_datetime(str->ptr(), str->length(), l_time,
-                    (TIME_FUZZY_DATE | MODE_INVALID_DATES |
-                     (thd->variables.sql_mode &
-                      (MODE_NO_ZERO_IN_DATE | MODE_NO_ZERO_DATE))),
-                    &error);
-
-  if (timestamp_type == MYSQL_TIMESTAMP_DATETIME || 
-      timestamp_type == MYSQL_TIMESTAMP_DATE)
+    str_to_datetime(str->charset(), str->ptr(), str->length(),
+                    l_time, flags, &error);
+
+  if (timestamp_type > MYSQL_TIMESTAMP_ERROR)
     /*
       Do not return yet, we may still want to throw a "trailing garbage"
       warning.
@@ -694,278 +738,38 @@ bool get_mysql_time_from_str(THD *thd, String *str, timestamp_type warn_type,
 
   if (error > 0)
     make_truncated_value_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-                                 str->ptr(), str->length(),
-                                 warn_type, warn_name);
+                                 &err, warn_type, warn_name);
 
   return value;
 }
 
 
 /**
-  @brief Convert date provided in a string to the int representation.
-
-  @param[in]   thd        thread handle
-  @param[in]   str        a string to convert
-  @param[in]   warn_type  type of the timestamp for issuing the warning
-  @param[in]   warn_name  field name for issuing the warning
-  @param[out]  error_arg  could not extract a DATE or DATETIME
+  Prepare the comparator (set the comparison function) for comparing
+  items *a1 and *a2 in the context of 'type'.
 
-  @details Convert date provided in the string str to the int
-    representation.  If the string contains wrong date or doesn't
-    contain it at all then a warning is issued.  The warn_type and
-    the warn_name arguments are used as the name and the type of the
-    field when issuing the warning.
+  @param[in]      owner_arg  Item, performing the comparison (e.g. Item_func_eq)
+  @param[in,out]  a1         first argument to compare
+  @param[in,out]  a2         second argument to compare
+  @param[in]      type       type context to compare in
 
-  @return
-    converted value. 0 on error and on zero-dates -- check 'failure'
+  Both *a1 and *a2 can be replaced by this method - typically by constant
+  items, holding the cached converted value of the original (constant) item.
 */
-static ulonglong get_date_from_str(THD *thd, String *str, 
-                                   timestamp_type warn_type, 
-                                   const char *warn_name, bool *error_arg)
-{
-  MYSQL_TIME l_time;
-  *error_arg= get_mysql_time_from_str(thd, str, warn_type, warn_name, &l_time);
-
-  if (*error_arg)
-    return 0;
-  return TIME_to_ulonglong_datetime(&l_time);
-}
-
-
-/*
-  Check whether compare_datetime() can be used to compare items.
-
-  SYNOPSIS
-    Arg_comparator::can_compare_as_dates()
-    a, b          [in]  items to be compared
-    const_value   [out] converted value of the string constant, if any
-
-  DESCRIPTION
-    Check several cases when the DATE/DATETIME comparator should be used.
-    The following cases are checked:
-      1. Both a and b is a DATE/DATETIME field/function returning string or
-         int result.
-      2. Only a or b is a DATE/DATETIME field/function returning string or
-         int result and the other item (b or a) is an item with string result.
-         If the second item is a constant one then it's checked to be
-         convertible to the DATE/DATETIME type. If the constant can't be
-         converted to a DATE/DATETIME then the compare_datetime() comparator
-         isn't used and the warning about wrong DATE/DATETIME value is issued.
-      In all other cases (date-[int|real|decimal]/[int|real|decimal]-date)
-      the comparison is handled by other comparators.
-    If the datetime comparator can be used and one the operands of the
-    comparison is a string constant that was successfully converted to a
-    DATE/DATETIME type then the result of the conversion is returned in the
-    const_value if it is provided.  If there is no constant or
-    compare_datetime() isn't applicable then the *const_value remains
-    unchanged.
-
-  RETURN
-    the found type of date comparison
-*/
-
-enum Arg_comparator::enum_date_cmp_type
-Arg_comparator::can_compare_as_dates(Item *a, Item *b, ulonglong *const_value)
-{
-  enum enum_date_cmp_type cmp_type= CMP_DATE_DFLT;
-  Item *str_arg= 0, *date_arg= 0;
-
-  if (a->type() == Item::ROW_ITEM || b->type() == Item::ROW_ITEM)
-    return CMP_DATE_DFLT;
-
-  if (a->is_datetime())
-  {
-    if (b->is_datetime())
-      cmp_type= CMP_DATE_WITH_DATE;
-    else if (b->result_type() == STRING_RESULT)
-    {
-      cmp_type= CMP_DATE_WITH_STR;
-      date_arg= a;
-      str_arg= b;
-    }
-  }
-  else if (b->is_datetime() && a->result_type() == STRING_RESULT)
-  {
-    cmp_type= CMP_STR_WITH_DATE;
-    date_arg= b;
-    str_arg= a;
-  }
-
-  if (cmp_type != CMP_DATE_DFLT)
-  {
-    THD *thd= current_thd;
-    /*
-      Do not cache GET_USER_VAR() function as its const_item() may return TRUE
-      for the current thread but it still may change during the execution.
-      Don't use cache while in the context analysis mode only (i.e. for 
-      EXPLAIN/CREATE VIEW and similar queries). Cache is useless in such 
-      cases and can cause problems. For example evaluating subqueries can 
-      confuse storage engines since in context analysis mode tables 
-      aren't locked.
-    */
-    if (!thd->lex->is_ps_or_view_context_analysis() &&
-        cmp_type != CMP_DATE_WITH_DATE && str_arg->const_item() &&
-        (str_arg->type() != Item::FUNC_ITEM ||
-        ((Item_func*)str_arg)->functype() != Item_func::GUSERVAR_FUNC))
-    {
-      ulonglong value;
-      bool error;
-      String tmp, *str_val= 0;
-      timestamp_type t_type= (date_arg->field_type() == MYSQL_TYPE_DATE ?
-                              MYSQL_TIMESTAMP_DATE : MYSQL_TIMESTAMP_DATETIME);
-
-      str_val= str_arg->val_str(&tmp);
-      if (str_arg->null_value)
-        return CMP_DATE_DFLT;
-      value= get_date_from_str(thd, str_val, t_type, date_arg->name, &error);
-      if (error)
-        return CMP_DATE_DFLT;
-      if (const_value)
-        *const_value= value;
-    }
-  }
-  return cmp_type;
-}
-
-/*
-  Retrieves correct TIME value from the given item.
-
-  SYNOPSIS
-    get_time_value()
-    thd                 thread handle
-    item_arg   [in/out] item to retrieve TIME value from
-    cache_arg  [in/out] pointer to place to store the cache item to
-    warn_item  [in]     unused
-    is_null    [out]    TRUE <=> the item_arg is null
-
-  DESCRIPTION
-    Retrieves the correct TIME value from given item for comparison by the
-    compare_datetime() function.
-    If item's result can be compared as longlong then its int value is used
-    and a value returned by get_time function is used otherwise.
-    If an item is a constant one then its value is cached and it isn't
-    get parsed again. An Item_cache_int object is used for for cached values.
-    It seamlessly substitutes the original item.  The cache item is marked as
-    non-constant to prevent re-caching it again.
-
-  RETURN
-    obtained value
-*/
-
-longlong
-get_time_value(THD *thd, Item ***item_arg, Item **cache_arg,
-               Item *warn_item, bool *is_null)
-{
-  longlong value;
-  Item *item= **item_arg;
-  MYSQL_TIME ltime;
-
-  if (item->result_as_longlong())
-  {
-    value= item->val_int();
-    *is_null= item->null_value;
-  }
-  else
-  {
-    *is_null= item->get_time(&ltime);
-    value= !*is_null ? (longlong) TIME_to_ulonglong_datetime(&ltime) *
-                                  (ltime.neg ? -1 : 1) : 0;
-  }
-  /*
-    Do not cache GET_USER_VAR() function as its const_item() may return TRUE
-    for the current thread but it still may change during the execution.
-  */
-  if (item->const_item() && cache_arg &&
-      item->type() != Item::CACHE_ITEM &&
-      (item->type() != Item::FUNC_ITEM ||
-       ((Item_func*)item)->functype() != Item_func::GUSERVAR_FUNC))
-  {
-    Query_arena backup;
-    Query_arena *save_arena= thd->switch_to_arena_for_cached_items(&backup);
-    Item_cache_int *cache= new Item_cache_int();
-    if (save_arena)
-      thd->set_query_arena(save_arena);
-
-    /* Mark the cache as non-const to prevent re-caching. */
-    cache->set_used_tables(1);
-    cache->store_longlong(item, value);
-    *cache_arg= cache;
-    *item_arg= cache_arg;
-  }
-  return value;
-}
-
 
 int Arg_comparator::set_cmp_func(Item_result_field *owner_arg,
                                         Item **a1, Item **a2,
                                         Item_result type)
 {
-  ulonglong const_value= (ulonglong)-1;
   thd= current_thd;
   owner= owner_arg;
   set_null= set_null && owner_arg;
   a= a1;
   b= a2;
-  thd= current_thd;
 
-  if (can_compare_as_dates(*a, *b, &const_value))
-  {
-    a_type= (*a)->field_type();
-    b_type= (*b)->field_type();
-    a_cache= 0;
-    b_cache= 0;
-
-    if (const_value != (ulonglong)-1)
-    {
-      /*
-        cache_converted_constant can't be used here because it can't
-        correctly convert a DATETIME value from string to int representation.
-      */
-      Query_arena backup;
-      Query_arena *save_arena= thd->switch_to_arena_for_cached_items(&backup);
-      Item_cache_int *cache= new Item_cache_int(MYSQL_TYPE_DATETIME);
-      if (save_arena)
-        thd->set_query_arena(save_arena);
-
-      /* Mark the cache as non-const to prevent re-caching. */
-      cache->set_used_tables(1);
-      if (!(*a)->is_datetime())
-      {
-        cache->store_longlong((*a), const_value);
-        a_cache= cache;
-        a= (Item **)&a_cache;
-      }
-      else
-      {
-        cache->store_longlong((*b), const_value);
-        b_cache= cache;
-        b= (Item **)&b_cache;
-      }
-    }
-    is_nulls_eq= is_owner_equal_func();
-    func= &Arg_comparator::compare_datetime;
-    get_value_a_func= &get_datetime_value;
-    get_value_b_func= &get_datetime_value;
-    cmp_collation.set(&my_charset_numeric);
-    set_cmp_context_for_datetime();
-    return 0;
-  }
-  else if (type == STRING_RESULT && (*a)->field_type() == MYSQL_TYPE_TIME &&
-           (*b)->field_type() == MYSQL_TYPE_TIME)
-  {
-    /* Compare TIME values as integers. */
-    a_cache= 0;
-    b_cache= 0;
-    is_nulls_eq= is_owner_equal_func();
-    func= &Arg_comparator::compare_datetime;
-    get_value_a_func= &get_time_value;
-    get_value_b_func= &get_time_value;
-    set_cmp_context_for_datetime();
-    return 0;
-  }
-  else if (type == STRING_RESULT &&
-           (*a)->result_type() == STRING_RESULT &&
-           (*b)->result_type() == STRING_RESULT)
+  if (type == STRING_RESULT &&
+      (*a)->result_type() == STRING_RESULT &&
+      (*b)->result_type() == STRING_RESULT)
   {
     DTCollation coll;
     coll.set((*a)->collation.collation);
@@ -973,8 +777,10 @@ int Arg_comparator::set_cmp_func(Item_result_field *owner_arg,
                                b, 1, MY_COLL_CMP_CONV, 1))
       return 1;
   }
-  else if (try_year_cmp_func(type))
-    return 0;
+  if (type == INT_RESULT &&
+      (*a)->field_type() == MYSQL_TYPE_YEAR &&
+      (*b)->field_type() == MYSQL_TYPE_YEAR)
+    type= TIME_RESULT;
 
   a= cache_converted_constant(thd, a, &a_cache, type);
   b= cache_converted_constant(thd, b, &b_cache, type);
@@ -982,46 +788,6 @@ int Arg_comparator::set_cmp_func(Item_result_field *owner_arg,
 }
 
 
-/*
-  Helper function to call from Arg_comparator::set_cmp_func()
-*/
-
-bool Arg_comparator::try_year_cmp_func(Item_result type)
-{
-  if (type == ROW_RESULT)
-    return FALSE;
-
-  bool a_is_year= (*a)->field_type() == MYSQL_TYPE_YEAR;
-  bool b_is_year= (*b)->field_type() == MYSQL_TYPE_YEAR;
-
-  if (!a_is_year && !b_is_year)
-    return FALSE;
-
-  if (a_is_year && b_is_year)
-  {
-    get_value_a_func= &get_year_value;
-    get_value_b_func= &get_year_value;
-  }
-  else if (a_is_year && (*b)->is_datetime())
-  {
-    get_value_a_func= &get_year_value;
-    get_value_b_func= &get_datetime_value;
-  }
-  else if (b_is_year && (*a)->is_datetime())
-  {
-    get_value_b_func= &get_year_value;
-    get_value_a_func= &get_datetime_value;
-  }
-  else
-    return FALSE;
-
-  is_nulls_eq= is_owner_equal_func();
-  func= &Arg_comparator::compare_datetime;
-  set_cmp_context_for_datetime();
-
-  return TRUE;
-}
-
 /**
   Convert and cache a constant.
 
@@ -1043,9 +809,14 @@ Item** Arg_comparator::cache_converted_constant(THD *thd_arg, Item **value,
                                                 Item **cache_item,
                                                 Item_result type)
 {
-  /* Don't need cache if doing context analysis only. */
-  if (!thd->lex->is_ps_or_view_context_analysis() &&
-      (*value)->const_item() && type != (*value)->result_type())
+  /*
+    Don't need cache if doing context analysis only.
+    Also, get_datetime_value creates Item_cache internally.
+    Unless fixed, we should not do it here.
+  */
+  if (!thd_arg->lex->is_ps_or_view_context_analysis() &&
+      (*value)->const_item() && type != (*value)->result_type() &&
+      type != TIME_RESULT)
   {
     Item_cache *cache= Item_cache::get_cache(*value, type);
     cache->setup(*value);
@@ -1063,115 +834,78 @@ void Arg_comparator::set_datetime_cmp_func(Item_result_field *owner_arg,
   owner= owner_arg;
   a= a1;
   b= b1;
-  a_type= (*a)->field_type();
-  b_type= (*b)->field_type();
   a_cache= 0;
   b_cache= 0;
-  is_nulls_eq= FALSE;
-  func= &Arg_comparator::compare_datetime;
-  get_value_a_func= &get_datetime_value;
-  get_value_b_func= &get_datetime_value;
-  set_cmp_context_for_datetime();
+  func= comparator_matrix[TIME_RESULT][is_owner_equal_func()];
 }
 
-
-/*
+/**
   Retrieves correct DATETIME value from given item.
 
-  SYNOPSIS
-    get_datetime_value()
-    thd                 thread handle
-    item_arg   [in/out] item to retrieve DATETIME value from
-    cache_arg  [in/out] pointer to place to store the caching item to
-    warn_item  [in]     item for issuing the conversion warning
-    is_null    [out]    TRUE <=> the item_arg is null
+  @param[in]     thd         thread handle
+  @param[in,out] item_arg    item to retrieve DATETIME value from
+  @param[in,out] cache_arg   pointer to place to store the caching item to
+  @param[in]     warn_item   item for issuing the conversion warning
+  @param[out]    is_null     TRUE <=> the item_arg is null
 
-  DESCRIPTION
+  @details
     Retrieves the correct DATETIME value from given item for comparison by the
     compare_datetime() function.
-    If item's result can be compared as longlong then its int value is used
-    and its string value is used otherwise. Strings are always parsed and
-    converted to int values by the get_date_from_str() function.
-    This allows us to compare correctly string dates with missed insignificant
-    zeros. If an item is a constant one then its value is cached and it isn't
-    get parsed again. An Item_cache_int object is used for caching values. It
-    seamlessly substitutes the original item.  The cache item is marked as
-    non-constant to prevent re-caching it again.  In order to compare
-    correctly DATE and DATETIME items the result of the former are treated as
-    a DATETIME with zero time (00:00:00).
 
-  RETURN
-    obtained value
+    If the value should be compared as time (TIME_RESULT), it's retrieved as
+    MYSQL_TIME. Otherwise it's read as a number/string and converted to time.
+    Constant items are cached, so the conversion is only done once for them.
+
+    Note the f_type behavior: if the item can be compared as time, then
+    f_type is this item's field_type(). Otherwise it's field_type() of
+    warn_item (which is the other operand of the comparison operator).
+    This logic provides correct string/number to date/time conversion
+    depending on the other operand (when comparing a string with a date, it's
+    parsed as a date, when comparing a string with a time it's parsed as a time)
+
+    If the item is a constant it is replaced by an Item_cache_temporal that
+    holds the packed datetime value.
+
+  @return
+    MYSQL_TIME value, packed in a longlong, suitable for comparison.
 */
 
 longlong
 get_datetime_value(THD *thd, Item ***item_arg, Item **cache_arg,
                    Item *warn_item, bool *is_null)
 {
-  longlong value= 0;
-  String buf, *str= 0;
+  longlong UNINIT_VAR(value);
   Item *item= **item_arg;
+  enum_field_types f_type= item->cmp_type() == TIME_RESULT ?
+                           item->field_type() : warn_item->field_type();
 
-  if (item->result_as_longlong())
+  if (item->result_type() == INT_RESULT && item->cmp_type() == TIME_RESULT)
   {
+    /* it's our Item_cache_temporal, as created below */
     value= item->val_int();
-    *is_null= item->null_value;
-    enum_field_types f_type= item->field_type();
-    /*
-      Item_date_add_interval may return MYSQL_TYPE_STRING as the result
-      field type. To detect that the DATE value has been returned we
-      compare it with 100000000L - any DATE value should be less than it.
-      Don't shift cached DATETIME values up for the second time.
-    */
-    if (f_type == MYSQL_TYPE_DATE ||
-        (f_type != MYSQL_TYPE_DATETIME && value < 100000000L))
-      value*= 1000000L;
   }
   else
   {
-    str= item->val_str(&buf);
-    *is_null= item->null_value;
+    MYSQL_TIME ltime;
+    uint fuzzydate= TIME_FUZZY_DATE | TIME_INVALID_DATES;
+    if (f_type == MYSQL_TYPE_TIME)
+      fuzzydate|= TIME_TIME_ONLY;
+    if (item->get_date(&ltime, fuzzydate))
+      value= 0; /* invalid date */
+    else
+      value= pack_time(&ltime);
   }
-  if (*is_null)
+  if ((*is_null= item->null_value))
     return ~(ulonglong) 0;
-  /*
-    Convert strings to the integer DATE/DATETIME representation.
-    Even if both dates provided in strings we can't compare them directly as
-    strings as there is no warranty that they are correct and do not miss
-    some insignificant zeros.
-  */
-  if (str)
-  {
-    bool error;
-    enum_field_types f_type= warn_item->field_type();
-    timestamp_type t_type= f_type ==
-      MYSQL_TYPE_DATE ? MYSQL_TIMESTAMP_DATE : MYSQL_TIMESTAMP_DATETIME;
-    value= (longlong) get_date_from_str(thd, str, t_type, warn_item->name, &error);
-    /*
-      If str did not contain a valid date according to the current
-      SQL_MODE, get_date_from_str() has already thrown a warning,
-      and we don't want to throw NULL on invalid date (see 5.2.6
-      "SQL modes" in the manual), so we're done here.
-    */
-  }
-  /*
-    Do not cache GET_USER_VAR() function as its const_item() may return TRUE
-    for the current thread but it still may change during the execution.
-  */
-  if (item->const_item() && cache_arg &&
-      item->type() != Item::CACHE_ITEM &&
-      (item->type() != Item::FUNC_ITEM ||
-       ((Item_func*)item)->functype() != Item_func::GUSERVAR_FUNC))
+  if (cache_arg && item->const_item() && item->type() != Item::CACHE_ITEM)
   {
     Query_arena backup;
     Query_arena *save_arena= thd->switch_to_arena_for_cached_items(&backup);
-    Item_cache_int *cache= new Item_cache_int(MYSQL_TYPE_DATETIME);
+    Item_cache_temporal *cache= new Item_cache_temporal(f_type);
     if (save_arena)
       thd->set_query_arena(save_arena);
-      
-    /* Mark the cache as non-const to prevent re-caching. */
-    cache->set_used_tables(1);
-    cache->store_longlong(item, value);
+
+    cache->store_packed(value);
     *cache_arg= cache;
     *item_arg= cache_arg;
   }
@@ -1180,67 +914,6 @@ get_datetime_value(THD *thd, Item ***item_arg, Item **cache_arg,
 
 
 /*
-  Retrieves YEAR value of 19XX-00-00 00:00:00 form from given item.
-
-  SYNOPSIS
-    get_year_value()
-    thd                 thread handle
-    item_arg   [in/out] item to retrieve YEAR value from
-    cache_arg  [in/out] pointer to place to store the caching item to
-    warn_item  [in]     item for issuing the conversion warning
-    is_null    [out]    TRUE <=> the item_arg is null
-
-  DESCRIPTION
-    Retrieves the YEAR value of 19XX form from given item for comparison by the
-    compare_datetime() function.
-    Converts year to DATETIME of form YYYY-00-00 00:00:00 for the compatibility
-    with the get_datetime_value function result.
-
-  RETURN
-    obtained value
-*/
-
-static longlong
-get_year_value(THD *thd, Item ***item_arg, Item **cache_arg,
-               Item *warn_item, bool *is_null)
-{
-  longlong value= 0;
-  Item *item= **item_arg;
-
-  value= item->val_int();
-  *is_null= item->null_value;
-  if (*is_null)
-    return ~(ulonglong) 0;
-
-  /*
-    Coerce value to the 19XX form in order to correctly compare
-    YEAR(2) & YEAR(4) types.
-    Here we are converting all item values but YEAR(4) fields since
-      1) YEAR(4) already has a regular YYYY form and
-      2) we don't want to convert zero/bad YEAR(4) values to the
-         value of 2000.
-  */
-  Item *real_item= item->real_item();
-  Field *field= NULL;
-  if (real_item->type() == Item::FIELD_ITEM)
-    field= ((Item_field *)real_item)->field;
-  else if (real_item->type() == Item::CACHE_ITEM)
-    field= ((Item_cache *)real_item)->field();
-  if (!(field && field->type() == MYSQL_TYPE_YEAR && field->field_length == 4))
-  {
-    if (value < 70)
-      value+= 100;
-    if (value <= 1900)
-      value+= 1900;
-  }
-  /* Convert year to DATETIME of form YYYY-00-00 00:00:00 (YYYY0000000000). */
-  value*= 10000000000LL;
-
-  return value;
-}
-
-
-/*
   Compare items values as dates.
 
   SYNOPSIS
@@ -1252,18 +925,9 @@ get_year_value(THD *thd, Item ***item_arg, Item **cache_arg,
     with help of the get_datetime_value() function.
 
   RETURN
-    If is_nulls_eq is TRUE:
-       1    if items are equal or both are null
-       0    otherwise
-    If is_nulls_eq is FALSE:
       -1   a < b or at least one item is null
        0   a == b
        1   a > b
-    See the table:
-    is_nulls_eq | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 |
-    a_is_null   | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 |
-    b_is_null   | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 |
-    result      | 1 | 0 | 0 |0/1|-1 |-1 |-1 |-1/0/1|
 */
 
 int Arg_comparator::compare_datetime()
@@ -1271,34 +935,40 @@ int Arg_comparator::compare_datetime()
   bool a_is_null, b_is_null;
   longlong a_value, b_value;
 
+  if (set_null)
+    owner->null_value= 1;
+
   /* Get DATE/DATETIME/TIME value of the 'a' item. */
-  a_value= (*get_value_a_func)(thd, &a, &a_cache, *b, &a_is_null);
-  if (!is_nulls_eq && a_is_null)
-  {
-    if (set_null)
-      owner->null_value= 1;
+  a_value= get_datetime_value(thd, &a, &a_cache, *b, &a_is_null);
+  if (a_is_null)
     return -1;
-  }
 
   /* Get DATE/DATETIME/TIME value of the 'b' item. */
-  b_value= (*get_value_b_func)(thd, &b, &b_cache, *a, &b_is_null);
-  if (a_is_null || b_is_null)
-  {
-    if (set_null)
-      owner->null_value= is_nulls_eq ? 0 : 1;
-    return is_nulls_eq ? (a_is_null == b_is_null) : -1;
-  }
+  b_value= get_datetime_value(thd, &b, &b_cache, *a, &b_is_null);
+  if (b_is_null)
+    return -1;
 
   /* Here we have two not-NULL values. */
   if (set_null)
     owner->null_value= 0;
 
   /* Compare values. */
-  if (is_nulls_eq)
-    return (a_value == b_value);
-  return a_value < b_value ? -1 : (a_value > b_value ? 1 : 0);
+  return a_value < b_value ? -1 : a_value > b_value ? 1 : 0;
 }
 
+int Arg_comparator::compare_e_datetime()
+{
+  bool a_is_null, b_is_null;
+  longlong a_value, b_value;
+
+  /* Get DATE/DATETIME/TIME value of the 'a' item. */
+  a_value= get_datetime_value(thd, &a, &a_cache, *b, &a_is_null);
+
+  /* Get DATE/DATETIME/TIME value of the 'b' item. */
+  b_value= get_datetime_value(thd, &b, &b_cache, *a, &b_is_null);
+  return a_is_null || b_is_null ? a_is_null == b_is_null
+                                : a_value == b_value;
+}
 
 int Arg_comparator::compare_string()
 {
@@ -1725,6 +1395,26 @@ longlong Item_func_truth::val_int()
 }
 
 
+bool Item_in_optimizer::is_top_level_item()
+{
+  return ((Item_in_subselect *)args[1])->is_top_level_item();
+}
+
+
+bool Item_in_optimizer::eval_not_null_tables(uchar *opt_arg)
+{
+  not_null_tables_cache= 0;
+  if (is_top_level_item())
+  {
+    /*
+      It is possible to determine NULL-rejectedness of the left arguments
+      of IN only if it is a top-level predicate.
+    */
+    not_null_tables_cache= args[0]->not_null_tables();
+  }
+  return FALSE;
+}
+
 bool Item_in_optimizer::fix_left(THD *thd, Item **ref)
 {
   if ((!args[0]->fixed && args[0]->fix_fields(thd, args)) ||
@@ -1751,10 +1441,14 @@ bool Item_in_optimizer::fix_left(THD *thd, Item **ref)
     }
     used_tables_cache= args[0]->used_tables();
   }
-  not_null_tables_cache= args[0]->not_null_tables();
+  eval_not_null_tables(NULL);
   with_sum_func= args[0]->with_sum_func;
+  with_field= args[0]->with_field;
   if ((const_item_cache= args[0]->const_item()))
+  {
     cache->store(args[0]);
+    cache->cache_value();
+  }
   return 0;
 }
 
@@ -1778,8 +1472,8 @@ bool Item_in_optimizer::fix_fields(THD *thd, Item **ref)
   if (args[1]->maybe_null)
     maybe_null=1;
   with_sum_func= with_sum_func || args[1]->with_sum_func;
+  with_field= with_field || args[1]->with_field;
   used_tables_cache|= args[1]->used_tables();
-  not_null_tables_cache|= args[1]->not_null_tables();
   const_item_cache&= args[1]->const_item();
   fixed= 1;
   return FALSE;
@@ -1808,28 +1502,40 @@ Item *Item_in_optimizer::expr_cache_insert_transformer(uchar *thd_arg)
 {
   THD *thd= (THD*) thd_arg;
   DBUG_ENTER("Item_in_optimizer::expr_cache_insert_transformer");
-  List<Item*> &depends_on= ((Item_subselect *)args[1])->depends_on;
+  if (args[1]->type() != Item::SUBSELECT_ITEM)
+    DBUG_RETURN(this); // MAX/MIN transformed => do nothing
 
   if (expr_cache)
     DBUG_RETURN(expr_cache);
 
+  if (args[1]->expr_cache_is_needed(thd) &&
+      (expr_cache= set_expr_cache(thd)))
+    DBUG_RETURN(expr_cache);
+
+  DBUG_RETURN(this);
+}
+
+
+
+/**
+    Collect and add to the list cache parameters for this Item.
+
+    @param parameters    The list where to add parameters
+*/
+
+void Item_in_optimizer::get_cache_parameters(List<Item> &parameters)
+{
   /* Add left expression to the list of the parameters of the subquery */
   if (args[0]->cols() == 1)
-    depends_on.push_front((Item**)args);
+    parameters.add_unique(args[0], &cmp_items);
   else
   {
     for (uint i= 0; i < args[0]->cols(); i++)
     {
-      depends_on.push_front(args[0]->addr(i));
+      parameters.add_unique(args[0]->element_index(i), &cmp_items);
     }
   }
-
-  if (args[1]->expr_cache_is_needed(thd) &&
-      (expr_cache= set_expr_cache(thd, depends_on)))
-    DBUG_RETURN(expr_cache);
-
-  depends_on.pop();
-  DBUG_RETURN(this);
+  args[1]->get_cache_parameters(parameters);
 }
 
 /**
@@ -1909,7 +1615,15 @@ longlong Item_in_optimizer::val_int()
   DBUG_ASSERT(fixed == 1);
   cache->store(args[0]);
   cache->cache_value();
-  
+
+  if (args[1]->type() != Item::SUBSELECT_ITEM)
+  {
+    /* MAX/MIN transformed => pass through */
+    longlong res= args[1]->val_int();
+    null_value= args[1]->null_value;
+    return (res);
+  }
+
   if (cache->null_value)
   {
     /*
@@ -2058,28 +1772,52 @@ Item *Item_in_optimizer::transform(Item_transformer transformer, uchar *argument
   if ((*args) != new_item)
     current_thd->change_item_tree(args, new_item);
 
-  /*
-    Transform the right IN operand which should be an Item_in_subselect or a
-    subclass of it. The left operand of the IN must be the same as the left
-    operand of this Item_in_optimizer, so in this case there is no further
-    transformation, we only make both operands the same.
-    TODO: is it the way it should be?
-  */
-  DBUG_ASSERT((args[1])->type() == Item::SUBSELECT_ITEM &&
-              (((Item_subselect*)(args[1]))->substype() ==
-               Item_subselect::IN_SUBS ||
-               ((Item_subselect*)(args[1]))->substype() ==
-               Item_subselect::ALL_SUBS ||
-               ((Item_subselect*)(args[1]))->substype() ==
-               Item_subselect::ANY_SUBS));
-
-  Item_in_subselect *in_arg= (Item_in_subselect*)args[1];
-  in_arg->left_expr= args[0];
+  if (args[1]->type() != Item::SUBSELECT_ITEM)
+  {
+    /* MAX/MIN transformed => pass through */
+    new_item= args[1]->transform(transformer, argument);
+    if (!new_item)
+      return 0;
+    if (args[1] != new_item)
+      current_thd->change_item_tree(args, new_item);
+  }
+  else
+  {
+    /*
+      Transform the right IN operand which should be an Item_in_subselect or a
+      subclass of it. The left operand of the IN must be the same as the left
+      operand of this Item_in_optimizer, so in this case there is no further
+      transformation, we only make both operands the same.
+      TODO: is it the way it should be?
+    */
+    DBUG_ASSERT((args[1])->type() == Item::SUBSELECT_ITEM &&
+                (((Item_subselect*)(args[1]))->substype() ==
+                 Item_subselect::IN_SUBS ||
+                 ((Item_subselect*)(args[1]))->substype() ==
+                 Item_subselect::ALL_SUBS ||
+                 ((Item_subselect*)(args[1]))->substype() ==
+                 Item_subselect::ANY_SUBS));
 
+    Item_in_subselect *in_arg= (Item_in_subselect*)args[1];
+    current_thd->change_item_tree(&in_arg->left_expr, args[0]);
+  }
   return (this->*transformer)(argument);
 }
 
 
+bool Item_in_optimizer::is_expensive_processor(uchar *arg)
+{
+  return args[0]->is_expensive_processor(arg) ||
+         args[1]->is_expensive_processor(arg);
+}
+
+
+bool Item_in_optimizer::is_expensive()
+{
+  return args[0]->is_expensive() || args[1]->is_expensive();
+}
+
+
 longlong Item_func_eq::val_int()
 {
   DBUG_ASSERT(fixed == 1);
@@ -2237,6 +1975,7 @@ void Item_func_interval::fix_length_and_dec()
   used_tables_cache|= row->used_tables();
   not_null_tables_cache= row->not_null_tables();
   with_sum_func= with_sum_func || row->with_sum_func;
+  with_field= with_field || row->with_field;
   const_item_cache&= row->const_item();
 }
 
@@ -2369,6 +2108,16 @@ bool Item_func_between::fix_fields(THD *thd, Item **ref)
 
   thd->lex->current_select->between_count++;
 
+
+  return 0;
+}
+
+
+bool Item_func_between::eval_not_null_tables(uchar *opt_arg)
+{
+  if (Item_func_opt_neg::eval_not_null_tables(NULL))
+    return 1;
+
   /* not_null_tables_cache == union(T1(e),T1(e1),T1(e2)) */
   if (pred_level && !negated)
     return 0;
@@ -2377,19 +2126,15 @@ bool Item_func_between::fix_fields(THD *thd, Item **ref)
   not_null_tables_cache= (args[0]->not_null_tables() |
                           (args[1]->not_null_tables() &
                            args[2]->not_null_tables()));
-
   return 0;
-}
+}  
 
 
 void Item_func_between::fix_length_and_dec()
 {
-  max_length= 1;
-  int i;
-  bool datetime_found= FALSE;
-  int time_items_found= 0;
-  compare_as_dates= TRUE;
   THD *thd= current_thd;
+  max_length= 1;
+  compare_as_dates= 0;
 
   /*
     As some compare functions are generated after sql_yacc,
@@ -2404,81 +2149,79 @@ void Item_func_between::fix_length_and_dec()
    return;
 
   /*
-    Detect the comparison of DATE/DATETIME items.
-    At least one of items should be a DATE/DATETIME item and other items
-    should return the STRING result.
+    When comparing as date/time, we need to convert non-temporal values
+    (e.g.  strings) to MYSQL_TIME. get_datetime_value() does it
+    automatically when one of the operands is a date/time.  But here we
+    may need to compare two strings as dates (str1 BETWEEN str2 AND date).
+    For this to work, we need to know what date/time type we compare
+    strings as.
   */
-  if (cmp_type == STRING_RESULT)
-  {
-    for (i= 0; i < 3; i++)
-    {
-      if (args[i]->is_datetime())
-      {
-        datetime_found= TRUE;
-        continue;
-      }
-      if (args[i]->field_type() == MYSQL_TYPE_TIME &&
-          args[i]->result_as_longlong())
-        time_items_found++;
-    }
-  }
-  if (!datetime_found)
-    compare_as_dates= FALSE;
+  if (cmp_type ==  TIME_RESULT)
+    compare_as_dates= find_date_time_item(args, 3, 0);
 
-  if (compare_as_dates)
-  {
-    ge_cmp.set_datetime_cmp_func(this, args, args + 1);
-    le_cmp.set_datetime_cmp_func(this, args, args + 2);
-  }
-  else if (time_items_found == 3)
-  {
-    /* Compare TIME items as integers. */
-    cmp_type= INT_RESULT;
-  }
-  else if (args[0]->real_item()->type() == FIELD_ITEM &&
-           thd->lex->sql_command != SQLCOM_CREATE_VIEW &&
-           thd->lex->sql_command != SQLCOM_SHOW_CREATE)
+  /* See the comment about the similar block in Item_bool_func2 */
+  if (args[0]->real_item()->type() == FIELD_ITEM &&
+      !thd->lex->is_ps_or_view_context_analysis())
   {
     Item_field *field_item= (Item_field*) (args[0]->real_item());
-    if (field_item->field->can_be_compared_as_longlong())
+    if (field_item->cmp_type() == INT_RESULT)
     {
       /*
-        The following can't be recoded with || as convert_constant_item
+        The following can't be recoded with || as convert_const_to_int
         changes the argument
       */
-      if (convert_constant_item(thd, field_item, &args[1]))
-        cmp_type=INT_RESULT;			// Works for all types.
-      if (convert_constant_item(thd, field_item, &args[2]))
-        cmp_type=INT_RESULT;			// Works for all types.
+      if (convert_const_to_int(thd, field_item, &args[1]))
+        cmp_type=INT_RESULT;
+      if (convert_const_to_int(thd, field_item, &args[2]))
+        cmp_type=INT_RESULT;
     }
   }
 }
 
 
 longlong Item_func_between::val_int()
-{						// ANSI BETWEEN
+{
   DBUG_ASSERT(fixed == 1);
-  if (compare_as_dates)
+
+  switch (cmp_type) {
+  case TIME_RESULT:
   {
-    int ge_res, le_res;
+    THD *thd= current_thd;
+    longlong value, a, b;
+    Item *cache, **ptr;
+    bool value_is_null, a_is_null, b_is_null;
 
-    ge_res= ge_cmp.compare();
-    if ((null_value= args[0]->null_value))
+    ptr= &args[0];
+    value= get_datetime_value(thd, &ptr, &cache, compare_as_dates,
+                              &value_is_null);
+    if (ptr != &args[0])
+      thd->change_item_tree(&args[0], *ptr);
+
+    if ((null_value= value_is_null))
       return 0;
-    le_res= le_cmp.compare();
 
-    if (!args[1]->null_value && !args[2]->null_value)
-      return (longlong) ((ge_res >= 0 && le_res <=0) != negated);
-    else if (args[1]->null_value)
-    {
-      null_value= le_res > 0;			// not null if false range.
-    }
+    ptr= &args[1];
+    a= get_datetime_value(thd, &ptr, &cache, compare_as_dates, &a_is_null);
+    if (ptr != &args[1])
+      thd->change_item_tree(&args[1], *ptr);
+
+    ptr= &args[2];
+    b= get_datetime_value(thd, &ptr, &cache, compare_as_dates, &b_is_null);
+    if (ptr != &args[2])
+      thd->change_item_tree(&args[2], *ptr);
+
+    if (!a_is_null && !b_is_null)
+      return (longlong) ((value >= a && value <= b) != negated);
+    if (a_is_null && b_is_null)
+      null_value=1;
+    else if (a_is_null)
+      null_value= value <= b;			// not null if false range.
     else
-    {
-      null_value= ge_res < 0;
-    }
+      null_value= value >= a;
+    break;
   }
-  else if (cmp_type == STRING_RESULT)
+
+  case STRING_RESULT:
   {
     String *value,*a,*b;
     value=args[0]->val_str(&value0);
@@ -2502,8 +2245,9 @@ longlong Item_func_between::val_int()
       // Set to not null if false range.
       null_value= sortcmp(value,a,cmp_collation.collation) >= 0;
     }
+    break;
   }
-  else if (cmp_type == INT_RESULT)
+  case INT_RESULT:
   {
     longlong value=args[0]->val_int(), a, b;
     if ((null_value=args[0]->null_value))
@@ -2522,8 +2266,9 @@ longlong Item_func_between::val_int()
     {
       null_value= value >= a;
     }
+    break;
   }
-  else if (cmp_type == DECIMAL_RESULT)
+  case DECIMAL_RESULT:
   {
     my_decimal dec_buf, *dec= args[0]->val_decimal(&dec_buf),
                a_buf, *a_dec, b_buf, *b_dec;
@@ -2540,8 +2285,9 @@ longlong Item_func_between::val_int()
       null_value= (my_decimal_cmp(dec, b_dec) <= 0);
     else
       null_value= (my_decimal_cmp(dec, a_dec) >= 0);
+    break;
   }
-  else
+  case REAL_RESULT:
   {
     double value= args[0]->val_real(),a,b;
     if ((null_value=args[0]->null_value))
@@ -2560,6 +2306,13 @@ longlong Item_func_between::val_int()
     {
       null_value= value >= a;
     }
+    break;
+  }
+  case ROW_RESULT:
+  case IMPOSSIBLE_RESULT:
+    DBUG_ASSERT(0);
+    null_value= 1;
+    return 0;
   }
   return (longlong) (!null_value && negated);
 }
@@ -2612,7 +2365,8 @@ Item_func_ifnull::fix_length_and_dec()
     decimals= 0;
     break;
   case ROW_RESULT:
-  default:
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);
   }
   fix_char_length(char_length);
@@ -2743,6 +2497,16 @@ Item_func_if::fix_fields(THD *thd, Item **ref)
   if (Item_func::fix_fields(thd, ref))
     return 1;
 
+  return 0;
+}
+
+
+bool
+Item_func_if::eval_not_null_tables(uchar *opt_arg)
+{
+  if (Item_func::eval_not_null_tables(NULL))
+    return 1;
+
   not_null_tables_cache= (args[1]->not_null_tables() &
                           args[2]->not_null_tables());
 
@@ -3228,13 +2992,14 @@ void Item_func_case::fix_length_and_dec()
         args[nagg * 2]= agg[nagg + 1];
     }
 
-    for (i= 0; i <= (uint)DECIMAL_RESULT; i++)
+    for (i= 0; i <= (uint)TIME_RESULT; i++)
     {
       if (found_types & (1 << i) && !cmp_items[i])
       {
         DBUG_ASSERT((Item_result)i != ROW_RESULT);
+        DBUG_ASSERT((Item_result)i != TIME_RESULT);
         if (!(cmp_items[i]=
-            cmp_item::get_comparator((Item_result)i,
+            cmp_item::get_comparator((Item_result)i, 0,
                                      cmp_collation.collation)))
           return;
       }
@@ -3260,7 +3025,8 @@ void Item_func_case::fix_length_and_dec()
       agg_num_lengths(args[i + 1]);
     if (else_expr_num != -1) 
       agg_num_lengths(args[else_expr_num]);
-    max_length= my_decimal_precision_to_length(max_length + decimals, decimals,
+    max_length= my_decimal_precision_to_length_no_truncation(max_length +
+                                                             decimals, decimals,
                                                unsigned_flag);
   }
 }
@@ -3314,7 +3080,7 @@ void Item_func_case::cleanup()
   uint i;
   DBUG_ENTER("Item_func_case::cleanup");
   Item_func::cleanup();
-  for (i= 0; i <= (uint)DECIMAL_RESULT; i++)
+  for (i= 0; i <= (uint)TIME_RESULT; i++)
   {
     delete cmp_items[i];
     cmp_items[i]= 0;
@@ -3370,6 +3136,21 @@ double Item_func_coalesce::real_op()
 }
 
 
+bool Item_func_coalesce::get_date(MYSQL_TIME *ltime,uint fuzzydate)
+{
+  DBUG_ASSERT(fixed == 1);
+  null_value= 0;
+  for (uint i= 0; i < arg_count; i++)
+  {
+    bool res= args[i]->get_date(ltime, fuzzydate);
+    if (!args[i]->null_value)
+      return res;
+  }
+  null_value=1;
+  return 1;
+}
+
+
 my_decimal *Item_func_coalesce::decimal_op(my_decimal *decimal_value)
 {
   DBUG_ASSERT(fixed == 1);
@@ -3389,6 +3170,14 @@ void Item_func_coalesce::fix_length_and_dec()
 {
   cached_field_type= agg_field_type(args, arg_count);
   agg_result_type(&hybrid_type, args, arg_count);
+  Item_result cmp_type;
+  agg_cmp_type(&cmp_type, args, arg_count);
+  ///< @todo let result_type() return TIME_RESULT and remove this special case
+  if (cmp_type == TIME_RESULT)
+  {
+    count_real_length();
+    return;
+  }
   switch (hybrid_type) {
   case STRING_RESULT:
     decimals= NOT_FIXED_DEC;
@@ -3407,7 +3196,8 @@ void Item_func_coalesce::fix_length_and_dec()
     decimals= 0;
     break;
   case ROW_RESULT:
-  default:
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);
   }
 }
@@ -3722,7 +3512,7 @@ uchar *in_decimal::get_value(Item *item)
 }
 
 
-cmp_item* cmp_item::get_comparator(Item_result type,
+cmp_item* cmp_item::get_comparator(Item_result type, Item *warn_item,
                                    CHARSET_INFO *cs)
 {
   switch (type) {
@@ -3736,7 +3526,10 @@ cmp_item* cmp_item::get_comparator(Item_result type,
     return new cmp_item_row;
   case DECIMAL_RESULT:
     return new cmp_item_decimal;
-  default:
+  case TIME_RESULT:
+    DBUG_ASSERT(warn_item);
+    return new cmp_item_datetime(warn_item);
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);
     break;
   }
@@ -3801,7 +3594,7 @@ void cmp_item_row::store_value(Item *item)
     {
       if (!comparators[i])
         if (!(comparators[i]=
-              cmp_item::get_comparator(item->element_index(i)->result_type(),
+              cmp_item::get_comparator(item->element_index(i)->result_type(), 0,
                                        item->element_index(i)->collation.collation)))
 	  break;					// new failed
       comparators[i]->store_value(item->element_index(i));
@@ -3977,11 +3770,22 @@ bool Item_func_in::nulls_in_row()
 bool
 Item_func_in::fix_fields(THD *thd, Item **ref)
 {
-  Item **arg, **arg_end;
 
   if (Item_func_opt_neg::fix_fields(thd, ref))
     return 1;
 
+  return 0;
+}
+
+
+bool
+Item_func_in::eval_not_null_tables(uchar *opt_arg)
+{
+  Item **arg, **arg_end;
+
+  if (Item_func_opt_neg::eval_not_null_tables(NULL))
+    return 1;
+
   /* not_null_tables_cache == union(T1(e),union(T1(ei))) */
   if (pred_level && negated)
     return 0;
@@ -4002,20 +3806,17 @@ static int srtcmp_in(CHARSET_INFO *cs, const String *x,const String *y)
                                (uchar *) y->ptr(),y->length(), 0);
 }
 
-
 void Item_func_in::fix_length_and_dec()
 {
   Item **arg, **arg_end;
   bool const_itm= 1;
   THD *thd= current_thd;
-  bool datetime_found= FALSE;
   /* TRUE <=> arguments values will be compared as DATETIMEs. */
-  bool compare_as_datetime= FALSE;
   Item *date_arg= 0;
   uint found_types= 0;
   uint type_cnt= 0, i;
   Item_result cmp_type= STRING_RESULT;
-  left_result_type= args[0]->result_type();
+  left_result_type= args[0]->cmp_type();
   if (!(found_types= collect_cmp_types(args, arg_count, true)))
     return;
   
@@ -4027,7 +3828,7 @@ void Item_func_in::fix_length_and_dec()
       break;
     }
   }
-  for (i= 0; i <= (uint)DECIMAL_RESULT; i++)
+  for (i= 0; i <= (uint)TIME_RESULT; i++)
   {
     if (found_types & 1 << i)
     {
@@ -4042,16 +3843,12 @@ void Item_func_in::fix_length_and_dec()
         agg_arg_charsets_for_comparison(cmp_collation, args, arg_count))
       return;
     arg_types_compatible= TRUE;
-  }
-  if (type_cnt == 1)
-  {
-    /*
-      When comparing rows create the row comparator object beforehand to ease
-      the DATETIME comparison detection procedure.
-    */
+
     if (cmp_type == ROW_RESULT)
     {
+      uint cols= args[0]->cols();
       cmp_item_row *cmp= 0;
+
       if (const_itm && !nulls_in_row())
       {
         array= new in_row(arg_count-1, 0);
@@ -4063,66 +3860,20 @@ void Item_func_in::fix_length_and_dec()
           return;
         cmp_items[ROW_RESULT]= cmp;
       }
-      cmp->n= args[0]->cols();
+      cmp->n= cols;
       cmp->alloc_comparators();
-    }
-    /* All DATE/DATETIME fields/functions has the STRING result type. */
-    if (cmp_type == STRING_RESULT || cmp_type == ROW_RESULT)
-    {
-      uint col, cols= args[0]->cols();
 
-      for (col= 0; col < cols; col++)
+      for (uint col= 0; col < cols; col++)
       {
-        bool skip_column= FALSE;
-        /*
-          Check that all items to be compared has the STRING result type and at
-          least one of them is a DATE/DATETIME item.
-        */
-        for (arg= args, arg_end= args + arg_count; arg != arg_end ; arg++)
-        {
-          Item *itm= ((cmp_type == STRING_RESULT) ? arg[0] :
-                      arg[0]->element_index(col));
-          if (itm->result_type() != STRING_RESULT)
-          {
-            skip_column= TRUE;
-            break;
-          }
-          else if (itm->is_datetime())
-          {
-            datetime_found= TRUE;
-            /*
-              Internally all DATE/DATETIME values are converted to the DATETIME
-              type. So try to find a DATETIME item to issue correct warnings.
-            */
-            if (!date_arg)
-              date_arg= itm;
-            else if (itm->field_type() == MYSQL_TYPE_DATETIME)
-            {
-              date_arg= itm;
-              /* All arguments are already checked to have the STRING result. */
-              if (cmp_type == STRING_RESULT)
-                break;
-            }
-          }
-        }
-        if (skip_column)
-          continue;
-        if (datetime_found)
+        date_arg= find_date_time_item(args, arg_count, col);
+        if (date_arg)
         {
-          if (cmp_type == ROW_RESULT)
-          {
-            cmp_item **cmp= 0;
-            if (array)
-              cmp= ((in_row*)array)->tmp.comparators + col;
-            else
-              cmp= ((cmp_item_row*)cmp_items[ROW_RESULT])->comparators + col;
-            *cmp= new cmp_item_datetime(date_arg);
-            /* Reset variables for the next column. */
-            date_arg= 0;
-            datetime_found= FALSE;
-          }
+          cmp_item **cmp= 0;
+          if (array)
+            cmp= ((in_row*)array)->tmp.comparators + col;
           else
-            compare_as_datetime= TRUE;
+            cmp= ((cmp_item_row*)cmp_items[ROW_RESULT])->comparators + col;
+          *cmp= new cmp_item_datetime(date_arg);
         }
       }
     }
@@ -4133,62 +3884,61 @@ void Item_func_in::fix_length_and_dec()
   */
   if (type_cnt == 1 && const_itm && !nulls_in_row())
   {
-    if (compare_as_datetime)
-      array= new in_datetime(date_arg, arg_count - 1);
-    else
+    /*
+      IN must compare INT columns and constants as int values (the same
+      way as equality does).
+      So we must check here if the column on the left and all the constant 
+      values on the right can be compared as integers and adjust the 
+      comparison type accordingly.
+
+      See the comment about the similar block in Item_bool_func2
+    */  
+    if (args[0]->real_item()->type() == FIELD_ITEM &&
+        !thd->lex->is_view_context_analysis() && cmp_type != INT_RESULT)
     {
-      /*
-        IN must compare INT columns and constants as int values (the same
-        way as equality does).
-        So we must check here if the column on the left and all the constant 
-        values on the right can be compared as integers and adjust the 
-        comparison type accordingly.
-      */  
-      if (args[0]->real_item()->type() == FIELD_ITEM &&
-          thd->lex->sql_command != SQLCOM_CREATE_VIEW &&
-          thd->lex->sql_command != SQLCOM_SHOW_CREATE &&
-          cmp_type != INT_RESULT)
+      Item_field *field_item= (Item_field*) (args[0]->real_item());
+      if (field_item->cmp_type() == INT_RESULT)
       {
-        Item_field *field_item= (Item_field*) (args[0]->real_item());
-        if (field_item->field->can_be_compared_as_longlong())
+        bool all_converted= TRUE;
+        for (arg=args+1, arg_end=args+arg_count; arg != arg_end ; arg++)
         {
-          bool all_converted= TRUE;
-          for (arg=args+1, arg_end=args+arg_count; arg != arg_end ; arg++)
-          {
-            if (!convert_constant_item (thd, field_item, &arg[0]))
-              all_converted= FALSE;
-          }
-          if (all_converted)
-            cmp_type= INT_RESULT;
+           if (!convert_const_to_int(thd, field_item, &arg[0]))
+            all_converted= FALSE;
         }
-      }
-      switch (cmp_type) {
-      case STRING_RESULT:
-        array=new in_string(arg_count-1,(qsort2_cmp) srtcmp_in, 
-                            cmp_collation.collation);
-        break;
-      case INT_RESULT:
-        array= new in_longlong(arg_count-1);
-        break;
-      case REAL_RESULT:
-        array= new in_double(arg_count-1);
-        break;
-      case ROW_RESULT:
-        /*
-          The row comparator was created at the beginning but only DATETIME
-          items comparators were initialized. Call store_value() to setup
-          others.
-        */
-        ((in_row*)array)->tmp.store_value(args[0]);
-        break;
-      case DECIMAL_RESULT:
-        array= new in_decimal(arg_count - 1);
-        break;
-      default:
-        DBUG_ASSERT(0);
-        return;
+        if (all_converted)
+          cmp_type= INT_RESULT;
       }
     }
+    switch (cmp_type) {
+    case STRING_RESULT:
+      array=new in_string(arg_count-1,(qsort2_cmp) srtcmp_in, 
+                          cmp_collation.collation);
+      break;
+    case INT_RESULT:
+      array= new in_longlong(arg_count-1);
+      break;
+    case REAL_RESULT:
+      array= new in_double(arg_count-1);
+      break;
+    case ROW_RESULT:
+      /*
+        The row comparator was created at the beginning but only DATETIME
+        items comparators were initialized. Call store_value() to setup
+        others.
+      */
+      ((in_row*)array)->tmp.store_value(args[0]);
+      break;
+    case DECIMAL_RESULT:
+      array= new in_decimal(arg_count - 1);
+      break;
+    case TIME_RESULT:
+      date_arg= find_date_time_item(args, arg_count, 0);
+      array= new in_datetime(date_arg, arg_count - 1);
+      break;
+    case IMPOSSIBLE_RESULT:
+      DBUG_ASSERT(0);
+      break;
+    }
     if (array && !(thd->is_fatal_error))		// If not EOM
     {
       uint j=0;
@@ -4206,22 +3956,19 @@ void Item_func_in::fix_length_and_dec()
   }
   else
   {
-    if (compare_as_datetime)
-      cmp_items[STRING_RESULT]= new cmp_item_datetime(date_arg);
-    else
+    for (i= 0; i <= (uint) TIME_RESULT; i++)
     {
-      for (i= 0; i <= (uint) DECIMAL_RESULT; i++)
+      if (found_types & (1 << i) && !cmp_items[i])
       {
-        if (found_types & (1 << i) && !cmp_items[i])
-        {
-          if ((Item_result)i == STRING_RESULT &&
-              agg_arg_charsets_for_comparison(cmp_collation, args, arg_count))
-            return;
-          if (!cmp_items[i] && !(cmp_items[i]=
-              cmp_item::get_comparator((Item_result)i,
-                                       cmp_collation.collation)))
-            return;
-        }
+        if ((Item_result)i == STRING_RESULT &&
+            agg_arg_charsets_for_comparison(cmp_collation, args, arg_count))
+          return;
+        if ((Item_result)i == TIME_RESULT)
+          date_arg= find_date_time_item(args, arg_count, 0);
+        if (!cmp_items[i] && !(cmp_items[i]=
+            cmp_item::get_comparator((Item_result)i, date_arg,
+                                     cmp_collation.collation)))
+          return;
       }
     }
   }
@@ -4289,7 +4036,7 @@ longlong Item_func_in::val_int()
       have_null= TRUE;
       continue;
     }
-    Item_result cmp_type= item_cmp_type(left_result_type, args[i]->result_type());
+    Item_result cmp_type= item_cmp_type(left_result_type, args[i]->cmp_type());
     in_item= cmp_items[(uint)cmp_type];
     DBUG_ASSERT(in_item);
     if (!(value_added_map & (1 << (uint)cmp_type)))
@@ -4405,7 +4152,6 @@ Item_cond::fix_fields(THD *thd, Item **ref)
   */
   while ((item=li++))
   {
-    table_map tmp_table_map;
     while (item->type() == Item::COND_ITEM &&
 	   ((Item_cond*) item)->functype() == functype() &&
            !((Item_cond*) item)->list.is_empty())
@@ -4427,12 +4173,14 @@ Item_cond::fix_fields(THD *thd, Item **ref)
       and_tables_cache= (table_map) 0;
     else
     {
-      tmp_table_map= item->not_null_tables();
+      table_map tmp_table_map= item->not_null_tables();
       not_null_tables_cache|= tmp_table_map;
       and_tables_cache&= tmp_table_map;
       const_item_cache= FALSE;
-    }  
+    } 
+  
     with_sum_func=	    with_sum_func || item->with_sum_func;
+    with_field=             with_field || item->with_field;
     with_subselect|=        item->with_subselect;
     if (item->maybe_null)
       maybe_null=1;
@@ -4445,6 +4193,28 @@ Item_cond::fix_fields(THD *thd, Item **ref)
 }
 
 
+bool
+Item_cond::eval_not_null_tables(uchar *opt_arg)
+{  // Recompute not_null_tables_cache and and_tables_cache from the list; opt_arg is unused
+  Item *item;
+  List_iterator<Item> li(list);
+  not_null_tables_cache= (table_map) 0;         // start clean so stale bits do not survive
+  and_tables_cache= ~(table_map) 0;
+  while ((item=li++))
+  {
+    if (item->const_item())
+      and_tables_cache= (table_map) 0;          // a constant member empties the intersection
+    else
+    {
+      table_map tmp_table_map= item->not_null_tables();
+      not_null_tables_cache|= tmp_table_map;    // union over all non-constant members
+      and_tables_cache&= tmp_table_map;         // intersection over all non-constant members
+    }
+  }
+  return 0;
+}
+
+
 void Item_cond::fix_after_pullout(st_select_lex *new_parent, Item **ref)
 {
   List_iterator<Item> li(list);
@@ -4788,12 +4558,6 @@ Item *and_expressions(Item *a, Item *b, Item **org_item)
 longlong Item_func_isnull::val_int()
 {
   DBUG_ASSERT(fixed == 1);
-  /*
-    Handle optimization if the argument can't be null
-    This has to be here because of the test in update_used_tables().
-  */
-  if (!used_tables_cache && !with_subselect)
-    return cached_value;
   return args[0]->is_null() ? 1: 0;
 }
 
@@ -4801,12 +4565,6 @@ longlong Item_is_not_null_test::val_int()
 {
   DBUG_ASSERT(fixed == 1);
   DBUG_ENTER("Item_is_not_null_test::val_int");
-  if (!used_tables_cache && !with_subselect)
-  {
-    owner->was_null|= (!cached_value);
-    DBUG_PRINT("info", ("cached: %ld", (long) cached_value));
-    DBUG_RETURN(cached_value);
-  }
   if (args[0]->is_null())
   {
     DBUG_PRINT("info", ("null"));
@@ -4823,19 +4581,9 @@ longlong Item_is_not_null_test::val_int()
 void Item_is_not_null_test::update_used_tables()
 {
   if (!args[0]->maybe_null)
-  {
     used_tables_cache= 0;			/* is always true */
-    cached_value= (longlong) 1;
-  }
   else
-  {
     args[0]->update_used_tables();
-    if (!(used_tables_cache=args[0]->used_tables()) && !with_subselect)
-    {
-      /* Remember if the value is always NULL or never NULL */
-      cached_value= (longlong) !args[0]->is_null();
-    }
-  }
 }
 
 
@@ -5086,6 +4834,7 @@ Item_func_regex::fix_fields(THD *thd, Item **ref)
        args[1]->fix_fields(thd, args + 1)) || args[1]->check_cols(1))
     return TRUE;				/* purecov: inspected */
   with_sum_func=args[0]->with_sum_func || args[1]->with_sum_func;
+  with_field= args[0]->with_field || args[1]->with_field;
   max_length= 1;
   decimals= 0;
 
@@ -5406,23 +5155,21 @@ bool Item_func_like::turboBM_matches(const char* text, int text_len) const
     very fast to use.
 */
 
-longlong Item_cond_xor::val_int()
+longlong Item_func_xor::val_int()
 {
   DBUG_ASSERT(fixed == 1);
-  List_iterator<Item> li(list);
-  Item *item;
-  int result=0;	
-  null_value=0;
-  while ((item=li++))
+  int result= 0;  // running XOR of the operands' truth values
+  null_value= false;
+  for (uint i= 0; i < arg_count; i++)
   {
-    result^= (item->val_int() != 0);
-    if (item->null_value)
+    result^= (args[i]->val_int() != 0);  // any non-zero value counts as true
+    if (args[i]->null_value)  // a NULL operand makes the whole XOR NULL
     {
-      null_value=1;
+      null_value= true;
       return 0;
     }
   }
-  return (longlong) result;
+  return result;  // 0 or 1
 }
 
 /**
@@ -5463,6 +5210,33 @@ Item *Item_bool_rowready_func2::neg_transformer(THD *thd)
   return item;
 }
 
+/**
+  Negate an XOR by negating exactly one of its operands:
+
+  NOT (a XOR b)  => (NOT a) XOR b
+                 => a       XOR (NOT b)
+
+  @param thd     Thread handle; the new items are allocated on thd->mem_root
+  @return        New Item_func_xor built from one negated and one original operand
+*/
+Item *Item_func_xor::neg_transformer(THD *thd)
+{
+  Item *neg_operand;
+  Item_func_xor *new_item;
+  if ((neg_operand= args[0]->neg_transformer(thd)))
+    // args[0] has a neg_transformer: use its negated form as the first operand
+    new_item= new(thd->mem_root) Item_func_xor(neg_operand, args[1]);
+  else if ((neg_operand= args[1]->neg_transformer(thd)))
+    // args[1] has a neg_transformer: use its negated form as the second operand
+    new_item= new(thd->mem_root) Item_func_xor(args[0], neg_operand);
+  else
+  {  // neither operand returned a negated form: wrap args[0] in an explicit NOT
+    neg_operand= new(thd->mem_root) Item_func_not(args[0]);
+    new_item= new(thd->mem_root) Item_func_xor(neg_operand, args[1]);
+  }
+  return new_item;
+}
+
 
 /**
   a IS NULL  ->  a IS NOT NULL.
@@ -5507,7 +5281,7 @@ Item *Item_func_nop_all::neg_transformer(THD *thd)
   /* "NOT (e $cmp$ ANY (SELECT ...)) -> e $rev_cmp$" ALL (SELECT ...) */
   Item_func_not_all *new_item= new Item_func_not_all(args[0]);
   Item_allany_subselect *allany= (Item_allany_subselect*)args[0];
-  allany->func= allany->func_creator(FALSE);
+  allany->create_comp_func(FALSE);
   allany->all= !allany->all;
   allany->upper_item= new_item;
   return new_item;
@@ -5519,7 +5293,7 @@ Item *Item_func_not_all::neg_transformer(THD *thd)
   Item_func_nop_all *new_item= new Item_func_nop_all(args[0]);
   Item_allany_subselect *allany= (Item_allany_subselect*)args[0];
   allany->all= !allany->all;
-  allany->func= allany->func_creator(TRUE);
+  allany->create_comp_func(TRUE);
   allany->upper_item= new_item;
   return new_item;
 }
@@ -5568,43 +5342,92 @@ Item *Item_bool_rowready_func2::negated_item()
   return 0;
 }
 
-Item_equal::Item_equal(Item_field *f1, Item_field *f2)
-  : Item_bool_func(), const_item(0), eval_item(0), cond_false(0),
-    compare_as_dates(FALSE)
-{
-  const_item_cache= 0;
-  fields.push_back(f1);
-  fields.push_back(f2);
-}
 
-Item_equal::Item_equal(Item *c, Item_field *f)
+/**
+  Construct a minimal multiple equality item
+
+  @param f1               the first equal item
+  @param f2               the second equal item
+  @param with_const_item  TRUE if the first item is constant
+
+  @details
+  The constructor builds a new item equal object for the equality f1=f2.
+  One of the equal items can be constant. If this is the case it is passed
+  always as the first parameter and the parameter with_const_item serves
+  as an indicator of this case.
+  Currently any non-constant parameter item must point to an item of
+  the type Item_field or Item_direct_view_ref(Item_field).
+*/
+
+Item_equal::Item_equal(Item *f1, Item *f2, bool with_const_item)
   : Item_bool_func(), eval_item(0), cond_false(0)
 {
   const_item_cache= 0;
-  fields.push_back(f);
-  const_item= c;
-  compare_as_dates= f->is_datetime();
+  with_const= with_const_item;  // records whether f1 is a constant
+  equal_items.push_back(f1);  // f1 ends up at the head of the list
+  equal_items.push_back(f2);
+  compare_as_dates= with_const_item && f2->cmp_type() == TIME_RESULT;  // only for a constant paired with a TIME_RESULT item
 }
 
 
+/**
+  Copy constructor for a multiple equality
+  
+  @param item_equal   source item for the constructor
+
+  @details
+  The function creates a copy of an Item_equal object.
+  This constructor is used when an item belongs to a multiple equality
+  of an upper level (an upper AND/OR level or an upper level of a nested
+  outer join).
+*/
+
 Item_equal::Item_equal(Item_equal *item_equal)
   : Item_bool_func(), eval_item(0), cond_false(0)
 {
   const_item_cache= 0;
-  List_iterator_fast<Item_field> li(item_equal->fields);
-  Item_field *item;
+  List_iterator_fast<Item> li(item_equal->equal_items);  // walk the source list front to back
+  Item *item;
   while ((item= li++))
   {
-    fields.push_back(item);
+    equal_items.push_back(item);  // shares the item pointers with the source, in the same order
   }
-  const_item= item_equal->const_item;
+  with_const= item_equal->with_const;  // copied together with the list it describes
   compare_as_dates= item_equal->compare_as_dates;
   cond_false= item_equal->cond_false;
 }
 
 
-void Item_equal::compare_const(Item *c)
+/*
+  @brief
+  Add a constant item to the Item_equal object
+
+  @param[in]  c  the constant to add
+  @param[in]  f  item from the list equal_items the item c is equal to
+                 (this parameter is optional)
+
+  @details
+  The method adds the constant item c to the equal_items list. If the list
+  doesn't have any constant item yet the item c is just put at the front of
+  the list. Otherwise the value of c is compared with the value of the
+  constant item from equal_items. If they are not equal cond_false is set
+  to TRUE. This serves as an indicator that this Item_equal is always FALSE.
+  The optional parameter f is used to adjust the flag compare_as_dates.
+*/
+
+void Item_equal::add_const(Item *c, Item *f)
 {
+  if (cond_false)
+    return;
+  if (!with_const)
+  {
+    with_const= TRUE;
+    if (f)
+      compare_as_dates= f->cmp_type() == TIME_RESULT;
+    equal_items.push_front(c);
+    return;
+  }
+  Item *const_item= get_const();
   if (compare_as_dates)
   {
     cmp.set_datetime_cmp_func(this, &c, &const_item);
@@ -5621,65 +5444,28 @@ void Item_equal::compare_const(Item *c)
     const_item_cache= 1;
 }
 
-
-void Item_equal::add(Item *c, Item_field *f)
-{
-  if (cond_false)
-    return;
-  if (!const_item)
-  {
-    DBUG_ASSERT(f);
-    const_item= c;
-    compare_as_dates= f->is_datetime();
-    return;
-  }
-  compare_const(c);
-}
-
-
-void Item_equal::add(Item *c)
-{
-  if (cond_false)
-    return;
-  if (!const_item)
-  {
-    const_item= c;
-    return;
-  }
-  compare_const(c);
-}
-
-void Item_equal::add(Item_field *f)
-{
-  fields.push_back(f);
-}
-
-uint Item_equal::members()
-{
-  return fields.elements;
-}
-
-
 /**
-  Check whether a field is referred in the multiple equality.
-
-  The function checks whether field is occurred in the Item_equal object .
+  @brief
+  Check whether a field is referred to in the multiple equality
 
   @param field   field whose occurrence is to be checked
 
+  @details
+  The function checks whether field is referred to by one of the
+  items from the equal_items list.
+
   @retval
-    1       if nultiple equality contains a reference to field
+    1       if multiple equality contains a reference to field
   @retval
     0       otherwise    
 */
 
 bool Item_equal::contains(Field *field)
 {
-  List_iterator_fast<Item_field> it(fields);
-  Item_field *item;
-  while ((item= it++))
+  Item_equal_fields_iterator it(*this);
+  while (it++)
   {
-    if (field->eq(item->field))
+    if (field->eq(it.get_curr_field()))
         return 1;
   }
   return 0;
@@ -5687,73 +5473,93 @@ bool Item_equal::contains(Field *field)
 
 
 /**
-  Join members of another Item_equal object.
+  @brief
+  Join members of another Item_equal object
   
-    The function actually merges two multiple equalities.
-    After this operation the Item_equal object additionally contains
-    the field items of another item of the type Item_equal.
-    If the optional constant items are not equal the cond_false flag is
-    set to 1.  
   @param item    multiple equality whose members are to be joined
+
+  @details
+  The function actually merges two multiple equalities. After this operation
+  the Item_equal object additionally contains the field items of another item of
+  the type Item_equal.
+  If the optional constant items are not equal the cond_false flag is set to TRUE.
+
+  @note
+  The function is called for any equality f1=f2 such that f1 and f2 are items
+  of the type Item_field or Item_direct_view_ref(Item_field), and, f1->field is
+  referred to in the list this->equal_items, while the list item->equal_items
+  contains a reference to f2->field.  
 */
 
 void Item_equal::merge(Item_equal *item)
 {
-  fields.concat(&item->fields);
-  Item *c= item->const_item;
+  Item *c= item->get_const();
+  if (c)
+    item->equal_items.pop();
+  equal_items.concat(&item->equal_items);
   if (c)
   {
     /* 
-      The flag cond_false will be set to 1 after this, if 
+      The flag cond_false will be set to TRUE after this if 
       the multiple equality already contains a constant and its 
-      value is  not equal to the value of c.
+      value is not equal to the value of c.
     */
-    add(c);
+    add_const(c);
   }
   cond_false|= item->cond_false;
 } 
 
 
 /**
-  Order field items in multiple equality according to a sorting criteria.
+  @brief
+  Order equal items of the multiple equality according to a sorting criterion
 
-  The function perform ordering of the field items in the Item_equal
-  object according to the criteria determined by the cmp callback parameter.
-  If cmp(item_field1,item_field2,arg)<0 than item_field1 must be
-  placed after item_fiel2.
+  @param compare      function to compare items from the equal_items list
+  @param arg          extra context parameter for the compare function
+
+  @details
+  The function orders the items from the equal_items list according to
+  the criterion determined by the compare callback parameter.
+  If compare(item1,item2,arg)<0 then item1 must be placed after item2.
 
-  The function sorts field items by the exchange sort algorithm.
+  @note
+  The function sorts equal items by the bubble sort algorithm.
   The list of field items is looked through and whenever two neighboring
   members follow in a wrong order they are swapped. This is performed
   again and again until we get all members in a right order.
-
-  @param compare      function to compare field item
-  @param arg          context extra parameter for the cmp function
 */
 
 void Item_equal::sort(Item_field_cmpfunc compare, void *arg)
 {
-  exchange_sort<Item_field>(&fields, compare, arg);
+  bubble_sort<Item>(&equal_items, compare, arg);
 }
 
 
 /**
-  Check appearance of new constant items in the multiple equality object.
+  @brief
+  Check appearance of new constant items in the multiple equality object
 
-  The function checks appearance of new constant items among
-  the members of multiple equalities. Each new constant item is
-  compared with the designated constant item if there is any in the
-  multiple equality. If there is none the first new constant item
-  becomes designated.
+  @details
+  The function checks appearance of new constant items among the members
+  of the equal_items list. Each new constant item is compared with
+  the constant item from the list if there is any. If there is none the first
+  new constant item is placed at the very beginning of the list and
+  with_const is set to TRUE. If it happens that the compared constant items
+  are unequal then the flag cond_false is set to TRUE.
+
+  @note
+  Currently this function is called only after substitution of constant tables.
 */
 
 void Item_equal::update_const()
 {
-  List_iterator<Item_field> it(fields);
+  List_iterator<Item> it(equal_items);
+  if (with_const)
+    it++;
   Item *item;
   while ((item= it++))
   {
-    if (item->const_item() &&
+    if (item->const_item() && !item->is_expensive() &&
         /*
           Don't propagate constant status of outer-joined column.
           Such a constant status here is a result of:
@@ -5769,41 +5575,81 @@ void Item_equal::update_const()
         */
         !item->is_outer_field())
     {
-      it.remove();
-      add(item);
-    }
+      if (item == equal_items.head())
+        with_const= TRUE;
+      else
+      {
+        it.remove();
+        add_const(item);
+      }
+    } 
   }
 }
 
+
+/**
+  @brief
+  Fix fields in a completely built multiple equality
+
+  @param  thd     currently not used thread handle 
+  @param  ref     not used
+
+  @details
+  This function is called once the multiple equality has been built out of 
+  the WHERE/ON condition and no new members are expected to be added to the
+  equal_items list anymore.
+  As any implementation of the virtual fix_fields method the function
+  calculates the cached values of not_null_tables_cache, used_tables_cache,
+  const_item_cache and calls fix_length_and_dec().
+  Additionally the function sets a reference to the Item_equal object in
+  the non-constant items of the equal_items list unless such a reference has
+  been already set.
+
+  @note
+  Currently this function is called only in the function
+  build_equal_items_for_cond.
+  
+  @retval
+  FALSE   always
+*/
+
 bool Item_equal::fix_fields(THD *thd, Item **ref)
-{
-  List_iterator_fast<Item_field> li(fields);
+{ 
+  DBUG_ASSERT(fixed == 0);
+  Item_equal_fields_iterator it(*this);
   Item *item;
   not_null_tables_cache= used_tables_cache= 0;
   const_item_cache= 0;
-  while ((item= li++))
+  while ((item= it++))
   {
     table_map tmp_table_map;
     used_tables_cache|= item->used_tables();
     tmp_table_map= item->not_null_tables();
     not_null_tables_cache|= tmp_table_map;
     if (item->maybe_null)
-      maybe_null=1;
+      maybe_null= 1;
+    if (!item->get_item_equal())
+      item->set_item_equal(this);
   }
   fix_length_and_dec();
   fixed= 1;
-  return 0;
+  return FALSE;
 }
 
+
+/**
+  Update the value of the used table attribute and other attributes
+ */
+
 void Item_equal::update_used_tables()
 {
-  List_iterator_fast<Item_field> li(fields);
-  Item *item;
   not_null_tables_cache= used_tables_cache= 0;
   if ((const_item_cache= cond_false))
     return;
+  Item_equal_fields_iterator it(*this);
+  Item *item;
   const_item_cache= 1;
-  while ((item=li++))
+  while ((item= it++))
   {
     item->update_used_tables();
     used_tables_cache|= item->used_tables();
@@ -5812,23 +5658,47 @@ void Item_equal::update_used_tables()
   }
 }
 
+
+
+/**
+  @brief
+  Evaluate multiple equality
+
+  @details
+  The function evaluates the multiple equality to a boolean value.
+  The function ignores non-constant items from the equal_items list.
+  The function returns 1 if all constant items from the list are equal.
+  It returns 0 if there are unequal constant items in the list or
+  one of the constant items evaluates to NULL.
+
+  @note
+  Currently this function can be called only at the optimization
+  stage after the constant table substitution, since all Item_equals
+  are eliminated before the execution stage.
+  
+  @retval
+     0     multiple equality is always FALSE or NULL
+     1     otherwise
+*/
+
 longlong Item_equal::val_int()
 {
-  Item_field *item_field;
   if (cond_false)
     return 0;
-  List_iterator_fast<Item_field> it(fields);
-  Item *item= const_item ? const_item : it++;
+  Item *item= get_const();
+  Item_equal_fields_iterator it(*this);
+  if (!item)
+    item= it++;
   eval_item->store_value(item);
   if ((null_value= item->null_value))
     return 0;
-  while ((item_field= it++))
+  while ((item= it++))
   {
+    Field *field= it.get_curr_field();
     /* Skip fields of non-const tables. They haven't been read yet */
-    if (item_field->field->table->const_table)
+    if (field->table->const_table)
     {
-      int res= eval_item->cmp(item_field);
-      if ((null_value= item_field->null_value) || res)
+      if (eval_item->cmp(item) || (null_value= item->null_value))
         return 0;
     }
   }
@@ -5839,14 +5709,15 @@ longlong Item_equal::val_int()
 void Item_equal::fix_length_and_dec()
 {
   Item *item= get_first(NULL);
-  eval_item= cmp_item::get_comparator(item->result_type(),
+  eval_item= cmp_item::get_comparator(item->result_type(), 0,
                                       item->collation.collation);
 }
 
+
 bool Item_equal::walk(Item_processor processor, bool walk_subquery, uchar *arg)
 {
-  List_iterator_fast<Item_field> it(fields);
   Item *item;
+  Item_equal_fields_iterator it(*this);
   while ((item= it++))
   {
     if (item->walk(processor, walk_subquery, arg))
@@ -5855,12 +5726,13 @@ bool Item_equal::walk(Item_processor processor, bool walk_subquery, uchar *arg)
   return Item_func::walk(processor, walk_subquery, arg);
 }
 
+
 Item *Item_equal::transform(Item_transformer transformer, uchar *arg)
 {
   DBUG_ASSERT(!current_thd->stmt_arena->is_stmt_prepare());
 
-  List_iterator<Item_field> it(fields);
   Item *item;
+  Item_equal_fields_iterator it(*this);
   while ((item= it++))
   {
     Item *new_item= item->transform(transformer, arg);
@@ -5879,19 +5751,20 @@ Item *Item_equal::transform(Item_transformer transformer, uchar *arg)
   return Item_func::transform(transformer, arg);
 }
 
+
 void Item_equal::print(String *str, enum_query_type query_type)
 {
+  if (cond_false)
+  {
+    str->append('0');
+    return;
+  }
   str->append(func_name());
   str->append('(');
-  List_iterator_fast<Item_field> it(fields);
+  List_iterator_fast<Item> it(equal_items);
   Item *item;
-  if (const_item)
-    const_item->print(str, query_type);
-  else
-  {
-    item= it++;
-    item->print(str, query_type);
-  }
+  item= it++;
+  item->print(str, query_type);
   while ((item= it++))
   {
     str->append(',');
@@ -5902,6 +5775,14 @@ void Item_equal::print(String *str, enum_query_type query_type)
 }
 
 
+CHARSET_INFO *Item_equal::compare_collation()
+{ 
+  Item_equal_fields_iterator it(*this);
+  Item *item= it++;
+  return item->collation.collation;
+}
+
+
 /*
   @brief Get the first equal field of multiple equality.
   @param[in] field   the field to get equal field to
@@ -5927,13 +5808,14 @@ void Item_equal::print(String *str, enum_query_type query_type)
   @retval 0 if no field found.
 */
 
-Item_field* Item_equal::get_first(Item_field *field)
+Item* Item_equal::get_first(Item *field_item)
 {
-  List_iterator<Item_field> it(fields);
-  Item_field *item;
+  Item_equal_fields_iterator it(*this);
+  Item *item;
   JOIN_TAB *field_tab;
-  if (!field)
-    return fields.head();
+  if (!field_item)
+    return (it++);
+  Field *field= ((Item_field *) (field_item->real_item()))->field;
 
   /*
     Of all equal fields, return the first one we can use. Normally, this is the
@@ -5955,73 +5837,87 @@ Item_field* Item_equal::get_first(Item_field *field)
     in presense of SJM nests.
   */
 
-  field_tab= field->field->table->reginfo.join_tab;
+  field_tab= field->table->reginfo.join_tab;
 
-  TABLE_LIST *emb_nest= field->field->table->pos_in_table_list->embedding;
+  TABLE_LIST *emb_nest= field->table->pos_in_table_list->embedding;
 
   if (emb_nest && emb_nest->sj_mat_info && emb_nest->sj_mat_info->is_used)
   {
     /*
       It's a field from an materialized semi-join. We can substitute it only
-      for a field from the same semi-join.
+      for a field from the same semi-join. Find the first of such items.
     */
-    JOIN_TAB *first;
-    JOIN *join= field_tab->join;
-    int tab_idx= field_tab - field_tab->join->join_tab;
 
-    /* Find the first table of this semi-join nest */
-    for (int i= tab_idx; i >= (int)join->const_tables; i--)
-    {
-      if (join->join_tab[i].table->map & emb_nest->sj_inner_tables)
-        first= join->join_tab + i;
-      else
-        // Found first tab that doesn't belong to current SJ.
-        break;
-    }
-    /* Find an item to substitute for. */
     while ((item= it++))
     {
-      if (item->field->table->reginfo.join_tab >= first)
+      if (it.get_curr_field()->table->pos_in_table_list->embedding == emb_nest)
       {
         /*
           If we found given field then return NULL to avoid unnecessary
           substitution.
         */
-        return (item != field) ? item : NULL;
+        return (item != field_item) ? item : NULL;
       }
     }
   }
   else
   {
-#if 0    
     /*
       The field is not in SJ-Materialization nest. We must return the first
-      field that's not embedded in a SJ-Materialization nest.
-      Example: suppose we have a join order:
+      field in the join order. The field may be inside a semi-join nest, i.e.
+      a join order may look like this:
 
           SJ-Mat(it1  it2)  ot1  ot2
 
-      and equality ot2.col = ot1.col = it2.col
-      If we're looking for best substitute for 'ot2.col', we should pick ot1.col
-      and not it2.col, because when we run a join between ot1 and ot2
-      execution of SJ-Mat(...) has already finished and we can't rely on the
-      value of it*.*.
-      psergey-fix-fix: ^^ THAT IS INCORRECT ^^. Pick the first, whatever that
-      is.
+      where we're looking what to substitute ot2.col for. In this case we must 
+      still return it1.col, here's a proof why:
+
+      First let's note that either it1.col or it2.col participates in 
+      subquery's IN-equality. It can't be otherwise, because materialization is
+      only applicable to uncorrelated subqueries, so the only way we could
+      infer "it1.col=ot1.col" is from the IN-equality. Ok, so the IN-equality has
+      it1.col or it2.col on its inner side. it1.col is first such item in the
+      join order, so it's not possible for SJ-Mat to be
+      SJ-Materialization-lookup, it is SJ-Materialization-Scan. The scan part
+      of this strategy will unpack value of it1.col=it2.col into it1.col
+      (that's the first equal item inside the subquery), and we'll be able to
+      get it from there. qed.
     */
-    while ((item= it++))
-    {
-      TABLE_LIST *emb_nest= item->field->table->pos_in_table_list->embedding;
-      if (!emb_nest || !emb_nest->sj_mat_info || 
-          !emb_nest->sj_mat_info->is_used)
-      {
-        return item;
-      }
-    }
-#endif
-    return fields.head();
+
+    return equal_items.head();
   }
   // Shouldn't get here.
   DBUG_ASSERT(0);
   return NULL;
 }
+
+
+longlong Item_func_dyncol_exists::val_int()
+{
+  char buff[STRING_BUFFER_USUAL_SIZE];
+  String tmp(buff, sizeof(buff), &my_charset_bin);
+  DYNAMIC_COLUMN col;
+  String *str;
+  ulonglong num;
+  enum enum_dyncol_func_result rc;
+
+  num= args[1]->val_int();
+  str= args[0]->val_str(&tmp);
+  if (args[0]->null_value || args[1]->null_value || num > UINT_MAX16)
+    goto null;
+  col.length= str->length();
+  /* We do not change the string, so we can do this trick */
+  col.str= (char *)str->ptr();
+  rc= dynamic_column_exists(&col, (uint) num);
+  if (rc < 0)
+  {
+    dynamic_column_error_message(rc);
+    goto null;
+  }
+  null_value= FALSE;
+  return rc == ER_DYNCOL_YES;
+
+null:
+  null_value= TRUE;
+  return 0;
+}
diff --git a/sql/item_cmpfunc.h b/sql/item_cmpfunc.h
index 69b22fd66d5..67035384cf7 100644
--- a/sql/item_cmpfunc.h
+++ b/sql/item_cmpfunc.h
@@ -1,7 +1,7 @@
 #ifndef ITEM_CMPFUNC_INCLUDED
 #define ITEM_CMPFUNC_INCLUDED
-
-/* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2010, Oracle and/or its affiliates.
+   Copyright (c) 2009-2011, Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -33,39 +33,29 @@ class Arg_comparator;
 
 typedef int (Arg_comparator::*arg_cmp_func)();
 
-typedef int (*Item_field_cmpfunc)(Item_field *f1, Item_field *f2, void *arg); 
+typedef int (*Item_field_cmpfunc)(Item *f1, Item *f2, void *arg); 
 
 class Arg_comparator: public Sql_alloc
 {
   Item **a, **b;
   arg_cmp_func func;
   Item_result_field *owner;
+  bool set_null;                   // TRUE <=> set owner->null_value
   Arg_comparator *comparators;   // used only for compare_row()
   double precision;
   /* Fields used in DATE/DATETIME comparison. */
   THD *thd;
-  enum_field_types a_type, b_type; // Types of a and b items
   Item *a_cache, *b_cache;         // Cached values of a and b items
-  bool is_nulls_eq;                // TRUE <=> compare for the EQUAL_FUNC
-  bool set_null;                   // TRUE <=> set owner->null_value
                                    //   when one of arguments is NULL.
-  enum enum_date_cmp_type { CMP_DATE_DFLT= 0, CMP_DATE_WITH_DATE,
-                            CMP_DATE_WITH_STR, CMP_STR_WITH_DATE };
-  longlong (*get_value_a_func)(THD *thd, Item ***item_arg, Item **cache_arg,
-                               Item *warn_item, bool *is_null);
-  longlong (*get_value_b_func)(THD *thd, Item ***item_arg, Item **cache_arg,
-                               Item *warn_item, bool *is_null);
-  bool try_year_cmp_func(Item_result type);
 public:
   DTCollation cmp_collation;
   /* Allow owner function to use string buffers. */
   String value1, value2;
 
-  Arg_comparator(): comparators(0), thd(0), a_cache(0), b_cache(0), set_null(TRUE),
-    get_value_a_func(0), get_value_b_func(0) {};
-  Arg_comparator(Item **a1, Item **a2): a(a1), b(a2), comparators(0), thd(0),
-    a_cache(0), b_cache(0), set_null(TRUE),
-    get_value_a_func(0), get_value_b_func(0) {};
+  Arg_comparator():  set_null(TRUE), comparators(0), thd(0),
+    a_cache(0), b_cache(0) {};
+  Arg_comparator(Item **a1, Item **a2): a(a1), b(a2),  set_null(TRUE),
+    comparators(0), thd(0), a_cache(0), b_cache(0) {};
 
   int set_compare_func(Item_result_field *owner, Item_result type);
   inline int set_compare_func(Item_result_field *owner_arg)
@@ -82,8 +72,8 @@ public:
   {
     set_null= set_null_arg;
     return set_cmp_func(owner_arg, a1, a2,
-                        item_cmp_type((*a1)->result_type(),
-                                      (*a2)->result_type()));
+                        item_cmp_type((*a1)->cmp_type(),
+                                      (*a2)->cmp_type()));
   }
   inline int compare() { return (this->*func)(); }
 
@@ -106,14 +96,12 @@ public:
   int compare_real_fixed();
   int compare_e_real_fixed();
   int compare_datetime();        // compare args[0] & args[1] as DATETIMEs
-
-  static enum enum_date_cmp_type can_compare_as_dates(Item *a, Item *b,
-                                                      ulonglong *const_val_arg);
+  int compare_e_datetime();
 
   Item** cache_converted_constant(THD *thd, Item **value, Item **cache,
                                   Item_result type);
   void set_datetime_cmp_func(Item_result_field *owner_arg, Item **a1, Item **b1);
-  static arg_cmp_func comparator_matrix [5][2];
+  static arg_cmp_func comparator_matrix [6][2];
   inline bool is_owner_equal_func()
   {
     return (owner->type() == Item::FUNC_ITEM &&
@@ -124,17 +112,6 @@ public:
     delete [] comparators;
     comparators= 0;
   }
-  /*
-    Set correct cmp_context if items would be compared as INTs.
-  */
-  inline void set_cmp_context_for_datetime()
-  {
-    DBUG_ASSERT(func == &Arg_comparator::compare_datetime);
-    if ((*a)->result_as_longlong())
-      (*a)->cmp_context= INT_RESULT;
-    if ((*b)->result_as_longlong())
-      (*b)->cmp_context= INT_RESULT;
-  }
   friend class Item_func;
 };
 
@@ -283,6 +260,13 @@ public:
   void keep_top_level_cache();
   Item *transform(Item_transformer transformer, uchar *arg);
   virtual Item *expr_cache_insert_transformer(uchar *thd_arg);
+  bool is_expensive_processor(uchar *arg);
+  bool is_expensive();
+  void set_join_tab_idx(uint join_tab_idx_arg)
+  { args[1]->set_join_tab_idx(join_tab_idx_arg); }
+  virtual void get_cache_parameters(List<Item> &parameters);
+  bool is_top_level_item();
+  bool eval_not_null_tables(uchar *opt_arg);
 };
 
 class Comp_creator
@@ -290,7 +274,14 @@ class Comp_creator
 public:
   Comp_creator() {}                           /* Remove gcc warning */
   virtual ~Comp_creator() {}                  /* Remove gcc warning */
+  /**
+    Create operation with given arguments.
+  */
   virtual Item_bool_func2* create(Item *a, Item *b) const = 0;
+  /**
+    Create operation with given arguments in swap order.
+  */
+  virtual Item_bool_func2* create_swap(Item *a, Item *b) const = 0;
   virtual const char* symbol(bool invert) const = 0;
   virtual bool eqne_op() const = 0;
   virtual bool l_op() const = 0;
@@ -302,6 +293,7 @@ public:
   Eq_creator() {}                             /* Remove gcc warning */
   virtual ~Eq_creator() {}                    /* Remove gcc warning */
   virtual Item_bool_func2* create(Item *a, Item *b) const;
+  virtual Item_bool_func2* create_swap(Item *a, Item *b) const;
   virtual const char* symbol(bool invert) const { return invert? "<>" : "="; }
   virtual bool eqne_op() const { return 1; }
   virtual bool l_op() const { return 0; }
@@ -313,6 +305,7 @@ public:
   Ne_creator() {}                             /* Remove gcc warning */
   virtual ~Ne_creator() {}                    /* Remove gcc warning */
   virtual Item_bool_func2* create(Item *a, Item *b) const;
+  virtual Item_bool_func2* create_swap(Item *a, Item *b) const;
   virtual const char* symbol(bool invert) const { return invert? "=" : "<>"; }
   virtual bool eqne_op() const { return 1; }
   virtual bool l_op() const { return 0; }
@@ -324,6 +317,7 @@ public:
   Gt_creator() {}                             /* Remove gcc warning */
   virtual ~Gt_creator() {}                    /* Remove gcc warning */
   virtual Item_bool_func2* create(Item *a, Item *b) const;
+  virtual Item_bool_func2* create_swap(Item *a, Item *b) const;
   virtual const char* symbol(bool invert) const { return invert? "<=" : ">"; }
   virtual bool eqne_op() const { return 0; }
   virtual bool l_op() const { return 0; }
@@ -335,6 +329,7 @@ public:
   Lt_creator() {}                             /* Remove gcc warning */
   virtual ~Lt_creator() {}                    /* Remove gcc warning */
   virtual Item_bool_func2* create(Item *a, Item *b) const;
+  virtual Item_bool_func2* create_swap(Item *a, Item *b) const;
   virtual const char* symbol(bool invert) const { return invert? ">=" : "<"; }
   virtual bool eqne_op() const { return 0; }
   virtual bool l_op() const { return 1; }
@@ -346,6 +341,7 @@ public:
   Ge_creator() {}                             /* Remove gcc warning */
   virtual ~Ge_creator() {}                    /* Remove gcc warning */
   virtual Item_bool_func2* create(Item *a, Item *b) const;
+  virtual Item_bool_func2* create_swap(Item *a, Item *b) const;
   virtual const char* symbol(bool invert) const { return invert? "<" : ">="; }
   virtual bool eqne_op() const { return 0; }
   virtual bool l_op() const { return 0; }
@@ -357,6 +353,7 @@ public:
   Le_creator() {}                             /* Remove gcc warning */
   virtual ~Le_creator() {}                    /* Remove gcc warning */
   virtual Item_bool_func2* create(Item *a, Item *b) const;
+  virtual Item_bool_func2* create_swap(Item *a, Item *b) const;
   virtual const char* symbol(bool invert) const { return invert? ">" : "<="; }
   virtual bool eqne_op() const { return 0; }
   virtual bool l_op() const { return 1; }
@@ -409,7 +406,30 @@ public:
   }
   Item *neg_transformer(THD *thd);
   virtual Item *negated_item();
-  bool subst_argument_checker(uchar **arg) { return TRUE; }
+  bool subst_argument_checker(uchar **arg)
+  {
+    return (*arg != NULL);     
+  }
+};
+
+/**
+  XOR inherits from Item_bool_func2 because it is not optimized yet.
+  Later, when XOR is optimized, it needs to inherit from
+  Item_cond instead. See WL#5800. 
+*/
+class Item_func_xor :public Item_bool_func2
+{
+public:
+  Item_func_xor(Item *i1, Item *i2) :Item_bool_func2(i1, i2) {}
+  enum Functype functype() const { return XOR_FUNC; }
+  const char *func_name() const { return "xor"; }
+  longlong val_int();
+  void top_level_item() {}
+  Item *neg_transformer(THD *thd);
+  bool subst_argument_checker(uchar **arg)
+  {
+    return (*arg != NULL);     
+  }
 };
 
 class Item_func_not :public Item_bool_func
@@ -481,7 +501,7 @@ public:
      show(0)
     {}
   virtual void top_level_item() { abort_on_null= 1; }
-  bool top_level() { return abort_on_null; }
+  bool is_top_level_item() { return abort_on_null; }
   longlong val_int();
   enum Functype functype() const { return NOT_ALL_FUNC; }
   const char *func_name() const { return "<not>"; }
@@ -645,9 +665,7 @@ public:
   Item_result cmp_type;
   String value0,value1,value2;
   /* TRUE <=> arguments will be compared as dates. */
-  bool compare_as_dates;
-  /* Comparators used for DATE/DATETIME comparison. */
-  Arg_comparator ge_cmp, le_cmp;
+  Item *compare_as_dates;
   Item_func_between(Item *a, Item *b, Item *c)
     :Item_func_opt_neg(a, b, c), compare_as_dates(FALSE) {}
   longlong val_int();
@@ -660,6 +678,7 @@ public:
   bool is_bool_func() { return 1; }
   CHARSET_INFO *compare_collation() { return cmp_collation.collation; }
   uint decimal_precision() const { return 1; }
+  bool eval_not_null_tables(uchar *opt_arg);
 };
 
 
@@ -725,6 +744,7 @@ public:
   const char *func_name() const { return "coalesce"; }
   table_map not_null_tables() const { return 0; }
   enum_field_types field_type() const { return cached_field_type; }
+  bool get_date(MYSQL_TIME *ltime,uint fuzzydate);
 };
 
 
@@ -764,6 +784,7 @@ public:
   void fix_length_and_dec();
   uint decimal_precision() const;
   const char *func_name() const { return "if"; }
+  bool eval_not_null_tables(uchar *opt_arg);
 };
 
 
@@ -930,6 +951,16 @@ public:
      lval_cache(0) {};
   void set(uint pos,Item *item);
   uchar *get_value(Item *item);
+  Item* create_item()
+  { 
+    return new Item_datetime();
+  }
+  void value_to_item(uint pos, Item *item)
+  {
+    packed_longlong *val= reinterpret_cast<packed_longlong*>(base)+pos;
+    Item_datetime *dt= reinterpret_cast<Item_datetime*>(item);
+    dt->set(val->val);
+  }
   friend int cmp_longlong(void *cmp_arg, packed_longlong *a,packed_longlong *b);
 };
 
@@ -989,7 +1020,8 @@ public:
   virtual int cmp(Item *item)= 0;
   // for optimized IN with row
   virtual int compare(cmp_item *item)= 0;
-  static cmp_item* get_comparator(Item_result type, CHARSET_INFO *cs);
+  static cmp_item* get_comparator(Item_result type, Item * warn_item,
+                                  CHARSET_INFO *cs);
   virtual cmp_item *make_same()= 0;
   virtual void store_value_by_template(cmp_item *tmpl, Item *item)
   {
@@ -1185,7 +1217,7 @@ class Item_func_case :public Item_func
   Item_result cmp_type;
   DTCollation cmp_collation;
   enum_field_types cached_field_type;
-  cmp_item *cmp_items[5]; /* For all result types */
+  cmp_item *cmp_items[6]; /* For all result types */
   cmp_item *case_item;
 public:
   Item_func_case(List<Item> &list, Item *first_expr_arg, Item *else_expr_arg)
@@ -1275,7 +1307,7 @@ public:
     Item_int_func::cleanup();
     delete array;
     array= 0;
-    for (i= 0; i <= (uint)DECIMAL_RESULT + 1; i++)
+    for (i= 0; i <= (uint)TIME_RESULT; i++)
     {
       delete cmp_items[i];
       cmp_items[i]= 0;
@@ -1290,6 +1322,7 @@ public:
   bool nulls_in_row();
   bool is_bool_func() { return 1; }
   CHARSET_INFO *compare_collation() { return cmp_collation.collation; }
+  bool eval_not_null_tables(uchar *opt_arg);
 };
 
 class cmp_item_row :public cmp_item
@@ -1325,8 +1358,6 @@ public:
 
 class Item_func_isnull :public Item_bool_func
 {
-protected:
-  longlong cached_value;
 public:
   Item_func_isnull(Item *a) :Item_bool_func(a) {}
   longlong val_int();
@@ -1344,17 +1375,12 @@ public:
     {
       used_tables_cache= 0;			/* is always false */
       const_item_cache= 1;
-      cached_value= (longlong) 0;
     }
     else
     {
       args[0]->update_used_tables();
-      if ((const_item_cache= !(used_tables_cache= args[0]->used_tables()) &&
-          !with_subselect))
-      {
-	/* Remember if the value is always NULL or never NULL */
-	cached_value= (longlong) args[0]->is_null();
-      }
+      used_tables_cache= args[0]->used_tables();
+      const_item_cache= args[0]->const_item();
     }
   }
   table_map not_null_tables() const { return 0; }
@@ -1387,6 +1413,7 @@ public:
   */
   table_map used_tables() const
     { return used_tables_cache | RAND_TABLE_BIT; }
+  bool const_item() const { return FALSE; }
 };
 
 
@@ -1536,6 +1563,7 @@ public:
   bool subst_argument_checker(uchar **arg) { return TRUE; }
   Item *compile(Item_analyzer analyzer, uchar **arg_p,
                 Item_transformer transformer, uchar *arg_t);
+  bool eval_not_null_tables(uchar *opt_arg);
 };
 
 
@@ -1616,28 +1644,64 @@ public:
 
 class Item_equal: public Item_bool_func
 {
-  List<Item_field> fields; /* list of equal field items                    */
-  Item *const_item;        /* optional constant item equal to fields items */
+  /*
+    The list of equal items. Currently the list can contain:
+     - Item_fields items for references to table columns
+     - Item_direct_view_ref items for references to view columns
+     - one const item
+
+    If the list contains a constant item this item is always first in the list.
+    The list contains at least two elements.
+    Currently all Item_fields/Item_direct_view_ref items in the list should
+    refer to table columns with equivalent type definitions. In particular
+    if these are string columns they should have the same charset/collation.
+
+    Use objects of the companion class Item_equal_fields_iterator to iterate
+    over all items from the list of the Item_field/Item_direct_view_ref classes.
+  */ 
+  List<Item> equal_items; 
+  /* 
+     TRUE <-> one of the items is a const item.
+     Such an item is always first in the equal_items list
+  */
+  bool with_const;        
+  /* 
+    The field eval_item is used when this item is evaluated
+    with the method val_int()
+  */  
   cmp_item *eval_item;
-  Arg_comparator cmp;
+  /*
+    This initially is set to FALSE. It becomes TRUE when this item is evaluated
+    as being always false. If the flag is TRUE the contents of the
+    equal_items list should be ignored.
+  */
   bool cond_false;
+  /* 
+    compare_as_dates=TRUE <-> constants equal to fields from equal_items
+    must be compared as datetimes and not as strings.
+    compare_as_dates can be TRUE only if with_const=TRUE 
+  */
   bool compare_as_dates;
+  /* 
+    The comparator used to compare constants equal to fields from equal_items
+    as datetimes. The comparator is used only if compare_as_dates=TRUE
+  */
+  Arg_comparator cmp;
 public:
   inline Item_equal()
-    : Item_bool_func(), const_item(0), eval_item(0), cond_false(0)
+    : Item_bool_func(), with_const(FALSE), eval_item(0), cond_false(0)
   { const_item_cache=0 ;}
-  Item_equal(Item_field *f1, Item_field *f2);
-  Item_equal(Item *c, Item_field *f);
+  Item_equal(Item *f1, Item *f2, bool with_const_item);
   Item_equal(Item_equal *item_equal);
-  inline Item* get_const() { return const_item; }
-  void compare_const(Item *c);
-  void add(Item *c, Item_field *f);
-  void add(Item *c);
-  void add(Item_field *f);
-  uint members();
+  /* Currently the const item is always the first in the list of equal items */
+  inline Item* get_const() { return with_const ? equal_items.head() : NULL; }
+  void add_const(Item *c, Item *f = NULL);
+  /** Add a non-constant item to the multiple equality */
+  void add(Item *f) { equal_items.push_back(f); }
   bool contains(Field *field);
-  Item_field* get_first(Item_field *field);
-  uint n_fields() { return fields.elements; }
+  Item* get_first(Item *field);
+  /** Get number of field items / references to field items in this object */   
+  uint n_field_items() { return equal_items.elements-test(with_const); }
   void merge(Item_equal *item);
   void update_const();
   enum Functype functype() const { return MULT_EQUAL_FUNC; }
@@ -1645,18 +1709,18 @@ public:
   const char *func_name() const { return "multiple equal"; }
   optimize_type select_optimize() const { return OPTIMIZE_EQUAL; }
   void sort(Item_field_cmpfunc compare, void *arg);
-  friend class Item_equal_iterator;
   void fix_length_and_dec();
   bool fix_fields(THD *thd, Item **ref);
   void update_used_tables();
   bool walk(Item_processor processor, bool walk_subquery, uchar *arg);
   Item *transform(Item_transformer transformer, uchar *arg);
   virtual void print(String *str, enum_query_type query_type);
-  CHARSET_INFO *compare_collation() 
-  { return fields.head()->collation.collation; }
+  CHARSET_INFO *compare_collation();
+  friend class Item_equal_fields_iterator;
   friend Item *eliminate_item_equal(COND *cond, COND_EQUAL *upper_levels,
                            Item_equal *item_equal);
-  friend bool setup_sj_materialization(struct st_join_table *tab);
+  friend bool setup_sj_materialization_part1(struct st_join_table *tab);
+  friend bool setup_sj_materialization_part2(struct st_join_table *tab);
 }; 
 
 class COND_EQUAL: public Sql_alloc
@@ -1674,23 +1738,52 @@ public:
 };
 
 
-class Item_equal_iterator : public List_iterator_fast<Item_field>
+/* 
+  The class Item_equal_fields_iterator is used to iterate over references
+  to table/view columns from a list of equal items.
+*/ 
+
+class Item_equal_fields_iterator : public List_iterator_fast<Item>
 {
+  Item_equal *item_equal;
+  Item *curr_item;
 public:
-  inline Item_equal_iterator(Item_equal &item_equal) 
-    :List_iterator_fast<Item_field> (item_equal.fields)
-  {}
-  inline Item_field* operator++(int)
-  { 
-    Item_field *item= (*(List_iterator_fast<Item_field> *) this)++;
-    return  item;
+  Item_equal_fields_iterator(Item_equal &item_eq) 
+    :List_iterator_fast<Item> (item_eq.equal_items)
+  {
+    curr_item= NULL;
+    item_equal= &item_eq;
+    if (item_eq.with_const)
+    {
+      List_iterator_fast<Item> *list_it= this;
+      curr_item=  (*list_it)++;
+    }
   }
-  inline void rewind(void) 
+  Item* operator++(int)
   { 
-    List_iterator_fast<Item_field>::rewind();
+    List_iterator_fast<Item> *list_it= this;
+    curr_item= (*list_it)++;
+    return curr_item;
+  }
+  Item ** ref()
+  {
+    return List_iterator_fast<Item>::ref();
   }
+  void rewind(void) 
+  { 
+    List_iterator_fast<Item> *list_it= this;
+    list_it->rewind();
+    if (item_equal->with_const)
+      curr_item= (*list_it)++;
+  }  
+  Field *get_curr_field()
+  {
+    Item_field *item= (Item_field *) (curr_item->real_item());
+     return item->field;
+  }  
 };
 
+
 class Item_cond_and :public Item_cond
 {
 public:
@@ -1746,6 +1839,14 @@ public:
   Item *neg_transformer(THD *thd);
 };
 
+class Item_func_dyncol_exists :public Item_bool_func
+{
+public:
+  Item_func_dyncol_exists(Item *str, Item *num) :Item_bool_func(str, num) {}
+  longlong val_int();
+  const char *func_name() const { return "column_exists"; }
+};
+
 inline bool is_cond_or(Item *item)
 {
   if (item->type() != Item::COND_ITEM)
@@ -1755,45 +1856,6 @@ inline bool is_cond_or(Item *item)
   return (cond_item->functype() == Item_func::COND_OR_FUNC);
 }
 
-/*
-  XOR is Item_cond, not an Item_int_func because we could like to
-  optimize (a XOR b) later on. It's low prio, though
-*/
-
-class Item_cond_xor :public Item_cond
-{
-public:
-  Item_cond_xor(Item *i1,Item *i2) :Item_cond(i1,i2) 
-  {
-    /* 
-      Items must be stored in args[] as well because this Item_cond is
-      treated as a FUNC_ITEM (see type()). I.e., users of it will get
-      it's children by calling arguments(), not argument_list(). This
-      is a temporary solution until XOR is optimized and treated like
-      a full Item_cond citizen.
-     */
-    arg_count= 2;
-    args= tmp_arg;
-    args[0]= i1; 
-    args[1]= i2;
-  }
-  enum Functype functype() const { return COND_XOR_FUNC; }
-  /* TODO: remove the next line when implementing XOR optimization */
-  enum Type type() const { return FUNC_ITEM; }
-  longlong val_int();
-  const char *func_name() const { return "xor"; }
-  void top_level_item() {}
-  /* Since child Items are stored in args[], Items cannot be added.
-     However, since Item_cond_xor is treated as a FUNC_ITEM (see
-     type()), the methods below should never be called. 
-  */
-  bool add(Item *item) { DBUG_ASSERT(FALSE); return FALSE; }
-  bool add_at_head(Item *item) { DBUG_ASSERT(FALSE); return FALSE; }
-  bool add_at_head(List<Item> *nlist) { DBUG_ASSERT(FALSE); return FALSE; }
-  void copy_andor_arguments(THD *thd, Item_cond *item) { DBUG_ASSERT(FALSE); }
-};
-
-
 /* Some useful inline functions */
 
 inline Item *and_conds(Item *a, Item *b)
@@ -1826,3 +1888,4 @@ extern Ge_creator ge_creator;
 extern Le_creator le_creator;
 
 #endif /* ITEM_CMPFUNC_INCLUDED */
+
diff --git a/sql/item_create.cc b/sql/item_create.cc
index bc3c904e5fd..afa78946fb8 100644
--- a/sql/item_create.cc
+++ b/sql/item_create.cc
@@ -1,4 +1,5 @@
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2010 Oracle and/or its affiliates.
+   Copyright (c) 2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -33,6 +34,69 @@
 
 /*
 =============================================================================
+  HELPER FUNCTIONS
+=============================================================================
+*/
+
+static const char* item_name(Item *a, String *str)
+{
+  if (a->name)
+    return a->name;
+  str->length(0);
+  a->print(str, QT_ORDINARY);
+  return str->c_ptr_safe();
+}
+
+
+static void wrong_precision_error(uint errcode, Item *a,
+                                  ulonglong number, ulong maximum)
+{
+  char buff[1024];
+  String buf(buff, sizeof(buff), system_charset_info);
+
+  my_error(errcode, MYF(0), (uint) min(number, UINT_MAX32),
+           item_name(a, &buf), maximum);
+}
+
+
+/**
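+  Check the precision and scale of a declaration and store them in
+  *out_length / *out_decimals (after my_decimal_trim); my_error() on failure.
+  return
+    0  ok
+    1  error (precision or scale too big, or precision smaller than scale)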
+*/
+
+bool get_length_and_scale(ulonglong length, ulonglong decimals,
+                          ulong *out_length, uint *out_decimals,
+                          uint max_precision, uint max_scale,
+                          Item *a)
+{
+  if (length > (ulonglong) max_precision)
+  {
+    wrong_precision_error(ER_TOO_BIG_PRECISION, a, length, max_precision);
+    return 1;
+  }
+  if (decimals > (ulonglong) max_scale)
+  {
+    wrong_precision_error(ER_TOO_BIG_SCALE, a, decimals, max_scale);
+    return 1;
+  }
+
+  *out_length=  (ulong) length;
+  *out_decimals=  (uint) decimals;
+  my_decimal_trim(out_length, out_decimals);
+  
+  if (*out_length < *out_decimals)
+  {
+    my_error(ER_M_BIGGER_THAN_D, MYF(0), "");
+    return 1;
+  }
+  return 0;
+}
+
+/*
+=============================================================================
   LOCAL DECLARATIONS
 =============================================================================
 */
@@ -5186,6 +5250,17 @@ create_func_cast(THD *thd, Item *a, Cast_target cast_type,
                  CHARSET_INFO *cs)
 {
   Item *UNINIT_VAR(res);
+  ulonglong length= 0, decimals= 0;
+  int error;
+  
+  /*
+    We don't have to check for error here as sql_yacc.yy has guaranteed
+    that the values are in range of ulonglong
+  */
+  if (c_len)
+    length= (ulonglong) my_strtoll10(c_len, NULL, &error);
+  if (c_dec)
+    decimals= (ulonglong) my_strtoll10(c_dec, NULL, &error);
 
   switch (cast_type) {
   case ITEM_CAST_BINARY:
@@ -5201,62 +5276,50 @@ create_func_cast(THD *thd, Item *a, Cast_target cast_type,
     res= new (thd->mem_root) Item_date_typecast(a);
     break;
   case ITEM_CAST_TIME:
-    res= new (thd->mem_root) Item_time_typecast(a);
-    break;
-  case ITEM_CAST_DATETIME:
-    res= new (thd->mem_root) Item_datetime_typecast(a);
-    break;
-  case ITEM_CAST_DECIMAL:
-  {
-    ulong len= 0;
-    uint dec= 0;
-
-    if (c_len)
+    if (decimals > MAX_DATETIME_PRECISION)
     {
-      ulong decoded_size;
-      errno= 0;
-      decoded_size= strtoul(c_len, NULL, 10);
-      if (errno != 0)
-      {
-        my_error(ER_TOO_BIG_PRECISION, MYF(0), INT_MAX, a->name,
-                 static_cast<ulong>(DECIMAL_MAX_PRECISION));
-        return NULL;
-      }
-      len= decoded_size;
-    }
-
-    if (c_dec)
-    {
-      ulong decoded_size;
-      errno= 0;
-      decoded_size= strtoul(c_dec, NULL, 10);
-      if ((errno != 0) || (decoded_size > UINT_MAX))
-      {
-        my_error(ER_TOO_BIG_SCALE, MYF(0), INT_MAX, a->name,
-                 static_cast<ulong>(DECIMAL_MAX_SCALE));
-        return NULL;
-      }
-      dec= decoded_size;
-    }
-    my_decimal_trim(&len, &dec);
-    if (len < dec)
-    {
-      my_error(ER_M_BIGGER_THAN_D, MYF(0), "");
+      wrong_precision_error(ER_TOO_BIG_PRECISION, a, decimals,
+                            MAX_DATETIME_PRECISION);
       return 0;
     }
-    if (len > DECIMAL_MAX_PRECISION)
+    res= new (thd->mem_root) Item_time_typecast(a, (uint) decimals);
+    break;
+  case ITEM_CAST_DATETIME:
+    if (decimals > MAX_DATETIME_PRECISION)
     {
-      my_error(ER_TOO_BIG_PRECISION, MYF(0), static_cast<int>(len), a->name,
-               static_cast<ulong>(DECIMAL_MAX_PRECISION));
+      wrong_precision_error(ER_TOO_BIG_PRECISION, a, decimals,
+                            MAX_DATETIME_PRECISION);
       return 0;
     }
-    if (dec > DECIMAL_MAX_SCALE)
+    res= new (thd->mem_root) Item_datetime_typecast(a, (uint) decimals);
+    break;
+  case ITEM_CAST_DECIMAL:
+  {
+    ulong len;
+    uint dec;
+    if (get_length_and_scale(length, decimals, &len, &dec,
+                             DECIMAL_MAX_PRECISION, DECIMAL_MAX_SCALE,
+                             a))
+      return NULL;
+    res= new (thd->mem_root) Item_decimal_typecast(a, len, dec);
+    break;
+  }
+  case ITEM_CAST_DOUBLE:
+  {
+    ulong len;
+    uint dec;
+
+    if (!c_len)
     {
-      my_error(ER_TOO_BIG_SCALE, MYF(0), dec, a->name,
-               static_cast<ulong>(DECIMAL_MAX_SCALE));
-      return 0;
+      length=   DBL_DIG+7;
+      decimals= NOT_FIXED_DEC;
     }
-    res= new (thd->mem_root) Item_decimal_typecast(a, len, dec);
+    else if (get_length_and_scale(length, decimals, &len, &dec,
+                                  DECIMAL_MAX_PRECISION, NOT_FIXED_DEC-1,
+                                  a))
+      return NULL;
+    res= new (thd->mem_root) Item_double_typecast(a, (uint) length,
+                                                  (uint) decimals);
     break;
   }
   case ITEM_CAST_CHAR:
@@ -5265,15 +5328,15 @@ create_func_cast(THD *thd, Item *a, Cast_target cast_type,
     CHARSET_INFO *real_cs= (cs ? cs : thd->variables.collation_connection);
     if (c_len)
     {
-      ulong decoded_size;
-      errno= 0;
-      decoded_size= strtoul(c_len, NULL, 10);
-      if ((errno != 0) || (decoded_size > MAX_FIELD_BLOBLENGTH))
+      if (length > MAX_FIELD_BLOBLENGTH)
       {
-        my_error(ER_TOO_BIG_DISPLAYWIDTH, MYF(0), "cast as char", MAX_FIELD_BLOBLENGTH);
+        char buff[1024];
+        String buf(buff, sizeof(buff), system_charset_info);
+        my_error(ER_TOO_BIG_DISPLAYWIDTH, MYF(0), item_name(a, &buf),
+                 MAX_FIELD_BLOBLENGTH);
         return NULL;
       }
-      len= (int) decoded_size;
+      len= (int) length;
     }
     res= new (thd->mem_root) Item_char_typecast(a, len, real_cs);
     break;
@@ -5287,3 +5350,95 @@ create_func_cast(THD *thd, Item *a, Cast_target cast_type,
   }
   return res;
 }
+
+
+static List<Item> *create_func_dyncol_prepare(THD *thd,
+                                              DYNCALL_CREATE_DEF **dfs,
+                                              List<DYNCALL_CREATE_DEF> &list)
+{
+  DYNCALL_CREATE_DEF *def;
+  List_iterator_fast<DYNCALL_CREATE_DEF> li(list);
+  List<Item> *args= new (thd->mem_root) List<Item>;
+
+  *dfs= (DYNCALL_CREATE_DEF *)alloc_root(thd->mem_root,
+                                         sizeof(DYNCALL_CREATE_DEF) *
+                                         list.elements);
+
+  if (!args || !*dfs)
+    return NULL;
+
+  for (uint i= 0; (def= li++) ;)
+  {
+    dfs[0][i++]= *def;
+    args->push_back(def->num);
+    args->push_back(def->value);
+  }
+  return args;
+}
+
+Item *create_func_dyncol_create(THD *thd, List<DYNCALL_CREATE_DEF> &list)
+{
+  List<Item> *args;
+  DYNCALL_CREATE_DEF *dfs;
+  if (!(args= create_func_dyncol_prepare(thd, &dfs, list)))
+    return NULL;
+
+  return new (thd->mem_root) Item_func_dyncol_create(*args, dfs);
+}
+
+
+Item *create_func_dyncol_add(THD *thd, Item *str,
+                             List<DYNCALL_CREATE_DEF> &list)
+{
+  List<Item> *args;
+  DYNCALL_CREATE_DEF *dfs;
+
+  if (!(args= create_func_dyncol_prepare(thd, &dfs, list)))
+    return NULL;
+
+  args->push_back(str);
+
+  return new (thd->mem_root) Item_func_dyncol_add(*args, dfs);
+}
+
+
+
+Item *create_func_dyncol_delete(THD *thd, Item *str, List<Item> &nums)
+{
+  DYNCALL_CREATE_DEF *dfs;
+  Item *num;
+  List_iterator_fast<Item> it(nums);
+  List<Item> *args= new (thd->mem_root) List<Item>;
+
+  dfs= (DYNCALL_CREATE_DEF *)alloc_root(thd->mem_root,
+                                        sizeof(DYNCALL_CREATE_DEF) *
+                                        nums.elements);
+  if (!args || !dfs)
+    return NULL;
+
+  for (uint i= 0; (num= it++); i++)
+  {
+    dfs[i].num= num;
+    dfs[i].value= new Item_null();
+    dfs[i].type= DYN_COL_INT;
+    args->push_back(dfs[i].num);
+    args->push_back(dfs[i].value);
+  }
+
+  args->push_back(str);
+
+  return new (thd->mem_root) Item_func_dyncol_add(*args, dfs);
+}
+
+
+Item *create_func_dyncol_get(THD *thd,  Item *str, Item *num,
+                             Cast_target cast_type,
+                             const char *c_len, const char *c_dec,
+                             CHARSET_INFO *cs)
+{
+  Item *res;
+
+  if (!(res= new (thd->mem_root) Item_dyncol_get(str, num)))
+    return res;                                 // Return NULL
+  return create_func_cast(thd, res, cast_type, c_len, c_dec, cs);
+}
diff --git a/sql/item_create.h b/sql/item_create.h
index 3ef4b0efbf3..457879a618f 100644
--- a/sql/item_create.h
+++ b/sql/item_create.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2000, 2010 Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2010 Oracle and/or its affiliates.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -167,8 +167,17 @@ create_func_cast(THD *thd, Item *a, Cast_target cast_type,
                  const char *len, const char *dec,
                  CHARSET_INFO *cs);
 
+
 int item_create_init();
 void item_create_cleanup();
 
+Item *create_func_dyncol_create(THD *thd, List<DYNCALL_CREATE_DEF> &list);
+Item *create_func_dyncol_add(THD *thd, Item *str,
+                             List<DYNCALL_CREATE_DEF> &list);
+Item *create_func_dyncol_delete(THD *thd, Item *str, List<Item> &nums);
+Item *create_func_dyncol_get(THD *thd, Item *str, Item *num,
+                             Cast_target cast_type,
+                             const char *c_len, const char *c_dec,
+                             CHARSET_INFO *cs);
 #endif
 
diff --git a/sql/item_func.cc b/sql/item_func.cc
index 7eea131e648..bbfa3b74c5e 100644
--- a/sql/item_func.cc
+++ b/sql/item_func.cc
@@ -1,4 +1,5 @@
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2010, Oracle and/or its affiliates.
+   Copyright (c) 2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -38,7 +39,7 @@
 #include "strfunc.h"                            // find_type
 #include "sql_parse.h"                          // is_update_query
 #include "sql_acl.h"                            // EXECUTE_ACL
-#include "mysqld.h"                             // LOCK_uuid_generator
+#include "mysqld.h"                             // LOCK_short_uuid_generator
 #include "rpl_mi.h"
 #include <m_ctype.h>
 #include <hash.h>
@@ -103,6 +104,7 @@ void Item_func::set_arguments(List<Item> &list)
     {
       *(save_args++)= item;
       with_sum_func|=item->with_sum_func;
+      with_field|= item->with_field;
     }
   }
   list.empty();					// Fields are used
@@ -153,6 +155,7 @@ Item_func::Item_func(THD *thd, Item_func *item)
     Sets as a side effect the following class variables:
       maybe_null	Set if any argument may return NULL
       with_sum_func	Set if any of the arguments contains a sum function
+      with_field        Set if any of the arguments contains or is a field
       used_tables_cache Set to union of the tables used by arguments
 
       str_value.charset If this is a string function, set this to the
@@ -220,8 +223,8 @@ Item_func::fix_fields(THD *thd, Item **ref)
 	maybe_null=1;
 
       with_sum_func= with_sum_func || item->with_sum_func;
+      with_field= with_field || item->with_field;
       used_tables_cache|=     item->used_tables();
-      not_null_tables_cache|= item->not_null_tables();
       const_item_cache&=      item->const_item();
       with_subselect|=        item->with_subselect;
     }
@@ -234,6 +237,37 @@ Item_func::fix_fields(THD *thd, Item **ref)
   return FALSE;
 }
 
+void
+Item_func::quick_fix_field()
+{
+  Item **arg,**arg_end;
+  if (arg_count)
+  {
+    for (arg=args, arg_end=args+arg_count; arg != arg_end ; arg++)
+    {
+      if (!(*arg)->fixed)
+        (*arg)->quick_fix_field();
+    }
+  }
+  fixed= 1;
+}
+
+
+bool
+Item_func::eval_not_null_tables(uchar *opt_arg)
+{
+  Item **arg,**arg_end;
+  not_null_tables_cache= 0;
+  if (arg_count)
+  {		
+    for (arg=args, arg_end=args+arg_count; arg != arg_end ; arg++)
+    {
+      not_null_tables_cache|= (*arg)->not_null_tables();
+    }
+  }
+  return FALSE;
+}
+
 
 void Item_func::fix_after_pullout(st_select_lex *new_parent, Item **ref)
 {
@@ -356,6 +390,8 @@ Item *Item_func::transform(Item_transformer transformer, uchar *argument)
     the old item is substituted for a new one.
     After this the transformer is applied to the root node
     of the Item_func object. 
+    The compile function is not called for the arguments if the analyzer
+    sets *arg_p to NULL.
 
   @param analyzer      the analyzer callback function to be applied to the
                        nodes of the tree of the object
@@ -373,7 +409,7 @@ Item *Item_func::compile(Item_analyzer analyzer, uchar **arg_p,
 {
   if (!(this->*analyzer)(arg_p))
     return 0;
-  if (arg_count)
+  if (*arg_p && arg_count)
   {
     Item **arg,**arg_end;
     for (arg= args, arg_end= args+arg_count; arg != arg_end; arg++)
@@ -381,7 +417,7 @@ Item *Item_func::compile(Item_analyzer analyzer, uchar **arg_p,
       /* 
         The same parameter value of arg_p must be passed
         to analyze any argument of the condition formula.
-      */   
+      */
       uchar *arg_v= *arg_p;
       Item *new_item= (*arg)->compile(analyzer, &arg_v, transformer, arg_t);
       if (new_item && *arg != new_item)
@@ -509,7 +545,8 @@ Field *Item_func::tmp_table_field(TABLE *table)
     field= Field_new_decimal::create_from_item(this);
     break;
   case ROW_RESULT:
-  default:
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
     // This case should never be chosen
     DBUG_ASSERT(0);
     field= 0;
@@ -717,8 +754,8 @@ void Item_num_op::find_num_type(void)
   DBUG_ENTER("Item_num_op::find_num_type");
   DBUG_PRINT("info", ("name %s", func_name()));
   DBUG_ASSERT(arg_count == 2);
-  Item_result r0= args[0]->result_type();
-  Item_result r1= args[1]->result_type();
+  Item_result r0= args[0]->cast_to_int_type();
+  Item_result r1= args[1]->cast_to_int_type();
 
   if (r0 == REAL_RESULT || r1 == REAL_RESULT ||
       r0 == STRING_RESULT || r1 ==STRING_RESULT)
@@ -727,7 +764,8 @@ void Item_num_op::find_num_type(void)
     max_length= float_length(decimals);
     hybrid_type= REAL_RESULT;
   }
-  else if (r0 == DECIMAL_RESULT || r1 == DECIMAL_RESULT)
+  else if (r0 == DECIMAL_RESULT || r1 == DECIMAL_RESULT ||
+           r0 == TIME_RESULT || r1 == TIME_RESULT)
   {
     hybrid_type= DECIMAL_RESULT;
     result_precision();
@@ -758,7 +796,7 @@ void Item_func_num1::find_num_type()
 {
   DBUG_ENTER("Item_func_num1::find_num_type");
   DBUG_PRINT("info", ("name %s", func_name()));
-  switch (hybrid_type= args[0]->result_type()) {
+  switch (hybrid_type= args[0]->cast_to_int_type()) {
   case INT_RESULT:
     unsigned_flag= args[0]->unsigned_flag;
     break;
@@ -767,9 +805,12 @@ void Item_func_num1::find_num_type()
     hybrid_type= REAL_RESULT;
     max_length= float_length(decimals);
     break;
+  case TIME_RESULT:
+    hybrid_type= DECIMAL_RESULT;
   case DECIMAL_RESULT:
     break;
-  default:
+  case ROW_RESULT:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);
   }
   DBUG_PRINT("info", ("Type: %s",
@@ -827,7 +868,9 @@ String *Item_func_numhybrid::val_str(String *str)
   }
   case STRING_RESULT:
     return str_op(&str_value);
-  default:
+  case TIME_RESULT:
+  case ROW_RESULT:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);
   }
   return str;
@@ -862,7 +905,9 @@ double Item_func_numhybrid::val_real()
     return (res ? my_strntod(res->charset(), (char*) res->ptr(), res->length(),
 			     &end_not_used, &err_not_used) : 0.0);
   }
-  default:
+  case TIME_RESULT:
+  case ROW_RESULT:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);
   }
   return 0.0;
@@ -897,7 +942,9 @@ longlong Item_func_numhybrid::val_int()
     CHARSET_INFO *cs= res->charset();
     return (*(cs->cset->strtoll10))(cs, res->ptr(), &end, &err_not_used);
   }
-  default:
+  case TIME_RESULT:
+  case ROW_RESULT:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);
   }
   return 0;
@@ -935,7 +982,8 @@ my_decimal *Item_func_numhybrid::val_decimal(my_decimal *decimal_value)
     break;
   }  
   case ROW_RESULT:
-  default:
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);
   }
   return val;
@@ -996,21 +1044,32 @@ longlong Item_func_signed::val_int()
   longlong value;
   int error;
 
-  if (args[0]->cast_to_int_type() != STRING_RESULT ||
-      args[0]->result_as_longlong())
+  if (args[0]->cast_to_int_type() != STRING_RESULT)
+  {
+    value= args[0]->val_int();
+    null_value= args[0]->null_value; 
+    return value;
+  }
+  else if (args[0]->dynamic_result())
   {
+    /* We come here when argument has an unknown type */
+    args[0]->unsigned_flag= 0;   // Mark that we want to have a signed value
     value= args[0]->val_int();
     null_value= args[0]->null_value; 
+    if (!null_value && args[0]->unsigned_flag && value < 0)
+      goto err;                                 // Warn about overflow
     return value;
   }
 
   value= val_int_from_str(&error);
   if (value < 0 && error == 0)
-  {
-    push_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR,
-                 "Cast to signed converted positive out-of-range integer to "
-                 "it's negative complement");
-  }
+    goto err;
+  return value;
+
+err:
+  push_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR,
+               "Cast to signed converted positive out-of-range integer to "
+               "it's negative complement");
   return value;
 }
 
@@ -1038,19 +1097,35 @@ longlong Item_func_unsigned::val_int()
       value= 0;
     return value;
   }
-  else if (args[0]->cast_to_int_type() != STRING_RESULT ||
-           args[0]->result_as_longlong())
+  else if (args[0]->dynamic_result())
+  {
+    /* We come here when argument has an unknown type */
+    args[0]->unsigned_flag= 1;   // Mark that we want to have an unsigned value
+    value= args[0]->val_int();
+    null_value= args[0]->null_value; 
+    if (!null_value && args[0]->unsigned_flag == 0 && value < 0)
+      goto err;                                 // Warn about overflow
+    return value;
+  }
+  else if (args[0]->cast_to_int_type() != STRING_RESULT)
   {
     value= args[0]->val_int();
     null_value= args[0]->null_value; 
+    if (!null_value && args[0]->unsigned_flag == 0 && value < 0)
+      goto err;                                 // Warn about overflow
     return value;
   }
 
   value= val_int_from_str(&error);
   if (error < 0)
-    push_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR,
-                 "Cast to unsigned converted negative integer to it's "
-                 "positive complement");
+    goto err;
+
+  return value;
+
+err:
+  push_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR,
+               "Cast to unsigned converted negative integer to it's "
+               "positive complement");
   return value;
 }
 
@@ -1148,6 +1223,51 @@ void Item_decimal_typecast::print(String *str, enum_query_type query_type)
 }
 
 
+double Item_double_typecast::val_real()
+{
+  int error;
+  double tmp= args[0]->val_real();
+  if ((null_value= args[0]->null_value))
+    return 0.0;
+
+  if ((error= truncate_double(&tmp, max_length, decimals, 0, DBL_MAX)))
+  {
+    push_warning_printf(current_thd,
+                        MYSQL_ERROR::WARN_LEVEL_WARN,
+                        ER_WARN_DATA_OUT_OF_RANGE,
+                        ER(ER_WARN_DATA_OUT_OF_RANGE),
+                        name, 1);
+    if (error < 0)
+    {
+      null_value= 1;                            // Illegal value
+      tmp= 0.0;
+    }
+  }
+  return tmp;
+}
+
+
+void Item_double_typecast::print(String *str, enum_query_type query_type)
+{
+  char len_buf[20*3 + 1];
+  char *end;
+
+  str->append(STRING_WITH_LEN("cast("));
+  args[0]->print(str, query_type);
+  str->append(STRING_WITH_LEN(" as double"));
+  if (decimals != NOT_FIXED_DEC)
+  {
+    str->append('(');
+    end= int10_to_str(max_length, len_buf,10);
+    str->append(len_buf, (uint32) (end - len_buf));
+    str->append(',');
+    end= int10_to_str(decimals, len_buf,10);
+    str->append(len_buf, (uint32) (end - len_buf));
+    str->append(')');
+  }
+  str->append(')');
+}
+
 double Item_func_plus::real_op()
 {
   double value= args[0]->val_real() + args[1]->val_real();
@@ -1566,7 +1686,7 @@ void Item_func_div::fix_length_and_dec()
   DBUG_ENTER("Item_func_div::fix_length_and_dec");
   prec_increment= current_thd->variables.div_precincrement;
   Item_num_op::fix_length_and_dec();
-  switch(hybrid_type) {
+  switch (hybrid_type) {
   case REAL_RESULT:
   {
     decimals=max(args[0]->decimals,args[1]->decimals)+prec_increment;
@@ -1589,7 +1709,10 @@ void Item_func_div::fix_length_and_dec()
   case DECIMAL_RESULT:
     result_precision();
     break;
-  default:
+  case STRING_RESULT:
+  case ROW_RESULT:
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);
   }
   maybe_null= 1; // devision by zero
@@ -2130,7 +2253,7 @@ void Item_func_int_val::find_num_type()
 {
   DBUG_ENTER("Item_func_int_val::find_num_type");
   DBUG_PRINT("info", ("name %s", func_name()));
-  switch(hybrid_type= args[0]->result_type())
+  switch (hybrid_type= args[0]->cast_to_int_type())
   {
   case STRING_RESULT:
   case REAL_RESULT:
@@ -2138,6 +2261,7 @@ void Item_func_int_val::find_num_type()
     max_length= float_length(decimals);
     break;
   case INT_RESULT:
+  case TIME_RESULT:
   case DECIMAL_RESULT:
     /*
       -2 because in most high position can't be used any digit for longlong
@@ -2154,7 +2278,8 @@ void Item_func_int_val::find_num_type()
       hybrid_type= INT_RESULT;
     }
     break;
-  default:
+  case ROW_RESULT:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);
   }
   DBUG_PRINT("info", ("Type: %s",
@@ -2283,7 +2408,7 @@ void Item_func_round::fix_length_and_dec()
   }
 
   val1= args[1]->val_int();
-  if ((null_value= args[1]->is_null()))
+  if ((null_value= args[1]->null_value))
     return;
 
   val1_unsigned= args[1]->unsigned_flag;
@@ -2333,7 +2458,9 @@ void Item_func_round::fix_length_and_dec()
                                                              unsigned_flag);
     break;
   }
-  default:
+  case ROW_RESULT:
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0); /* This result type isn't handled */
   }
 }
@@ -2541,10 +2668,10 @@ double Item_func_units::val_real()
 void Item_func_min_max::fix_length_and_dec()
 {
   int max_int_part=0;
-  bool datetime_found= FALSE;
   decimals=0;
   max_length=0;
   maybe_null=0;
+  thd= current_thd;
   cmp_type=args[0]->result_type();
 
   for (uint i=0 ; i < arg_count ; i++)
@@ -2553,25 +2680,12 @@ void Item_func_min_max::fix_length_and_dec()
     set_if_bigger(decimals, args[i]->decimals);
     set_if_bigger(max_int_part, args[i]->decimal_int_part());
     if (args[i]->maybe_null)
-      maybe_null=1;
-    cmp_type=item_cmp_type(cmp_type,args[i]->result_type());
-    if (args[i]->result_type() != ROW_RESULT && args[i]->is_datetime())
-    {
-      datetime_found= TRUE;
-      if (!datetime_item || args[i]->field_type() == MYSQL_TYPE_DATETIME)
-        datetime_item= args[i];
-    }
+      maybe_null= 1;
+    cmp_type= item_cmp_type(cmp_type,args[i]->result_type());
   }
   if (cmp_type == STRING_RESULT)
-  {
     agg_arg_charsets_for_string_result_with_comparison(collation,
                                                        args, arg_count);
-    if (datetime_found)
-    {
-      thd= current_thd;
-      compare_as_dates= TRUE;
-    }
-  }
   else if ((cmp_type == DECIMAL_RESULT) || (cmp_type == INT_RESULT))
   {
     collation.set_numeric();
@@ -2582,61 +2696,73 @@ void Item_func_min_max::fix_length_and_dec()
   }
   else if (cmp_type == REAL_RESULT)
     fix_char_length(float_length(decimals));
-  cached_field_type= agg_field_type(args, arg_count);
+
+  compare_as_dates= find_date_time_item(args, arg_count, 0);
+  if (compare_as_dates)
+  {
+    cached_field_type= compare_as_dates->field_type();
+    if (mysql_type_to_time_type(cached_field_type) == MYSQL_TIMESTAMP_DATE)
+      decimals= 0;
+    else
+      set_if_smaller(decimals, TIME_SECOND_PART_DIGITS);
+  }
+  else
+    cached_field_type= agg_field_type(args, arg_count);
 }
 
 
 /*
   Compare item arguments in the DATETIME context.
 
-  SYNOPSIS
-    cmp_datetimes()
-    value [out]   found least/greatest DATE/DATETIME value
-
   DESCRIPTION
     Compare item arguments as DATETIME values and return the index of the
     least/greatest argument in the arguments array.
-    The correct integer DATE/DATETIME value of the found argument is
+    The correct DATE/DATETIME value of the found argument is
     stored to the value pointer, if latter is provided.
 
   RETURN
-   0	If one of arguments is NULL or there was a execution error
-   #	index of the least/greatest argument
+   1    If one of the arguments is NULL or there was an execution error
+   0    Otherwise
 */
 
-uint Item_func_min_max::cmp_datetimes(ulonglong *value)
+bool Item_func_min_max::get_date(MYSQL_TIME *ltime, uint fuzzy_date)
 {
   longlong UNINIT_VAR(min_max);
-  uint min_max_idx= 0;
+  DBUG_ASSERT(fixed == 1);
+
+  /*
+    Just as the ::val_int() method of a string item can be called
+    (for example, SELECT CONCAT("10", "12") + 1),
+    ::get_date() can be called for non-temporal values
+    (for example, SELECT MONTH(GREATEST("2011-11-21", "2010-10-09"))).
+
+  */
+  if (!compare_as_dates)
+    return Item_func::get_date(ltime, fuzzy_date);
 
   for (uint i=0; i < arg_count ; i++)
   {
     Item **arg= args + i;
     bool is_null;
-    longlong res= get_datetime_value(thd, &arg, 0, datetime_item, &is_null);
+    longlong res= get_datetime_value(thd, &arg, 0, compare_as_dates, &is_null);
 
-    /* Check if we need to stop (because of error or KILL)  and stop the loop */
-    if (thd->is_error())
+    /* Check if we need to stop (because of error or KILL) and stop the loop */
+    if (thd->is_error() || args[i]->null_value)
     {
-      null_value= 1;
-      return 0;
+      return (null_value= 1);
     }
 
-    if ((null_value= args[i]->null_value))
-      return 0;
     if (i == 0 || (res < min_max ? cmp_sign : -cmp_sign) > 0)
-    {
       min_max= res;
-      min_max_idx= i;
-    }
   }
-  if (value)
+  unpack_time(min_max, ltime);
+  if (compare_as_dates->field_type() == MYSQL_TYPE_DATE)
   {
-    *value= min_max;
-    if (datetime_item->field_type() == MYSQL_TYPE_DATE)
-      *value/= 1000000L;
+    ltime->time_type= MYSQL_TIMESTAMP_DATE;
+    ltime->hour= ltime->minute= ltime->second= ltime->second_part= 0;
   }
-  return min_max_idx;
+
+  return (null_value= 0);
 }
 
 
@@ -2644,46 +2770,14 @@ String *Item_func_min_max::val_str(String *str)
 {
   DBUG_ASSERT(fixed == 1);
   if (compare_as_dates)
-  {
-    String *str_res;
-    uint min_max_idx= cmp_datetimes(NULL);
-    if (null_value)
-      return 0;
-    str_res= args[min_max_idx]->val_str(str);
-    if (args[min_max_idx]->null_value)
-    {
-      // check if the call to val_str() above returns a NULL value
-      null_value= 1;
-      return NULL;
-    }
-    str_res->set_charset(collation.collation);
-    return str_res;
-  }
+    return val_string_from_date(str);
   switch (cmp_type) {
   case INT_RESULT:
-  {
-    longlong nr=val_int();
-    if (null_value)
-      return 0;
-    str->set_int(nr, unsigned_flag, collation.collation);
-    return str;
-  }
+    return val_string_from_int(str);
   case DECIMAL_RESULT:
-  {
-    my_decimal dec_buf, *dec_val= val_decimal(&dec_buf);
-    if (null_value)
-      return 0;
-    my_decimal2string(E_DEC_FATAL_ERROR, dec_val, 0, 0, 0, str);
-    return str;
-  }
+    return val_string_from_decimal(str);
   case REAL_RESULT:
-  {
-    double nr= val_real();
-    if (null_value)
-      return 0; /* purecov: inspected */
-    str->set_real(nr, decimals, collation.collation);
-    return str;
-  }
+    return val_string_from_real(str);
   case STRING_RESULT:
   {
     String *UNINIT_VAR(res);
@@ -2709,9 +2803,9 @@ String *Item_func_min_max::val_str(String *str)
     return res;
   }
   case ROW_RESULT:
-  default:
-    // This case should never be chosen
-    DBUG_ASSERT(0);
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
+    DBUG_ASSERT(0);                // This case should never be chosen
     return 0;
   }
   return 0;					// Keep compiler happy
@@ -2724,9 +2818,11 @@ double Item_func_min_max::val_real()
   double value=0.0;
   if (compare_as_dates)
   {
-    ulonglong result= 0;
-    (void)cmp_datetimes(&result);
-    return (double)result;
+    MYSQL_TIME ltime;
+    if (get_date(&ltime, TIME_FUZZY_DATE))
+      return 0;
+
+    return TIME_to_double(&ltime);
   }
   for (uint i=0; i < arg_count ; i++)
   {
@@ -2751,9 +2847,11 @@ longlong Item_func_min_max::val_int()
   longlong value=0;
   if (compare_as_dates)
   {
-    ulonglong result= 0;
-    (void)cmp_datetimes(&result);
-    return (longlong)result;
+    MYSQL_TIME ltime;
+    if (get_date(&ltime, TIME_FUZZY_DATE))
+      return 0;
+
+    return TIME_to_ulonglong(&ltime);
   }
   for (uint i=0; i < arg_count ; i++)
   {
@@ -2779,10 +2877,11 @@ my_decimal *Item_func_min_max::val_decimal(my_decimal *dec)
 
   if (compare_as_dates)
   {
-    ulonglong value= 0;
-    (void)cmp_datetimes(&value);
-    ulonglong2decimal(value, dec);
-    return dec;
+    MYSQL_TIME ltime;
+    if (get_date(&ltime, TIME_FUZZY_DATE))
+      return 0;
+
+    return date2my_decimal(&ltime, dec);
   }
   for (uint i=0; i < arg_count ; i++)
   {
@@ -3230,6 +3329,7 @@ udf_handler::fix_fields(THD *thd, Item_result_field *func,
       if (item->maybe_null)
 	func->maybe_null=1;
       func->with_sum_func= func->with_sum_func || item->with_sum_func;
+      func->with_field= func->with_field || item->with_field;
       used_tables_cache|=item->used_tables();
       const_item_cache&=item->const_item();
       f_args.arg_type[i]=item->result_type();
@@ -3275,8 +3375,7 @@ udf_handler::fix_fields(THD *thd, Item_result_field *func,
 
       if (arguments[i]->const_item())
       {
-        switch (arguments[i]->result_type()) 
-        {
+        switch (arguments[i]->result_type()) {
         case STRING_RESULT:
         case DECIMAL_RESULT:
         {
@@ -3302,9 +3401,9 @@ udf_handler::fix_fields(THD *thd, Item_result_field *func,
           to+= ALIGN_SIZE(sizeof(double));
           break;
         case ROW_RESULT:
-        default:
-          // This case should never be chosen
-          DBUG_ASSERT(0);
+        case TIME_RESULT:
+        case IMPOSSIBLE_RESULT:
+          DBUG_ASSERT(0);          // This case should never be chosen
           break;
         }
       }
@@ -3377,9 +3476,9 @@ bool udf_handler::get_arguments()
       }
       break;
     case ROW_RESULT:
-    default:
-      // This case should never be chosen
-      DBUG_ASSERT(0);
+    case TIME_RESULT:
+    case IMPOSSIBLE_RESULT:
+      DBUG_ASSERT(0);              // This case should never be chosen
       break;
     }
   }
@@ -4112,9 +4211,9 @@ longlong Item_func_benchmark::val_int()
       (void) args[1]->val_decimal(&tmp_decimal);
       break;
     case ROW_RESULT:
-    default:
-      // This case should never be chosen
-      DBUG_ASSERT(0);
+    case TIME_RESULT:
+    case IMPOSSIBLE_RESULT:
+      DBUG_ASSERT(0);              // This case should never be chosen
       return 0;
     }
   }
@@ -4297,6 +4396,21 @@ bool Item_func_set_user_var::fix_fields(THD *thd, Item **ref)
                          DERIVATION_IMPLICIT);
   collation.set(entry->collation.collation, DERIVATION_IMPLICIT);
   cached_result_type= args[0]->result_type();
+  if (thd->lex->current_select)
+  {
+    /*
+      When this function is used in a derived table/view force the derived
+      table to be materialized to preserve possible side-effect of setting a
+      user variable.
+    */
+    SELECT_LEX_UNIT *unit= thd->lex->current_select->master_unit();
+    TABLE_LIST *derived;
+    for (derived= unit->derived;
+         derived;
+         derived= derived->select_lex->master_unit()->derived)
+      derived->set_materialized_derived();
+  }
+
   return FALSE;
 }
 
@@ -4481,6 +4595,8 @@ double user_var_entry::val_real(bool *null_value)
   case STRING_RESULT:
     return my_atof(value);                      // This is null terminated
   case ROW_RESULT:
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);				// Impossible
     break;
   }
@@ -4512,6 +4628,8 @@ longlong user_var_entry::val_int(bool *null_value) const
     return my_strtoll10(value, (char**) 0, &error);// String is null terminated
   }
   case ROW_RESULT:
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);				// Impossible
     break;
   }
@@ -4545,6 +4663,8 @@ String *user_var_entry::val_str(bool *null_value, String *str,
       str= 0;					// EOM error
     break;
   case ROW_RESULT:
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);				// Impossible
     break;
   }
@@ -4572,6 +4692,8 @@ my_decimal *user_var_entry::val_decimal(bool *null_value, my_decimal *val)
     str2my_decimal(E_DEC_FATAL_ERROR, value, length, collation.collation, val);
     break;
   case ROW_RESULT:
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);				// Impossible
     break;
   }
@@ -4610,8 +4732,9 @@ Item_func_set_user_var::check(bool use_result_field)
   {
     save_result.vint= use_result_field ? result_field->val_int() :
                        args[0]->val_int();
-    unsigned_flag= use_result_field ? ((Field_num*)result_field)->unsigned_flag:
-                    args[0]->unsigned_flag;
+    unsigned_flag= (use_result_field ?
+                    ((Field_num*)result_field)->unsigned_flag:
+                    args[0]->unsigned_flag);
     break;
   }
   case STRING_RESULT:
@@ -4628,9 +4751,9 @@ Item_func_set_user_var::check(bool use_result_field)
     break;
   }
   case ROW_RESULT:
-  default:
-    // This case should never be chosen
-    DBUG_ASSERT(0);
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
+    DBUG_ASSERT(0);                // This case should never be chosen
     break;
   }
   DBUG_RETURN(FALSE);
@@ -4663,9 +4786,9 @@ void Item_func_set_user_var::save_item_result(Item *item)
     save_result.vdec= item->val_decimal_result(&decimal_buff);
     break;
   case ROW_RESULT:
-  default:
-    // Should never happen
-    DBUG_ASSERT(0);
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
+    DBUG_ASSERT(0);                // This case should never be chosen
     break;
   }
   DBUG_VOID_RETURN;
@@ -4731,9 +4854,9 @@ Item_func_set_user_var::update()
     break;
   }
   case ROW_RESULT:
-  default:
-    // This case should never be chosen
-    DBUG_ASSERT(0);
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
+    DBUG_ASSERT(0);                // This case should never be chosen
     break;
   }
   DBUG_RETURN(res);
@@ -5167,7 +5290,7 @@ void Item_func_get_user_var::fix_length_and_dec()
     max_length= var_entry->length;
 
     collation.set(var_entry->collation);
-    switch(m_cached_result_type) {
+    switch (m_cached_result_type) {
     case REAL_RESULT:
       fix_char_length(DBL_DIG + 8);
       break;
@@ -5183,8 +5306,9 @@ void Item_func_get_user_var::fix_length_and_dec()
       decimals= DECIMAL_MAX_SCALE;
       break;
     case ROW_RESULT:                            // Keep compiler happy
-    default:
-      DBUG_ASSERT(0);
+    case TIME_RESULT:
+    case IMPOSSIBLE_RESULT:
+      DBUG_ASSERT(0);                // This case should never be chosen
       break;
     }
   }
@@ -6261,7 +6385,7 @@ Item_func_sp::cleanup()
     sp_result_field= NULL;
   }
   m_sp= NULL;
-  dummy_table->alias= NULL;
+  dummy_table->alias.free();
   Item_func::cleanup();
 }
 
@@ -6287,7 +6411,7 @@ Item_func_sp::func_name() const
     qname.append('.');
   }
   append_identifier(thd, &qname, m_name->m_name.str, m_name->m_name.length);
-  return qname.ptr();
+  return qname.c_ptr_safe();
 }
 
 
@@ -6343,7 +6467,7 @@ Item_func_sp::init_result_field(THD *thd)
    */
   
   share= dummy_table->s;
-  dummy_table->alias = "";
+  dummy_table->alias.set("", 0, table_alias_charset);
   dummy_table->maybe_null = maybe_null;
   dummy_table->in_use= thd;
   dummy_table->copy_blobs= TRUE;
@@ -6665,8 +6789,8 @@ void uuid_short_init()
 longlong Item_func_uuid_short::val_int()
 {
   ulonglong val;
-  mysql_mutex_lock(&LOCK_uuid_generator);
+  mysql_mutex_lock(&LOCK_short_uuid_generator);
   val= uuid_value++;
-  mysql_mutex_unlock(&LOCK_uuid_generator);
+  mysql_mutex_unlock(&LOCK_short_uuid_generator);
   return (longlong) val;
 }
diff --git a/sql/item_func.h b/sql/item_func.h
index c3cb3e3ba02..5477c27b13d 100644
--- a/sql/item_func.h
+++ b/sql/item_func.h
@@ -1,7 +1,7 @@
 #ifndef ITEM_FUNC_INCLUDED
 #define ITEM_FUNC_INCLUDED
-
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2011, Oracle and/or its affiliates.
+   Copyright (c) 2009-2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -46,7 +46,7 @@ public:
   enum Functype { UNKNOWN_FUNC,EQ_FUNC,EQUAL_FUNC,NE_FUNC,LT_FUNC,LE_FUNC,
 		  GE_FUNC,GT_FUNC,FT_FUNC,
 		  LIKE_FUNC,ISNULL_FUNC,ISNOTNULL_FUNC,
-		  COND_AND_FUNC, COND_OR_FUNC, COND_XOR_FUNC,
+		  COND_AND_FUNC, COND_OR_FUNC, XOR_FUNC,
                   BETWEEN, IN_FUNC, MULT_EQUAL_FUNC,
 		  INTERVAL_FUNC, ISNOTNULLTEST_FUNC,
 		  SP_EQUALS_FUNC, SP_DISJOINT_FUNC,SP_INTERSECTS_FUNC,
@@ -67,6 +67,7 @@ public:
     allowed_arg_cols(1), arg_count(0)
   {
     with_sum_func= 0;
+    with_field= 0;
   }
   Item_func(Item *a):
     allowed_arg_cols(1), arg_count(1)
@@ -74,6 +75,7 @@ public:
     args= tmp_arg;
     args[0]= a;
     with_sum_func= a->with_sum_func;
+    with_field= a->with_field;
   }
   Item_func(Item *a,Item *b):
     allowed_arg_cols(1), arg_count(2)
@@ -81,6 +83,7 @@ public:
     args= tmp_arg;
     args[0]= a; args[1]= b;
     with_sum_func= a->with_sum_func || b->with_sum_func;
+    with_field= a->with_field || b->with_field;
   }
   Item_func(Item *a,Item *b,Item *c):
     allowed_arg_cols(1)
@@ -91,6 +94,7 @@ public:
       arg_count= 3;
       args[0]= a; args[1]= b; args[2]= c;
       with_sum_func= a->with_sum_func || b->with_sum_func || c->with_sum_func;
+      with_field= a->with_field || b->with_field || c->with_field;
     }
   }
   Item_func(Item *a,Item *b,Item *c,Item *d):
@@ -103,6 +107,8 @@ public:
       args[0]= a; args[1]= b; args[2]= c; args[3]= d;
       with_sum_func= a->with_sum_func || b->with_sum_func ||
 	c->with_sum_func || d->with_sum_func;
+      with_field= a->with_field || b->with_field ||
+        c->with_field || d->with_field;
     }
   }
   Item_func(Item *a,Item *b,Item *c,Item *d,Item* e):
@@ -114,6 +120,8 @@ public:
       args[0]= a; args[1]= b; args[2]= c; args[3]= d; args[4]= e;
       with_sum_func= a->with_sum_func || b->with_sum_func ||
 	c->with_sum_func || d->with_sum_func || e->with_sum_func ;
+      with_field= a->with_field || b->with_field ||
+        c->with_field || d->with_field || e->with_field;
     }
   }
   Item_func(List<Item> &list);
@@ -121,6 +129,7 @@ public:
   Item_func(THD *thd, Item_func *item);
   bool fix_fields(THD *, Item **ref);
   void fix_after_pullout(st_select_lex *new_parent, Item **ref);
+  void quick_fix_field();
   table_map used_tables() const;
   table_map not_null_tables() const;
   void update_used_tables();
@@ -211,6 +220,7 @@ public:
                 Item_transformer transformer, uchar *arg_t);
   void traverse_cond(Cond_traverser traverser,
                      void * arg, traverse_order order);
+  bool eval_not_null_tables(uchar *opt_arg);
  // bool is_expensive_processor(uchar *arg);
  // virtual bool is_expensive() { return 0; }
   inline void raise_numeric_overflow(const char *type_name)
@@ -315,6 +325,21 @@ public:
   }
 
   /*
+    By default only substitution for a field whose two different values
+    are never equal is allowed in the arguments of a function.
+    This is overruled for the direct arguments of comparison functions.
+  */ 
+  bool subst_argument_checker(uchar **arg) 
+  { 
+    if (*arg)                       // non-zero: presumably a substitution kind
+    {
+      *arg= (uchar *) Item::IDENTITY_SUBST; // overwrite the caller's value
+      return TRUE;
+    }
+    return FALSE;                   // *arg was 0: left untouched
+  }
+
+  /*
     We assume the result of any function that has a TIMESTAMP argument to be
     timezone-dependent, since a TIMESTAMP value in both numeric and string
     contexts is interpreted according to the current timezone.
@@ -371,6 +396,8 @@ class Item_func_numhybrid: public Item_func
 protected:
   Item_result hybrid_type;
 public:
+  Item_func_numhybrid() :Item_func(), hybrid_type(REAL_RESULT)
+  {}
   Item_func_numhybrid(Item *a) :Item_func(a), hybrid_type(REAL_RESULT)
   { collation.set_numeric(); }
   Item_func_numhybrid(Item *a,Item *b)
@@ -530,7 +557,7 @@ class Item_decimal_typecast :public Item_func
 public:
   Item_decimal_typecast(Item *a, int len, int dec) :Item_func(a)
   {
-    decimals= dec;
+    decimals= (uint8) dec;
     collation.set_numeric();
     fix_char_length(my_decimal_precision_to_length_no_truncation(len, dec,
                                                                  unsigned_flag));
@@ -541,12 +568,29 @@ public:
   my_decimal *val_decimal(my_decimal*);
   enum Item_result result_type () const { return DECIMAL_RESULT; }
   enum_field_types field_type() const { return MYSQL_TYPE_NEWDECIMAL; }
-  void fix_length_and_dec() {};
+  void fix_length_and_dec() {}
   const char *func_name() const { return "decimal_typecast"; }
   virtual void print(String *str, enum_query_type query_type);
 };
 
 
+class Item_double_typecast :public Item_real_func // typecast item, DOUBLE result
+{
+public:
+  Item_double_typecast(Item *a, int len, int dec) :Item_real_func(a)
+  {
+    decimals=   (uint8)  dec;         // 'dec' as passed in, narrowed to uint8
+    max_length= (uint32) len;         // 'len' as passed in
+  }
+  double val_real();
+  enum_field_types field_type() const { return MYSQL_TYPE_DOUBLE; }
+  void fix_length_and_dec() { maybe_null= 1; } // only marks result nullable
+  const char *func_name() const { return "double_typecast"; }
+  virtual void print(String *str, enum_query_type query_type);
+};
+
+
+
 class Item_func_additive_op :public Item_num_op
 {
 public:
@@ -914,25 +958,21 @@ class Item_func_min_max :public Item_func
   Item_result cmp_type;
   String tmp_value;
   int cmp_sign;
-  /* TRUE <=> arguments should be compared in the DATETIME context. */
-  bool compare_as_dates;
   /* An item used for issuing warnings while string to DATETIME conversion. */
-  Item *datetime_item;
+  Item *compare_as_dates;
   THD *thd;
 protected:
   enum_field_types cached_field_type;
 public:
   Item_func_min_max(List<Item> &list,int cmp_sign_arg) :Item_func(list),
-    cmp_type(INT_RESULT), cmp_sign(cmp_sign_arg), compare_as_dates(FALSE),
-    datetime_item(0) {}
+    cmp_type(INT_RESULT), cmp_sign(cmp_sign_arg), compare_as_dates(0) {}
   double val_real();
   longlong val_int();
   String *val_str(String *);
   my_decimal *val_decimal(my_decimal *);
+  bool get_date(MYSQL_TIME *res, uint fuzzy_date);
   void fix_length_and_dec();
   enum Item_result result_type () const { return cmp_type; }
-  bool result_as_longlong() { return compare_as_dates; };
-  uint cmp_datetimes(ulonglong *value);
   enum_field_types field_type() const { return cached_field_type; }
 };
 
@@ -1762,14 +1802,7 @@ public:
   void fix_length_and_dec() { decimals=0; max_length=1; maybe_null=1;}
   bool check_vcol_func_processor(uchar *int_arg) 
   {
-#if 0
-    DBUG_ENTER("Item_func_is_free_lock::check_vcol_func_processor");
-    DBUG_PRINT("info",
-      ("check_vcol_func_processor returns TRUE: unsupported function"));
-    DBUG_RETURN(TRUE);
-#else
     return trace_unsupported_by_check_vcol_func_processor(func_name());
-#endif
   }
 };
 
@@ -1793,7 +1826,7 @@ enum Cast_target
 {
   ITEM_CAST_BINARY, ITEM_CAST_SIGNED_INT, ITEM_CAST_UNSIGNED_INT,
   ITEM_CAST_DATE, ITEM_CAST_TIME, ITEM_CAST_DATETIME, ITEM_CAST_CHAR,
-  ITEM_CAST_DECIMAL
+  ITEM_CAST_DECIMAL, ITEM_CAST_DOUBLE
 };
 
 
@@ -1962,6 +1995,7 @@ Item *get_system_var(THD *thd, enum_var_type var_type, LEX_STRING name,
                      LEX_STRING component);
 extern bool check_reserved_words(LEX_STRING *name);
 extern enum_field_types agg_field_type(Item **items, uint nitems);
+Item *find_date_time_item(Item **args, uint nargs, uint col);
 double my_double_round(double value, longlong dec, bool dec_unsigned,
                        bool truncate);
 bool eval_const_cond(COND *cond);
diff --git a/sql/item_row.cc b/sql/item_row.cc
index 9fae85518af..eb446768423 100644
--- a/sql/item_row.cc
+++ b/sql/item_row.cc
@@ -93,12 +93,29 @@ bool Item_row::fix_fields(THD *thd, Item **ref)
     }
     maybe_null|= item->maybe_null;
     with_sum_func= with_sum_func || item->with_sum_func;
+    with_field= with_field || item->with_field;
   }
   fixed= 1;
   return FALSE;
 }
 
 
+/*
+  Recompute not_null_tables_cache as the union of not_null_tables() of
+  all elements of the row. opt_arg is not used. Always returns FALSE.
+*/
+bool
+Item_row::eval_not_null_tables(uchar *opt_arg)
+{
+  not_null_tables_cache= 0;
+
+  for (uint idx= 0; idx < arg_count; idx++)
+    not_null_tables_cache|= items[idx]->not_null_tables();
+
+  return FALSE;
+}
+
+
 void Item_row::cleanup()
 {
   DBUG_ENTER("Item_row::cleanup");
diff --git a/sql/item_row.h b/sql/item_row.h
index 63028e3bd7d..2141213b194 100644
--- a/sql/item_row.h
+++ b/sql/item_row.h
@@ -69,12 +69,14 @@ public:
   table_map used_tables() const { return used_tables_cache; };
   bool const_item() const { return const_item_cache; };
   enum Item_result result_type() const { return ROW_RESULT; }
+  Item_result cmp_type() const { return ROW_RESULT; }
   void update_used_tables();
   table_map not_null_tables() const { return not_null_tables_cache; }
   virtual void print(String *str, enum_query_type query_type);
 
   bool walk(Item_processor processor, bool walk_subquery, uchar *arg);
   Item *transform(Item_transformer transformer, uchar *arg);
+  bool eval_not_null_tables(uchar *opt_arg);
 
   uint cols() { return arg_count; }
   Item* element_index(uint i) { return items[i]; }
diff --git a/sql/item_strfunc.cc b/sql/item_strfunc.cc
index b0fa090afcf..a0e5b42767f 100644
--- a/sql/item_strfunc.cc
+++ b/sql/item_strfunc.cc
@@ -41,6 +41,8 @@
 */
 #include "sql_class.h"                          // set_var.h: THD
 #include "set_var.h"
+#include "sql_base.h"
+#include "sql_time.h"
 #include "sql_acl.h"                            // SUPER_ACL
 #include "des_key_file.h"       // st_des_keyschedule, st_des_keyblock
 #include "password.h"           // my_make_scrambled_password,
@@ -171,7 +173,6 @@ String *Item_func_md5::val_str_ascii(String *str)
 {
   DBUG_ASSERT(fixed == 1);
   String * sptr= args[0]->val_str(str);
-  str->set_charset(&my_charset_bin);
   if (sptr)
   {
     uchar digest[16];
@@ -184,6 +185,7 @@ String *Item_func_md5::val_str_ascii(String *str)
       return 0;
     }
     array_to_hex((char *) str->ptr(), digest, 16);
+    str->set_charset(&my_charset_numeric);
     str->length((uint) 32);
     return str;
   }
@@ -210,7 +212,6 @@ String *Item_func_sha::val_str_ascii(String *str)
 {
   DBUG_ASSERT(fixed == 1);
   String * sptr= args[0]->val_str(str);
-  str->set_charset(&my_charset_bin);
   if (sptr)  /* If we got value different from NULL */
   {
     SHA1_CONTEXT context;  /* Context used to generate SHA1 hash */
@@ -220,11 +221,13 @@ String *Item_func_sha::val_str_ascii(String *str)
     /* No need to check error as the only case would be too long message */
     mysql_sha1_input(&context,
                      (const uchar *) sptr->ptr(), sptr->length());
+
     /* Ensure that memory is free and we got result */
     if (!( str->alloc(SHA1_HASH_SIZE*2) ||
            (mysql_sha1_result(&context,digest))))
     {
       array_to_hex((char *) str->ptr(), digest, SHA1_HASH_SIZE);
+      str->set_charset(&my_charset_numeric);
       str->length((uint)  SHA1_HASH_SIZE*2);
       null_value=0;
       return str;
@@ -824,7 +827,7 @@ String *Item_func_concat_ws::val_str(String *str)
 
   use_as_buff= &tmp_value;
   str->length(0);				// QQ; Should be removed
-  res=str;
+  res=str;                                      // If 0 arg_count
 
   // Skip until non-null argument is found.
   // If not, return the empty string
@@ -2468,6 +2471,7 @@ void Item_func_make_set::fix_length_and_dec()
   not_null_tables_cache&= item->not_null_tables();
   const_item_cache&=	  item->const_item();
   with_sum_func= with_sum_func || item->with_sum_func;
+  with_field= with_field || item->with_field;
 }
 
 
@@ -3257,7 +3261,7 @@ String *Item_load_file::val_str(String *str)
 			func_name(), current_thd->variables.max_allowed_packet);
     goto err;
   }
-  if (tmp_value.alloc(stat_info.st_size))
+  if (tmp_value.alloc((size_t)stat_info.st_size))
     goto err;
   if ((file= mysql_file_open(key_file_loadfile,
                              file_name->ptr(), O_RDONLY, MYF(0))) < 0)
@@ -3268,7 +3272,7 @@ String *Item_load_file::val_str(String *str)
     mysql_file_close(file, MYF(0));
     goto err;
   }
-  tmp_value.length(stat_info.st_size);
+  tmp_value.length((uint32)stat_info.st_size);
   mysql_file_close(file, MYF(0));
   null_value = 0;
   DBUG_RETURN(&tmp_value);
@@ -3739,3 +3743,769 @@ String *Item_func_uuid::val_str(String *str)
 
   return str;
 }
+
+
+Item_func_dyncol_create::Item_func_dyncol_create(List<Item> &args,
+                                                 DYNCALL_CREATE_DEF *dfs)
+  : Item_str_func(args), defs(dfs), vals(0), nums(0) // arrays: see fix_fields()
+{
+  DBUG_ASSERT((args.elements & 0x1) == 0); // even number of arguments
+}
+
+
+bool Item_func_dyncol_create::fix_fields(THD *thd, Item **ref)
+{
+  bool res= Item_func::fix_fields(thd, ref); // no need for Item_str_func here
+  vals= (DYNAMIC_COLUMN_VALUE *) alloc_root(thd->mem_root,
+                                            sizeof(DYNAMIC_COLUMN_VALUE) *
+                                            (arg_count / 2)); // one per pair
+  nums= (uint *) alloc_root(thd->mem_root,
+                            sizeof(uint) * (arg_count / 2)); // one per pair
+  return res || vals == 0 || nums == 0; // TRUE on fix error or failed alloc
+}
+
+
+void Item_func_dyncol_create::fix_length_and_dec()
+{
+  maybe_null= TRUE;                 // val_str() returns NULL on a create error
+  collation.set(&my_charset_bin);   // result uses the binary charset
+  decimals= 0;
+}
+
+void Item_func_dyncol_create::prepare_arguments()
+{
+  char buff[STRING_BUFFER_USUAL_SIZE];
+  String *res, tmp(buff, sizeof(buff), &my_charset_bin);
+  uint column_count= (arg_count / 2);
+  uint i;
+  my_decimal dtmp, *dres;
+
+  /* Evaluate each (column number, value) argument pair into nums[]/vals[] */
+  for (i= 0; i < column_count; i++)
+  {
+    uint valpos= i * 2 + 1;                     // value follows its number
+    DYNAMIC_COLUMN_TYPE type= defs[i].type;
+    if (type == DYN_COL_NULL) // auto detect
+    {
+      /*
+        We don't have a default here to ensure we get a warning if
+        one adds a new not handled MYSQL_TYPE_...
+      */
+      switch (args[valpos]->field_type()) {
+      case MYSQL_TYPE_DECIMAL:
+      case MYSQL_TYPE_NEWDECIMAL:
+        type= DYN_COL_DECIMAL;
+        break;
+      case MYSQL_TYPE_TINY:
+      case MYSQL_TYPE_SHORT:
+      case MYSQL_TYPE_LONG:
+      case MYSQL_TYPE_LONGLONG:
+      case MYSQL_TYPE_INT24:
+      case MYSQL_TYPE_YEAR:
+      case MYSQL_TYPE_BIT:
+        type= args[valpos]->unsigned_flag ? DYN_COL_UINT : DYN_COL_INT;
+        break;
+      case MYSQL_TYPE_FLOAT:
+      case MYSQL_TYPE_DOUBLE:
+        type= DYN_COL_DOUBLE;
+        break;
+      case MYSQL_TYPE_NULL:
+        type= DYN_COL_NULL;
+        break;
+      case MYSQL_TYPE_TIMESTAMP:
+      case MYSQL_TYPE_DATETIME:
+        type= DYN_COL_DATETIME;
+	break;
+      case MYSQL_TYPE_DATE:
+      case MYSQL_TYPE_NEWDATE:
+        type= DYN_COL_DATE;
+        break;
+      case MYSQL_TYPE_TIME:
+        type= DYN_COL_TIME;
+        break;
+      case MYSQL_TYPE_VARCHAR:
+      case MYSQL_TYPE_ENUM:
+      case MYSQL_TYPE_SET:
+      case MYSQL_TYPE_TINY_BLOB:
+      case MYSQL_TYPE_MEDIUM_BLOB:
+      case MYSQL_TYPE_LONG_BLOB:
+      case MYSQL_TYPE_BLOB:
+      case MYSQL_TYPE_VAR_STRING:
+      case MYSQL_TYPE_STRING:
+      case MYSQL_TYPE_GEOMETRY:
+        type= DYN_COL_STRING;
+        break;
+      }
+    }
+    nums[i]= (uint) args[i * 2]->val_int();     // column number of this pair
+    vals[i].type= type;
+    switch (type) {                             // evaluate the value as 'type'
+    case DYN_COL_NULL:
+      DBUG_ASSERT(args[valpos]->field_type() == MYSQL_TYPE_NULL);
+      break;
+    case DYN_COL_INT:
+      vals[i].long_value= args[valpos]->val_int();
+      break;
+    case DYN_COL_UINT:
+      vals[i].ulong_value= args[valpos]->val_int();
+      break;
+    case DYN_COL_DOUBLE:
+      vals[i].double_value= args[valpos]->val_real();
+      break;
+    case DYN_COL_STRING:
+      res= args[valpos]->val_str(&tmp);
+      if (res &&
+          (vals[i].string_value.str= my_strndup(res->ptr(), res->length(),
+                                                MYF(MY_WME))))
+      {
+	vals[i].string_value.length= res->length();
+	vals[i].charset= res->charset();
+      }
+      else
+      {
+        args[valpos]->null_value= 1;            // In case of out of memory
+        vals[i].string_value.str= NULL;
+        vals[i].string_value.length= 0;         // just to be safe
+      }
+      break;
+    case DYN_COL_DECIMAL:
+      if ((dres= args[valpos]->val_decimal(&dtmp)))
+      {
+	dynamic_column_prepare_decimal(&vals[i]);
+        DBUG_ASSERT(vals[i].decimal_value.len == dres->len);
+        vals[i].decimal_value.intg= dres->intg;
+        vals[i].decimal_value.frac= dres->frac;
+        vals[i].decimal_value.sign= dres->sign();
+        memcpy(vals[i].decimal_buffer, dres->buf,
+               sizeof(vals[i].decimal_buffer));
+      }
+      else
+      {
+	dynamic_column_prepare_decimal(&vals[i]); // just to be safe
+        DBUG_ASSERT(args[valpos]->null_value);
+      }
+      break;
+    case DYN_COL_DATETIME:
+      args[valpos]->get_date(&vals[i].time_value, TIME_FUZZY_DATE);
+      break;
+    case DYN_COL_DATE:
+      args[valpos]->get_date(&vals[i].time_value, TIME_FUZZY_DATE);
+      break;
+    case DYN_COL_TIME:
+      args[valpos]->get_time(&vals[i].time_value);
+      break;
+    default:
+      DBUG_ASSERT(0);
+      vals[i].type= DYN_COL_NULL;
+    }
+    if (vals[i].type != DYN_COL_NULL && args[valpos]->null_value)
+    {                                           // NULL value: store DYN_COL_NULL
+      if (vals[i].type == DYN_COL_STRING)
+        my_free(vals[i].string_value.str);      // drop the copy made above
+      vals[i].type= DYN_COL_NULL;
+    }
+  }
+}
+
+void Item_func_dyncol_create::cleanup_arguments()
+{
+  /* Free the string buffer of every value whose type is DYN_COL_STRING */
+  const uint pairs= arg_count / 2;
+  for (uint pos= 0; pos < pairs; pos++)
+  {
+    DYNAMIC_COLUMN_VALUE *val= vals + pos;
+    if (val->type == DYN_COL_STRING)
+      my_free(val->string_value.str);
+  }
+}
+
+String *Item_func_dyncol_create::val_str(String *str)
+{
+  DYNAMIC_COLUMN col;
+  String *res;
+  uint column_count= (arg_count / 2);
+  enum enum_dyncol_func_result rc;
+  DBUG_ASSERT((arg_count & 0x1) == 0); // even number of arguments
+
+  prepare_arguments();                 // fills nums[] and vals[]
+
+  if ((rc= dynamic_column_create_many(&col, column_count, nums, vals)))
+  {                                    // non-zero rc: report it, return NULL
+    dynamic_column_error_message(rc);
+    dynamic_column_column_free(&col);
+    res= NULL;
+    null_value= TRUE;
+  }
+  else
+  {
+    /* Move result from DYNAMIC_COLUMN to str_value */
+    char *ptr;
+    size_t length, alloc_length;
+    dynamic_column_reassociate(&col, &ptr, &length, &alloc_length);
+    str_value.reassociate(ptr, (uint32) length, (uint32) alloc_length,
+                          &my_charset_bin);
+    res= &str_value;                   // the 'str' argument is not used
+    null_value= FALSE;
+  }
+
+  /* Free the string values duplicated by prepare_arguments() */
+  cleanup_arguments();
+
+  return res;
+}
+
+void Item_func_dyncol_create::print_arguments(String *str,
+                                              enum_query_type query_type)
+{
+  uint i;
+  uint column_count= (arg_count / 2);
+  for (i= 0; i < column_count; i++)         // one "num,value [AS type]" per pair
+  {
+    args[i*2]->print(str, query_type);
+    str->append(',');
+    args[i*2 + 1]->print(str, query_type);
+    switch (defs[i].type) {                 // type given in the definition
+    case DYN_COL_NULL: // automatic type => write nothing
+      break;
+    case DYN_COL_INT:
+      str->append(STRING_WITH_LEN(" AS int"));
+      break;
+    case DYN_COL_UINT:
+      str->append(STRING_WITH_LEN(" AS unsigned int"));
+      break;
+    case DYN_COL_DOUBLE:
+      str->append(STRING_WITH_LEN(" AS double"));
+      break;
+    case DYN_COL_STRING:
+      str->append(STRING_WITH_LEN(" AS char"));
+      if (defs[i].cs)                       // charset only when one is set
+      {
+        str->append(STRING_WITH_LEN(" charset "));
+        str->append(defs[i].cs->csname);
+        str->append(' ');
+      }
+      break;
+    case DYN_COL_DECIMAL:
+      str->append(STRING_WITH_LEN(" AS decimal"));
+      break;
+    case DYN_COL_DATETIME:
+      str->append(STRING_WITH_LEN(" AS datetime"));
+      break;
+    case DYN_COL_DATE:
+      str->append(STRING_WITH_LEN(" AS date"));
+      break;
+    case DYN_COL_TIME:
+      str->append(STRING_WITH_LEN(" AS time"));
+      break;
+    }
+    if (i < column_count - 1)               // separator, not after last pair
+      str->append(',');
+  }
+}
+
+
+void Item_func_dyncol_create::print(String *str,
+                                    enum_query_type query_type)
+{
+  DBUG_ASSERT((arg_count & 0x1) == 0); // even number of arguments
+  str->append(STRING_WITH_LEN("column_create("));
+  print_arguments(str, query_type);    // the "num,value [AS type]" list
+  str->append(')');
+}
+
+
+String *Item_func_dyncol_add::val_str(String *str)
+{
+  DYNAMIC_COLUMN col;
+  String *res;
+  uint column_count=  (arg_count / 2);
+  enum enum_dyncol_func_result rc;
+  DBUG_ASSERT((arg_count & 0x1) == 1); // odd number of arguments
+
+  /* We store the packed data last; NULL input or out of memory gives NULL */
+  res= args[arg_count - 1]->val_str(str);
+  if (args[arg_count - 1]->null_value ||
+      init_dynamic_string(&col, NULL, res->length() + STRING_BUFFER_USUAL_SIZE,
+                          STRING_BUFFER_USUAL_SIZE))
+    goto null;
+
+  col.length= res->length();           // work on a private copy of the data
+  memcpy(col.str, res->ptr(), col.length);
+
+  prepare_arguments();                 // fills nums[] and vals[]
+
+  if ((rc= dynamic_column_update_many(&col, column_count, nums, vals)))
+  {
+    dynamic_column_error_message(rc);
+    dynamic_column_column_free(&col);
+    cleanup_arguments();
+    goto null;
+  }
+
+  {
+    /* Move result from DYNAMIC_COLUMN to str */
+    char *ptr;
+    size_t length, alloc_length;
+    dynamic_column_reassociate(&col, &ptr, &length, &alloc_length);
+    str->reassociate(ptr, (uint32) length, (uint32) alloc_length,
+                     &my_charset_bin);
+    null_value= FALSE;
+  }
+
+  /* cleanup */
+  dynamic_column_column_free(&col);
+  cleanup_arguments();
+
+  return str;
+
+null:
+  null_value= TRUE;
+  return NULL;
+}
+
+
+void Item_func_dyncol_add::print(String *str,
+                                 enum_query_type query_type)
+{
+  DBUG_ASSERT((arg_count & 0x1) == 1); // odd number of arguments
+  str->append(STRING_WITH_LEN("column_add(")); // not column_create: this is ADD
+  args[arg_count - 1]->print(str, query_type); // packed data is stored last
+  str->append(',');
+  print_arguments(str, query_type);            // the "num,value [AS type]" list
+  str->append(')');
+}
+
+
+/**
+  Get the value of column number args[1] from the dynamic column in args[0]
+
+  @return 0 on success (null_value= 0); 1 with null_value= 1 if an argument
+          is NULL, the number is outside 0..INT_MAX or the lookup fails
+*/
+
+bool Item_dyncol_get::get_dyn_value(DYNAMIC_COLUMN_VALUE *val, String *tmp)
+{
+  DYNAMIC_COLUMN dyn_str;
+  String *res;
+  longlong num;
+  enum enum_dyncol_func_result rc;
+
+  num= args[1]->val_int();
+  if (args[1]->null_value || num < 0 || num > INT_MAX)
+  {
+    null_value= 1;
+    return 1;
+  }
+
+  res= args[0]->val_str(tmp);
+  if (args[0]->null_value)
+  {
+    null_value= 1;
+    return 1;
+  }
+
+  dyn_str.str=   (char*) res->ptr();            // points into res, no copy
+  dyn_str.length= res->length();
+  if ((rc= dynamic_column_get(&dyn_str, (uint) num, val)))
+  {
+    dynamic_column_error_message(rc);
+    null_value= 1;
+    return 1;
+  }
+
+  null_value= 0;
+  return 0;                                     // ok
+}
+
+
+String *Item_dyncol_get::val_str(String *str_result)
+{
+  DYNAMIC_COLUMN_VALUE val;
+  char buff[STRING_BUFFER_USUAL_SIZE];
+  String tmp(buff, sizeof(buff), &my_charset_bin);
+
+  if (get_dyn_value(&val, &tmp))
+    return NULL;
+
+  switch (val.type) {
+  case DYN_COL_NULL:
+    goto null;
+  case DYN_COL_INT:
+  case DYN_COL_UINT:
+    str_result->set_int(val.long_value, test(val.type == DYN_COL_UINT),
+                       &my_charset_latin1);
+    break;
+  case DYN_COL_DOUBLE:
+    str_result->set_real(val.double_value, NOT_FIXED_DEC, &my_charset_latin1);
+    break;
+  case DYN_COL_STRING:
+    if ((char*) tmp.ptr() <= val.string_value.str &&
+        (char*) tmp.ptr() + tmp.length() >= val.string_value.str)
+    {
+      /* value is allocated in tmp buffer; We have to make a copy */
+      str_result->copy(val.string_value.str, val.string_value.length,
+                      val.charset);
+    }
+    else
+    {
+      /*
+        It's safe to use the current value because it's either pointing
+        into a field or in a buffer for another item and this buffer
+        is not going to be deleted during expression evaluation
+      */
+      str_result->set(val.string_value.str, val.string_value.length,
+                      val.charset);
+    }
+    break;
+  case DYN_COL_DECIMAL:
+  {
+    int res;
+    int length=
+      my_decimal_string_length((const my_decimal*)&val.decimal_value);
+    if (str_result->alloc(length))
+      goto null;
+    if ((res= decimal2string(&val.decimal_value, (char*) str_result->ptr(),
+                             &length, 0, 0, ' ')) != E_DEC_OK)
+    {
+      char buff[40];
+      int len= sizeof(buff);
+      DBUG_ASSERT(length < (int)sizeof(buff));
+      decimal2string(&val.decimal_value, buff, &len, 0, 0, ' ');
+      decimal_operation_results(res, buff, "CHAR");
+    }
+    str_result->set_charset(&my_charset_latin1);
+    str_result->length(length);
+    break;
+  }
+  case DYN_COL_DATETIME:
+  case DYN_COL_DATE:
+  case DYN_COL_TIME:
+  {
+    int length;
+    /*
+      We use AUTO_SEC_PART_DIGITS here to ensure that we do not lose
+      any microseconds from the data. This is safe to do as we are
+      asked to return the time argument as a string.
+    */
+    if (str_result->alloc(MAX_DATE_STRING_REP_LENGTH) ||
+        !(length= my_TIME_to_str(&val.time_value, (char*) str_result->ptr(),
+                                 AUTO_SEC_PART_DIGITS)))
+      goto null;
+    str_result->set_charset(&my_charset_latin1);
+    str_result->length(length);
+    break;
+  }
+  }
+  return str_result;
+
+null:
+  null_value= TRUE;
+  return 0;
+}
+
+
+longlong Item_dyncol_get::val_int()
+{
+  DYNAMIC_COLUMN_VALUE val;
+  char buff[STRING_BUFFER_USUAL_SIZE];
+  String tmp(buff, sizeof(buff), &my_charset_bin);
+
+  if (get_dyn_value(&val, &tmp))
+    return 0;
+
+  switch (val.type) {
+  case DYN_COL_NULL:
+    goto null;
+  case DYN_COL_UINT:
+    unsigned_flag= 1;            // Make it possible for caller to detect sign
+    return val.long_value;
+  case DYN_COL_INT:
+    unsigned_flag= 0;            // Make it possible for caller to detect sign
+    return val.long_value;
+  case DYN_COL_DOUBLE:
+  {
+    bool error;
+    longlong num;
+
+    num= double_to_longlong(val.double_value, unsigned_flag, &error);
+    if (error)
+    {
+      char buff[30];
+      sprintf(buff, "%lg", val.double_value);
+      push_warning_printf(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+                          ER_DATA_OVERFLOW,
+                          ER(ER_DATA_OVERFLOW),
+                          buff,
+                          unsigned_flag ? "UNSIGNED INT" : "INT");
+    }
+    return num;
+  }
+  case DYN_COL_STRING:
+  {
+    int error;
+    longlong num;
+    char *end= val.string_value.str + val.string_value.length, *org_end= end;
+
+    num= my_strtoll10(val.string_value.str, &end, &error);
+    if (end != org_end || error > 0)
+    {
+      char buff[80];
+      strmake(buff, val.string_value.str, min(sizeof(buff)-1,
+                                              val.string_value.length));
+      push_warning_printf(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+                          ER_BAD_DATA,
+                          ER(ER_BAD_DATA),
+                          buff,
+                          unsigned_flag ? "UNSIGNED INT" : "INT");
+    }
+    unsigned_flag= error >= 0;
+    return num;
+  }
+  case DYN_COL_DECIMAL:
+  {
+    longlong num;
+    my_decimal2int(E_DEC_FATAL_ERROR, &val.decimal_value, unsigned_flag,
+                   &num);
+    return num;
+  }
+  case DYN_COL_DATETIME:
+  case DYN_COL_DATE:
+  case DYN_COL_TIME:
+    unsigned_flag= !val.time_value.neg;
+    if (unsigned_flag)
+      return TIME_to_ulonglong(&val.time_value);
+    else
+      return -(longlong)TIME_to_ulonglong(&val.time_value);
+  }
+
+null:
+  null_value= TRUE;
+  return 0;
+}
+
+
+double Item_dyncol_get::val_real()
+{
+  DYNAMIC_COLUMN_VALUE val;
+  char buff[STRING_BUFFER_USUAL_SIZE];
+  String tmp(buff, sizeof(buff), &my_charset_bin);
+
+  if (get_dyn_value(&val, &tmp))
+    return 0.0;
+
+  switch (val.type) {
+  case DYN_COL_NULL:
+    goto null;
+  case DYN_COL_UINT:
+    return ulonglong2double(val.ulong_value);
+  case DYN_COL_INT:
+    return (double) val.long_value;
+  case DYN_COL_DOUBLE:
+    return (double) val.double_value;
+  case DYN_COL_STRING:
+  {
+    int error;
+    char *end;
+    double res= my_strntod(val.charset, (char*) val.string_value.str,
+                           val.string_value.length, &end, &error);
+
+    if (end != (char*) val.string_value.str + val.string_value.length ||
+        error)
+    {
+      char buff[80];
+      strmake(buff, val.string_value.str, min(sizeof(buff)-1,
+                                              val.string_value.length));
+      push_warning_printf(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+                          ER_BAD_DATA,
+                          ER(ER_BAD_DATA),
+                          buff, "DOUBLE");
+    }
+    return res;
+  }
+  case DYN_COL_DECIMAL:
+  {
+    double res;
+    /* This will always succeed */
+    decimal2double(&val.decimal_value, &res);
+    return res;
+  }
+  case DYN_COL_DATETIME:
+  case DYN_COL_DATE:
+  case DYN_COL_TIME:
+    return TIME_to_double(&val.time_value);
+  }
+
+null:
+  null_value= TRUE;
+  return 0.0;
+}
+
+
+/* column_get() as DECIMAL; returns 0 and null_value= TRUE for a NULL value */
+my_decimal *Item_dyncol_get::val_decimal(my_decimal *decimal_value)
+{
+  DYNAMIC_COLUMN_VALUE val;
+  char buff[STRING_BUFFER_USUAL_SIZE];
+  String tmp(buff, sizeof(buff), &my_charset_bin);
+
+  if (get_dyn_value(&val, &tmp))
+    return NULL;
+
+  switch (val.type) {
+  case DYN_COL_NULL:
+    goto null;
+  case DYN_COL_UINT:
+    int2my_decimal(E_DEC_FATAL_ERROR, val.long_value, TRUE, decimal_value);
+    break;
+  case DYN_COL_INT:
+    int2my_decimal(E_DEC_FATAL_ERROR, val.long_value, FALSE, decimal_value);
+    break;
+  case DYN_COL_DOUBLE:
+    double2my_decimal(E_DEC_FATAL_ERROR, val.double_value, decimal_value);
+    break;
+  case DYN_COL_STRING:
+  {
+    int rc= str2my_decimal(0, val.string_value.str, val.string_value.length,
+                           val.charset, decimal_value);
+    if (rc != E_DEC_OK)
+    {
+      /* Build the truncated copy of the value only when we have to warn */
+      char buff[80];
+      strmake(buff, val.string_value.str, min(sizeof(buff)-1,
+                                              val.string_value.length));
+      push_warning_printf(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+                          ER_BAD_DATA, ER(ER_BAD_DATA),
+                          buff, "DECIMAL");
+    }
+    break;
+  }
+  case DYN_COL_DECIMAL:
+    decimal2my_decimal(&val.decimal_value, decimal_value);
+    break;
+  case DYN_COL_DATETIME:
+  case DYN_COL_DATE:
+  case DYN_COL_TIME:
+    decimal_value= seconds2my_decimal(val.time_value.neg,
+                                      TIME_to_ulonglong(&val.time_value),
+                                      val.time_value.second_part,
+                                      decimal_value);
+    break;
+  }
+  return decimal_value;
+
+null:
+  null_value= TRUE;
+  return 0;
+}
+
+
+bool Item_dyncol_get::get_date(MYSQL_TIME *ltime, uint fuzzy_date)
+{
+  DYNAMIC_COLUMN_VALUE val;
+  char buff[STRING_BUFFER_USUAL_SIZE];
+  String tmp(buff, sizeof(buff), &my_charset_bin);
+  bool signed_value= 0;
+
+  if (get_dyn_value(&val, &tmp))
+    return 1;                                   // Error
+
+  switch (val.type) {
+  case DYN_COL_NULL:
+    goto null;
+  case DYN_COL_INT:
+    signed_value= 1;                                  // For error message
+    /* fall through */
+  case DYN_COL_UINT:
+    if (signed_value || val.ulong_value <= LONGLONG_MAX)
+    {
+      if (int_to_datetime_with_warn(val.ulong_value, ltime, fuzzy_date,
+                                   0 /* TODO */))
+        goto null;
+      return 0;
+    }
+    /* let double_to_datetime_with_warn() issue the warning message */
+    val.double_value= static_cast<double>(ULONGLONG_MAX);
+    /* fall through */
+  case DYN_COL_DOUBLE:
+    if (double_to_datetime_with_warn(val.double_value, ltime, fuzzy_date,
+                                     0 /* TODO */))
+      goto null;
+    return 0;
+  case DYN_COL_DECIMAL:
+    if (decimal_to_datetime_with_warn((my_decimal*)&val.decimal_value, ltime,
+                                      fuzzy_date, 0 /* TODO */))
+      goto null;
+    return 0;
+  case DYN_COL_STRING:
+    if (str_to_datetime_with_warn(&my_charset_numeric,
+                                  val.string_value.str,
+                                  val.string_value.length,
+                                  ltime, fuzzy_date) <= MYSQL_TIMESTAMP_ERROR)
+      goto null;
+    return 0;
+  case DYN_COL_DATETIME:
+  case DYN_COL_DATE:
+  case DYN_COL_TIME:
+    *ltime= val.time_value;
+    return 0;
+  }
+
+null:
+  null_value= TRUE;
+  return 1;
+}
+
+
+void Item_dyncol_get::print(String *str, enum_query_type query_type)
+{
+  str->append(STRING_WITH_LEN("column_get("));
+  args[0]->print(str, query_type);
+  str->append(',');
+  args[1]->print(str, query_type);
+  str->append(')');
+}
+
+
+/* column_list(): return the column numbers stored in args[0] as "n1,n2,..." */
+String *Item_func_dyncol_list::val_str(String *str)
+{
+  enum enum_dyncol_func_result rc;
+  DYNAMIC_ARRAY arr;
+  DYNAMIC_COLUMN col;
+  String *res= args[0]->val_str(str);
+
+  if (args[0]->null_value)
+    goto null;
+  col.length= res->length();
+  /* We do not change the string, so could do this trick */
+  col.str= (char *)res->ptr();
+  if ((rc= dynamic_column_list(&col, &arr)))
+  {
+    dynamic_column_error_message(rc);
+    delete_dynamic(&arr);
+    goto null;
+  }
+
+  /* Elements are 0 - 65536, so max size for one element is 6 (including ,) */
+  if (str->alloc(arr.elements * 6))
+  {
+    delete_dynamic(&arr);                       // do not leak the list
+    goto null;
+  }
+
+  str->length(0);
+  for (uint i= 0; i < arr.elements; i++)
+  {
+    str->qs_append(*dynamic_element(&arr, i, uint*));
+    if (i < arr.elements - 1)
+      str->qs_append(',');
+  }
+
+  null_value= FALSE;
+  delete_dynamic(&arr);
+  return str;
+
+null:
+  null_value= TRUE;
+  return NULL;
+}
diff --git a/sql/item_strfunc.h b/sql/item_strfunc.h
index ef059ae1780..b15179e641b 100644
--- a/sql/item_strfunc.h
+++ b/sql/item_strfunc.h
@@ -33,8 +33,15 @@ protected:
      character set. No memory is allocated.
      @retval A pointer to the str_value member.
    */
-  String *make_empty_result() {
-    str_value.set("", 0, collation.collation);
+  String *make_empty_result()
+  {
+    /*
+      Reset string length to an empty string. We don't use str_value.set() as
+      we don't want to free and potentially have to reallocate the buffer
+      for each call.
+    */
+    str_value.length(0);
+    str_value.set_charset(collation.collation);
     return &str_value; 
   }
 public:
@@ -963,6 +970,75 @@ public:
   }
 };
 
+
+class Item_func_dyncol_create: public Item_str_func
+{
+protected:
+  DYNCALL_CREATE_DEF *defs;
+  DYNAMIC_COLUMN_VALUE *vals;
+  uint *nums;
+  void prepare_arguments();
+  void cleanup_arguments();
+  void print_arguments(String *str, enum_query_type query_type);
+public:
+  Item_func_dyncol_create(List<Item> &args, DYNCALL_CREATE_DEF *dfs);
+  bool fix_fields(THD *thd, Item **ref);
+  void fix_length_and_dec();
+  const char *func_name() const{ return "column_create"; }
+  String *val_str(String *);
+  virtual void print(String *str, enum_query_type query_type);
+};
+
+
+class Item_func_dyncol_add: public Item_func_dyncol_create
+{
+public:
+  Item_func_dyncol_add(List<Item> &args, DYNCALL_CREATE_DEF *dfs)
+    :Item_func_dyncol_create(args, dfs)
+  {}
+  const char *func_name() const{ return "column_add"; }
+  String *val_str(String *);
+  virtual void print(String *str, enum_query_type query_type);
+};
+
+
+/*
+  The following functions are always called from an Item_cast function
+*/
+
+class Item_dyncol_get: public Item_str_func
+{
+public:
+  Item_dyncol_get(Item *str, Item *num)
+    :Item_str_func(str, num)
+  {
+    max_length= MAX_DYNAMIC_COLUMN_LENGTH;
+  }
+  void fix_length_and_dec()
+  { maybe_null= 1; }
+  /* Mark that collation can change between calls */
+  bool dynamic_result() { return 1; }
+
+  const char *func_name() const { return "column_get"; }
+  String *val_str(String *);
+  longlong val_int();
+  double val_real();
+  my_decimal *val_decimal(my_decimal *);
+  bool get_dyn_value(DYNAMIC_COLUMN_VALUE *val, String *tmp);
+  bool get_date(MYSQL_TIME *ltime,uint fuzzydate);
+  void print(String *str, enum_query_type query_type);
+};
+
+
+class Item_func_dyncol_list: public Item_str_func
+{
+public:
+  Item_func_dyncol_list(Item *str) :Item_str_func(str) {};
+  void fix_length_and_dec() { maybe_null= 1; max_length= MAX_BLOB_WIDTH; };
+  const char *func_name() const{ return "column_list"; }
+  String *val_str(String *);
+};
+
 extern String my_empty_string;
 
 #endif /* ITEM_STRFUNC_INCLUDED */
diff --git a/sql/item_subselect.cc b/sql/item_subselect.cc
index 44436a4a467..ee2cc861ae2 100644
--- a/sql/item_subselect.cc
+++ b/sql/item_subselect.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2002, 2010, Oracle and/or its affiliates.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -39,14 +39,21 @@
 #include "sql_select.h"
 #include "sql_parse.h"                          // check_stack_overrun
 
+double get_post_group_estimate(JOIN* join, double join_op_rows);
+
 
 Item_subselect::Item_subselect():
-  Item_result_field(), value_assigned(0), thd(0), substitution(0),
-  expr_cache(0), engine(0), old_engine(0), used_tables_cache(0),
-  have_to_be_excluded(0), const_item_cache(1), inside_first_fix_fields(0),
-  done_first_fix_fields(FALSE), eliminated(FALSE), engine_changed(0),
-  changed(0), is_correlated(FALSE)
-{
+  Item_result_field(), value_assigned(0), own_engine(0), thd(0), old_engine(0), 
+  used_tables_cache(0), have_to_be_excluded(0), const_item_cache(1),
+  inside_first_fix_fields(0), done_first_fix_fields(FALSE), 
+  expr_cache(0), forced_const(FALSE), substitution(0), engine(0), eliminated(FALSE),
+  engine_changed(0), changed(0), is_correlated(FALSE)
+{
+  DBUG_ENTER("Item_subselect::Item_subselect");
+  DBUG_PRINT("enter", ("this: 0x%lx", (ulong) this));
+#ifndef DBUG_OFF
+  exec_counter= 0;
+#endif
   with_subselect= 1;
   reset();
   /*
@@ -54,6 +61,7 @@ Item_subselect::Item_subselect():
     (i.e. some rows will be found returned)
   */
   null_value= TRUE;
+  DBUG_VOID_RETURN;
 }
 
 
@@ -66,8 +74,10 @@ void Item_subselect::init(st_select_lex *select_lex,
   */
 
   DBUG_ENTER("Item_subselect::init");
-  DBUG_PRINT("enter", ("select_lex: 0x%lx", (long) select_lex));
+  DBUG_PRINT("enter", ("select_lex: 0x%lx  this: 0x%lx",
+                       (ulong) select_lex, (ulong) this));
   unit= select_lex->master_unit();
+  thd= unit->thd;
 
   if (unit->item)
   {
@@ -76,10 +86,10 @@ void Item_subselect::init(st_select_lex *select_lex,
       => we do not copy old_engine here
     */
     engine= unit->item->engine;
+    own_engine= FALSE;
     parsing_place= unit->item->parsing_place;
-    unit->item->engine= 0;
-    unit->item= this;
-    engine->change_result(this, result);
+    thd->change_item_tree((Item**)&unit->item, this);
+    engine->change_result(this, result, TRUE);
   }
   else
   {
@@ -92,9 +102,9 @@ void Item_subselect::init(st_select_lex *select_lex,
                     NO_MATTER :
                     outer_select->parsing_place);
     if (unit->is_union())
-      engine= new subselect_union_engine(unit, result, this);
+      engine= new subselect_union_engine(thd, unit, result, this);
     else
-      engine= new subselect_single_select_engine(select_lex, result, this);
+      engine= new subselect_single_select_engine(thd, select_lex, result, this);
   }
   {
     SELECT_LEX *upper= unit->outer_select();
@@ -103,6 +113,7 @@ void Item_subselect::init(st_select_lex *select_lex,
     /* The subquery is an expression cache candidate */
     upper->expr_cache_may_be_used[upper->parsing_place]= TRUE;
   }
+  DBUG_PRINT("info", ("engine: 0x%lx", (ulong)engine));
   DBUG_VOID_RETURN;
 }
 
@@ -125,10 +136,14 @@ void Item_subselect::cleanup()
   }
   if (engine)
     engine->cleanup();
-  depends_on.empty();
   reset();
   value_assigned= 0;
   expr_cache= 0;
+  forced_const= FALSE;
+  DBUG_PRINT("info", ("exec_counter: %d", exec_counter));
+#ifndef DBUG_OFF
+  exec_counter= 0;
+#endif
   DBUG_VOID_RETURN;
 }
 
@@ -152,21 +167,47 @@ void Item_in_subselect::cleanup()
     left_expr_cache= NULL;
   }
   first_execution= TRUE;
-  is_constant= FALSE;
+  if (in_strategy & SUBS_MATERIALIZATION)
+    in_strategy= 0;
+  pushed_cond_guards= NULL;
   Item_subselect::cleanup();
   DBUG_VOID_RETURN;
 }
 
+
+void Item_allany_subselect::cleanup()
+{
+  /*
+    The MAX/MIN transformation through injection is reverted through the
+    change_item_tree() mechanism. Revert the select_lex object of the
+    query to its initial state.
+  */
+  for (SELECT_LEX *sl= unit->first_select();
+       sl; sl= sl->next_select())
+    if (in_strategy & SUBS_MAXMIN_INJECTED)
+      sl->with_sum_func= false;
+  Item_in_subselect::cleanup();
+
+}
+
+
 Item_subselect::~Item_subselect()
 {
-  delete engine;
+  DBUG_ENTER("Item_subselect::~Item_subselect");
+  DBUG_PRINT("enter", ("this: 0x%lx", (ulong) this));
+  if (own_engine)
+    delete engine;
+  else if (engine)                              // engine may still be 0
+    engine->cleanup();
+  engine= NULL;
+  DBUG_VOID_RETURN;
 }
 
-Item_subselect::trans_res
+bool
 Item_subselect::select_transformer(JOIN *join)
 {
   DBUG_ENTER("Item_subselect::select_transformer");
-  DBUG_RETURN(RES_OK);
+  DBUG_RETURN(false);
 }
 
 
@@ -177,7 +218,8 @@ bool Item_subselect::fix_fields(THD *thd_param, Item **ref)
   bool res;
 
   DBUG_ASSERT(fixed == 0);
-  engine->set_thd((thd= thd_param));
+  /* There is no reason to get a different THD. */
+  DBUG_ASSERT(thd == thd_param);
   if (!done_first_fix_fields)
   {
     done_first_fix_fields= TRUE;
@@ -200,11 +242,7 @@ bool Item_subselect::fix_fields(THD *thd_param, Item **ref)
   {
     // all transformation is done (used by prepared statements)
     changed= 1;
-  inside_first_fix_fields= FALSE;
-
-
-    // all transformation is done (used by prepared statements)
-    changed= 1;
+    inside_first_fix_fields= FALSE;
 
     /*
       Substitute the current item with an Item_in_optimizer that was
@@ -214,11 +252,14 @@ bool Item_subselect::fix_fields(THD *thd_param, Item **ref)
     */
     if (substitution)
     {
-      // did we changed top item of WHERE condition
+      /*
+        If the top item of the WHERE/HAVING condition changed,
+        set correct WHERE/HAVING for PS.
+      */
       if (unit->outer_select()->where == (*ref))
-	unit->outer_select()->where= substitution; // correct WHERE for PS
+        unit->outer_select()->where= substitution;
       else if (unit->outer_select()->having == (*ref))
-	unit->outer_select()->having= substitution; // correct HAVING for PS
+        unit->outer_select()->having= substitution;
 
       (*ref)= substitution;
       substitution->name= name;
@@ -229,13 +270,13 @@ bool Item_subselect::fix_fields(THD *thd_param, Item **ref)
       if (!(*ref)->fixed)
 	res= (*ref)->fix_fields(thd, ref);
       goto end;
-//psergey-merge:  done_first_fix_fields= FALSE;
+
     }
     // Is it one field subselect?
     if (engine->cols() > max_columns)
     {
       my_error(ER_OPERAND_COLUMNS, MYF(0), 1);
-//psergey-merge:  done_first_fix_fields= FALSE;
+
       goto end;
     }
     fix_length_and_dec();
@@ -253,6 +294,7 @@ bool Item_subselect::fix_fields(THD *thd_param, Item **ref)
 
 end:
   done_first_fix_fields= FALSE;
+  inside_first_fix_fields= FALSE;
   thd->where= save_where;
   return res;
 }
@@ -278,6 +320,69 @@ bool Item_subselect::mark_as_eliminated_processor(uchar *arg)
 }
 
 
+/**
+  Remove a subselect item from its unit so that the unit no longer
+  represents a subquery.
+
+  @param arg  unused parameter
+
+  @return
+    FALSE to force the evaluation of the processor for the subsequent items.
+*/
+
+bool Item_subselect::eliminate_subselect_processor(uchar *arg)
+{
+  unit->item= NULL;
+  unit->exclude_from_tree();
+  eliminated= TRUE;
+  return FALSE;
+}
+
+
+/**
+  Adjust the master select of the subquery to be the fake_select which
+  represents the whole UNION right above the subquery, instead of the
+  last query of the UNION.
+
+  @param arg  pointer to the fake select
+
+  @return
+    FALSE to force the evaluation of the processor for the subsequent items.
+*/
+
+bool Item_subselect::set_fake_select_as_master_processor(uchar *arg)
+{
+  SELECT_LEX *fake_select= (SELECT_LEX*) arg;
+  /*
+    Move the st_select_lex_unit of a subquery from a global ORDER BY clause to
+    become a direct child of the fake_select of a UNION. In this way the
+    ORDER BY that is applied to the temporary table that contains the result of
+    the whole UNION, and all columns in the subquery are resolved against this
+    table. The transformation is applied only for immediate child subqueries of
+    a UNION query.
+  */
+  if (unit->outer_select()->master_unit()->fake_select_lex == fake_select)
+  {
+    /*
+      Set the master of the subquery to be the fake select (i.e. the whole
+      UNION), instead of the last query in the UNION.
+    */
+    fake_select->add_slave(unit);
+    DBUG_ASSERT(unit->outer_select() == fake_select);
+    /* Adjust the name resolution context hierarchy accordingly. */
+    for (SELECT_LEX *sl= unit->first_select(); sl; sl= sl->next_select())
+      sl->context.outer_context= &(fake_select->context);
+    /*
+      Undo Item_subselect::eliminate_subselect_processor because at that phase
+      we don't know yet that the ORDER clause will be moved to the fake select.
+    */
+    unit->item= this;
+    eliminated= FALSE;
+  }
+  return FALSE;
+}
+
+
 bool Item_subselect::mark_as_dependent(THD *thd, st_select_lex *select, 
                                        Item *item)
 {
@@ -395,12 +500,11 @@ void Item_subselect::recalc_used_tables(st_select_lex *new_parent,
           upper->item->walk(&Item::enumerate_field_refs_processor, FALSE,
                             (uchar*)&fixer);
           used_tables_cache |= fixer.used_tables;
-          /*
+/*
           if (after_pullout)
             upper->item->fix_after_pullout(new_parent, &(upper->item));
           upper->item->update_used_tables();
-          used_tables_cache |= upper->item->used_tables();
-          */
+*/          
         }
       }
     }
@@ -475,6 +579,9 @@ bool Item_subselect::exec()
 
   bool res= engine->exec();
 
+#ifndef DBUG_OFF
+  ++exec_counter;
+#endif
   if (engine_changed)
   {
     engine_changed= 0;
@@ -485,6 +592,61 @@ bool Item_subselect::exec()
 }
 
 
+void Item_subselect::get_cache_parameters(List<Item> &parameters)
+{
+  Collect_deps_prm prm= { unit->first_select()->nest_level, &parameters };
+  walk(&Item::collect_outer_ref_processor, TRUE, (uchar*)&prm);
+}
+
+int Item_in_subselect::optimize(double *out_rows, double *cost)
+{
+  int res;
+  DBUG_ENTER("Item_in_subselect::optimize");
+  SELECT_LEX *save_select= thd->lex->current_select;
+  JOIN *join= unit->first_select()->join;
+
+  thd->lex->current_select= join->select_lex;
+  if ((res= join->optimize()))
+    DBUG_RETURN(res);
+
+  /* Calculate #rows and cost of join execution */
+  join->get_partial_cost_and_fanout(join->table_count - join->const_tables, 
+                                    table_map(-1),
+                                    cost, out_rows);
+
+  /*
+    Adjust join output cardinality. There can be these cases:
+    - Have no GROUP BY and no aggregate funcs: we won't get into this 
+      function because such join will be processed as a merged semi-join 
+      (TODO: does it really mean we don't need to handle such cases here at 
+       all? put ASSERT)
+    - Have no GROUP BY but have aggregate funcs: output is 1 record.
+    - Have GROUP BY and have (or not) aggregate funcs:  need to adjust output 
+      cardinality.
+  */
+  thd->lex->current_select= save_select;
+  if (!join->group_list && !join->group_optimized_away &&
+      join->tmp_table_param.sum_func_count)
+  {
+    DBUG_PRINT("info",("Materialized join will have only 1 row (it has "
+                       "aggregates but no GROUP BY"));
+    *out_rows= 1;
+  }
+  
+  /* Now with grouping */
+  if (join->group_list)
+  {
+    DBUG_PRINT("info",("Materialized join has grouping, trying to estimate it"));
+    double output_rows= get_post_group_estimate(join, *out_rows);
+    DBUG_PRINT("info",("Got value of %g", output_rows));
+    *out_rows= output_rows;
+  }
+
+  DBUG_RETURN(res);
+
+}
+
+
 /**
   Check if an expression cache is needed for this subquery
 
@@ -500,7 +662,7 @@ bool Item_subselect::exec()
 
 bool Item_subselect::expr_cache_is_needed(THD *thd)
 {
-  return (depends_on.elements &&
+  return ((engine->uncacheable() & UNCACHEABLE_DEPENDENT) &&
           engine->cols() == 1 &&
           optimizer_flag(thd, OPTIMIZER_SWITCH_SUBQUERY_CACHE) &&
           !(engine->uncacheable() & (UNCACHEABLE_RAND |
@@ -528,8 +690,7 @@ bool Item_subselect::expr_cache_is_needed(THD *thd)
 
 bool Item_in_subselect::expr_cache_is_needed(THD *thd)
 {
-  return (depends_on.elements &&
-          optimizer_flag(thd, OPTIMIZER_SWITCH_SUBQUERY_CACHE) &&
+  return (optimizer_flag(thd, OPTIMIZER_SWITCH_SUBQUERY_CACHE) &&
           !(engine->uncacheable() & (UNCACHEABLE_RAND |
                                      UNCACHEABLE_SIDEEFFECT)));
 }
@@ -554,7 +715,7 @@ bool Item_in_subselect::exec()
     - on a cost-based basis, that takes into account the cost of a cache
       lookup, the cache hit rate, and the savings per cache hit.
   */
-  if (!left_expr_cache && exec_method == MATERIALIZATION)
+  if (!left_expr_cache && (in_strategy & SUBS_MATERIALIZATION))
     init_left_expr_cache();
 
   /*
@@ -607,12 +768,15 @@ Item *Item_subselect::get_tmp_table_item(THD *thd_arg)
 
 void Item_subselect::update_used_tables()
 {
-  recalc_used_tables(parent_select, FALSE);
-  if (!engine->uncacheable())
+  if (!forced_const)
   {
-    // did all used tables become static?
-    if (!(used_tables_cache & ~engine->upper_select_const_tables()))
-      const_item_cache= 1;
+    recalc_used_tables(parent_select, FALSE);
+    if (!engine->uncacheable())
+    {
+      // did all used tables become static?
+      if (!(used_tables_cache & ~engine->upper_select_const_tables()))
+        const_item_cache= 1;
+    }
   }
 }
 
@@ -678,7 +842,7 @@ Item_maxmin_subselect::Item_maxmin_subselect(THD *thd_param,
     of Items belonged to subquery, which will be not repeated
   */
   used_tables_cache= parent->get_used_tables_cache();
-  const_item_cache= parent->get_const_item_cache();
+  const_item_cache= parent->const_item();
 
   /*
     this subquery always creates during preparation, so we can assign
@@ -716,8 +880,7 @@ void Item_maxmin_subselect::print(String *str, enum_query_type query_type)
 
 void Item_singlerow_subselect::reset()
 {
-  eliminated= FALSE;
-  null_value= TRUE;
+  Item_subselect::reset();
   if (value)
     value->null_value= TRUE;
 }
@@ -732,13 +895,17 @@ void Item_singlerow_subselect::reset()
   - switch off this optimization for prepare statement,
   because we do not rollback this changes.
   Make rollback for it, or special name resolving mode in 5.0.
+
+  @param join  Join object of the subquery (i.e. 'child' join).
+
+  @retval false  The subquery was transformed
 */
-Item_subselect::trans_res
+bool
 Item_singlerow_subselect::select_transformer(JOIN *join)
 {
   DBUG_ENTER("Item_singlerow_subselect::select_transformer");
   if (changed)
-    DBUG_RETURN(RES_OK);
+    DBUG_RETURN(false);
 
   SELECT_LEX *select_lex= join->select_lex;
   Query_arena *arena= thd->stmt_arena;
@@ -765,7 +932,6 @@ Item_singlerow_subselect::select_transformer(JOIN *join)
       !arena->is_stmt_prepare_or_first_sp_execute()
       )
   {
-
     have_to_be_excluded= 1;
     if (thd->lex->describe)
     {
@@ -781,18 +947,14 @@ Item_singlerow_subselect::select_transformer(JOIN *join)
     */
     substitution->walk(&Item::remove_dependence_processor, 0,
 		       (uchar *) select_lex->outer_select());
-    DBUG_RETURN(RES_REDUCE);
   }
-  DBUG_RETURN(RES_OK);
+  DBUG_RETURN(false);
 }
 
 
 void Item_singlerow_subselect::store(uint i, Item *item)
 {
   row[i]->store(item);
-  //psergey-merge: can do without that: row[i]->cache_value();
-  //psergey-backport-timours: ^ really, without that ^ 
-  //psergey-try-merge-again:
   row[i]->cache_value();
 }
 
@@ -861,7 +1023,7 @@ Item* Item_singlerow_subselect::expr_cache_insert_transformer(uchar *thd_arg)
     DBUG_RETURN(expr_cache);
 
   if (expr_cache_is_needed(thd) &&
-      (expr_cache= set_expr_cache(thd, depends_on)))
+      (expr_cache= set_expr_cache(thd)))
     DBUG_RETURN(expr_cache);
   DBUG_RETURN(this);
 }
@@ -1014,11 +1176,14 @@ bool Item_in_subselect::test_limit(st_select_lex_unit *unit_arg)
 Item_in_subselect::Item_in_subselect(Item * left_exp,
 				     st_select_lex *select_lex):
   Item_exists_subselect(), left_expr_cache(0), first_execution(TRUE),
-  is_constant(FALSE), optimizer(0), pushed_cond_guards(NULL),
-  exec_method(NOT_TRANSFORMED), upper_item(0)
+  optimizer(0), pushed_cond_guards(NULL), in_strategy(0),
+  is_jtbm_merged(FALSE), is_flattenable_semijoin(FALSE),
+  is_registered_semijoin(FALSE), 
+  upper_item(0)
 {
   DBUG_ENTER("Item_in_subselect::Item_in_subselect");
   left_expr= left_exp;
+  func= &eq_creator;
   init(select_lex, new select_exists_subselect(this));
   max_columns= UINT_MAX;
   maybe_null= 1;
@@ -1053,13 +1218,42 @@ Item_allany_subselect::Item_allany_subselect(Item * left_exp,
 }
 
 
+/**
+  Initialize length and decimals for EXISTS  and inherited (IN/ALL/ANY)
+  subqueries
+*/
+
+void Item_exists_subselect::init_length_and_dec()
+{
+  decimals= 0;
+  max_length= 1;
+  max_columns= engine->cols();
+}
+
+
 void Item_exists_subselect::fix_length_and_dec()
 {
-   decimals= 0;
-   max_length= 1;
-   max_columns= engine->cols();
-  /* We need only 1 row to determine existence */
+  DBUG_ENTER("Item_exists_subselect::fix_length_and_dec");
+  init_length_and_dec();
+  /*
+    We need only 1 row to determine existence (i.e. any EXISTS that is not
+    an IN always requires LIMIT 1)
+  */
   unit->global_parameters->select_limit= new Item_int((int32) 1);
+  DBUG_PRINT("info", ("Set limit to 1"));
+  DBUG_VOID_RETURN;
+}
+
+
+void Item_in_subselect::fix_length_and_dec()
+{
+  DBUG_ENTER("Item_in_subselect::fix_length_and_dec");
+  init_length_and_dec();
+  /*
+    Unlike Item_exists_subselect, LIMIT 1 is set later for
+    Item_in_subselect, depending on the chosen strategy.
+  */
+  DBUG_VOID_RETURN;
 }
 
 
@@ -1090,7 +1284,7 @@ Item* Item_exists_subselect::expr_cache_insert_transformer(uchar *thd_arg)
     DBUG_RETURN(expr_cache);
 
   if (substype() == EXISTS_SUBS && expr_cache_is_needed(thd) &&
-      (expr_cache= set_expr_cache(thd, depends_on)))
+      (expr_cache= set_expr_cache(thd)))
     DBUG_RETURN(expr_cache);
   DBUG_RETURN(this);
 }
@@ -1244,9 +1438,9 @@ String *Item_in_subselect::val_str(String *str)
 bool Item_in_subselect::val_bool()
 {
   DBUG_ASSERT(fixed == 1);
-  null_value= was_null= FALSE;
-  if (is_constant)
+  if (forced_const)
     return value;
+  null_value= was_null= FALSE;
   if (exec())
   {
     reset();
@@ -1278,59 +1472,27 @@ my_decimal *Item_in_subselect::val_decimal(my_decimal *decimal_value)
 }
 
 
-/* 
-  Rewrite a single-column IN/ALL/ANY subselect
-
-  SYNOPSIS
-    Item_in_subselect::single_value_transformer()
-      join  Join object of the subquery (i.e. 'child' join).
-      func  Subquery comparison creator
-
-  DESCRIPTION
-    Rewrite a single-column subquery using rule-based approach. The subquery
-    
-       oe $cmp$ (SELECT ie FROM ... WHERE subq_where ... HAVING subq_having)
-    
-    First, try to convert the subquery to scalar-result subquery in one of
-    the forms:
-    
-       - oe $cmp$ (SELECT MAX(...) )  // handled by Item_singlerow_subselect
-       - oe $cmp$ <max>(SELECT ...)   // handled by Item_maxmin_subselect
-   
-    If that fails, the subquery will be handled with class Item_in_optimizer, 
-    Inject the predicates into subquery, i.e. convert it to:
-
-    - If the subquery has aggregates, GROUP BY, or HAVING, convert to
+/**
+  Prepare a single-column IN/ALL/ANY subselect for rewriting.
 
-       SELECT ie FROM ...  HAVING subq_having AND 
-                                   trigcond(oe $cmp$ ref_or_null_helper<ie>)
-                                   
-      the addition is wrapped into trigger only when we want to distinguish
-      between NULL and FALSE results.
+  @param join  Join object of the subquery (i.e. 'child' join).
 
-    - Otherwise (no aggregates/GROUP BY/HAVING) convert it to one of the
-      following:
+  @details
 
-      = If we don't need to distinguish between NULL and FALSE subquery:
-        
-        SELECT 1 FROM ... WHERE (oe $cmp$ ie) AND subq_where
+  Prepare the given single-column subquery to be rewritten.
 
-      = If we need to distinguish between those:
+  If the subquery has no tables, it will be turned into an expression between
+  the left part and the SELECT list.
 
-        SELECT 1 FROM ...
-          WHERE  subq_where AND trigcond((oe $cmp$ ie) OR (ie IS NULL))
-          HAVING trigcond(<is_not_null_test>(ie))
+  In other cases the subquery will be wrapped in an Item_in_optimizer, which
+  allows it to be turned later into EXISTS or MAX/MIN.
 
-  RETURN
-    RES_OK     Either subquery was transformed, or appopriate
-                       predicates where injected into it.
-    RES_REDUCE The subquery was reduced to non-subquery
-    RES_ERROR  Error
+  @retval false  The subquery was transformed
+  @retval true   Error
 */
 
-Item_subselect::trans_res
-Item_in_subselect::single_value_transformer(JOIN *join,
-					    Comp_creator *func)
+bool
+Item_in_subselect::single_value_transformer(JOIN *join)
 {
   SELECT_LEX *select_lex= join->select_lex;
   DBUG_ENTER("Item_in_subselect::single_value_transformer");
@@ -1343,95 +1505,42 @@ Item_in_subselect::single_value_transformer(JOIN *join,
   if (select_lex->item_list.elements > 1)
   {
     my_error(ER_OPERAND_COLUMNS, MYF(0), 1);
-    DBUG_RETURN(RES_ERROR);
+    DBUG_RETURN(true);
   }
 
-  /*
-    If this is an ALL/ANY single-value subselect, try to rewrite it with
-    a MIN/MAX subselect. We can do that if a possible NULL result of the
-    subselect can be ignored.
-    E.g. SELECT * FROM t1 WHERE b > ANY (SELECT a FROM t2) can be rewritten
-    with SELECT * FROM t1 WHERE b > (SELECT MAX(a) FROM t2).
-    We can't check that this optimization is safe if it's not a top-level
-    item of the WHERE clause (e.g. because the WHERE clause can contain IS
-    NULL/IS NOT NULL functions). If so, we rewrite ALL/ANY with NOT EXISTS
-    later in this method.
-  */
-  if ((abort_on_null || (upper_item && upper_item->top_level())) &&
-      !select_lex->master_unit()->uncacheable && !func->eqne_op())
+  Item* join_having= join->having ? join->having : join->tmp_having;
+  if (!(join_having || select_lex->with_sum_func ||
+        select_lex->group_list.elements) &&
+      select_lex->table_list.elements == 0 &&
+      !select_lex->master_unit()->is_union())
   {
-    if (substitution)
-    {
-      // It is second (third, ...) SELECT of UNION => All is done
-      DBUG_RETURN(RES_OK);
-    }
-
-    Item *subs;
-    if (!select_lex->group_list.elements &&
-        !select_lex->having &&
-	!select_lex->with_sum_func &&
-	!(select_lex->next_select()) &&
-        select_lex->table_list.elements)
-    {
-      Item_sum_hybrid *item;
-      nesting_map save_allow_sum_func;
-      if (func->l_op())
-      {
-	/*
-	  (ALL && (> || =>)) || (ANY && (< || =<))
-	  for ALL condition is inverted
-	*/
-	item= new Item_sum_max(*select_lex->ref_pointer_array);
-      }
-      else
-      {
-	/*
-	  (ALL && (< || =<)) || (ANY && (> || =>))
-	  for ALL condition is inverted
-	*/
-	item= new Item_sum_min(*select_lex->ref_pointer_array);
-      }
-      if (upper_item)
-        upper_item->set_sum_test(item);
-      *select_lex->ref_pointer_array= item;
-      {
-	List_iterator<Item> it(select_lex->item_list);
-	it++;
-	it.replace(item);
-      }
-
-      save_allow_sum_func= thd->lex->allow_sum_func;
-      thd->lex->allow_sum_func|= 1 << thd->lex->current_select->nest_level;
-      /*
-	Item_sum_(max|min) can't substitute other item => we can use 0 as
-        reference, also Item_sum_(max|min) can't be fixed after creation, so
-        we do not check item->fixed
-      */
-      if (item->fix_fields(thd, 0))
-	DBUG_RETURN(RES_ERROR);
-      thd->lex->allow_sum_func= save_allow_sum_func; 
-      /* we added aggregate function => we have to change statistic */
-      count_field_types(select_lex, &join->tmp_table_param, join->all_fields, 
-                        0);
-
-      subs= new Item_singlerow_subselect(select_lex);
-    }
-    else
+    Item *where_item= (Item*) select_lex->item_list.head();
+    /*
+      it is single select without tables => possible optimization
+      remove the dependence mark since the item is moved to upper
+      select and is not outer anymore.
+    */
+    where_item->walk(&Item::remove_dependence_processor, 0,
+                     (uchar *) select_lex->outer_select());
+    substitution= func->create(left_expr, where_item);
+    have_to_be_excluded= 1;
+    if (thd->lex->describe)
     {
-      Item_maxmin_subselect *item;
-      subs= item= new Item_maxmin_subselect(thd, this, select_lex, func->l_op());
-      if (upper_item)
-        upper_item->set_sub_test(item);
+      char warn_buff[MYSQL_ERRMSG_SIZE];
+      sprintf(warn_buff, ER(ER_SELECT_REDUCED), select_lex->select_number);
+      push_warning(thd, MYSQL_ERROR::WARN_LEVEL_NOTE,
+                   ER_SELECT_REDUCED, warn_buff);
     }
-    /* fix fields is already called for  left expression */
-    substitution= func->create(left_expr, subs);
-    DBUG_RETURN(RES_OK);
+    DBUG_RETURN(false);
   }
 
+  /*
+    Wrap the current IN predicate in an Item_in_optimizer. The actual
+    substitution in the Item tree takes place in Item_subselect::fix_fields.
+  */
   if (!substitution)
   {
     /* We're invoked for the 1st (or the only) SELECT in the subquery UNION */
-    SELECT_LEX_UNIT *master_unit= select_lex->master_unit();
     substitution= optimizer;
 
     SELECT_LEX *current= thd->lex->current_select;
@@ -1441,7 +1550,7 @@ Item_in_subselect::single_value_transformer(JOIN *join,
     if (!optimizer || optimizer->fix_left(thd, 0))
     {
       thd->lex->current_select= current;
-      DBUG_RETURN(RES_ERROR);
+      DBUG_RETURN(true);
     }
     thd->lex->current_select= current;
 
@@ -1457,34 +1566,166 @@ Item_in_subselect::single_value_transformer(JOIN *join,
 			      (char *)"<no matter>",
 			      (char *)in_left_expr_name);
 
-    master_unit->uncacheable|= UNCACHEABLE_DEPENDENT;
-    //psergey: placed then removed: select_lex->uncacheable|= UNCACHEABLE_DEPENDENT;
   }
 
-  if (!abort_on_null && left_expr->maybe_null && !pushed_cond_guards)
+  DBUG_RETURN(false);
+}
+
+
+/**
+  Apply the max/min transformation to an ALL/ANY subquery if it is
+  possible.
+
+  @param join  Join object of the subquery (i.e. 'child' join).
+
+  @details
+
+  If this is an ALL/ANY single-value subselect, try to rewrite it with
+  a MIN/MAX subselect. We can do that if a possible NULL result of the
+  subselect can be ignored.
+  E.g. SELECT * FROM t1 WHERE b > ANY (SELECT a FROM t2) can be rewritten
+  with SELECT * FROM t1 WHERE b > (SELECT MAX(a) FROM t2).
+  We can't check that this optimization is safe if it's not a top-level
+  item of the WHERE clause (e.g. because the WHERE clause can contain IS
+  NULL/IS NOT NULL functions). If so, we rewrite ALL/ANY with NOT EXISTS
+  later in this method.
+
+  @retval false  The subquery was transformed
+  @retval true   Error
+*/
+
+bool Item_allany_subselect::transform_into_max_min(JOIN *join)
+{
+  DBUG_ENTER("Item_allany_subselect::transform_into_max_min");
+  if (!(in_strategy & (SUBS_MAXMIN_INJECTED | SUBS_MAXMIN_ENGINE)))
+    DBUG_RETURN(false);
+  Item **place= optimizer->arguments() + 1;
+  THD *thd= join->thd;
+  SELECT_LEX *select_lex= join->select_lex;
+  Item *subs;
+
+  /*
+  */
+  DBUG_ASSERT(!substitution);
+
+  if (!select_lex->group_list.elements &&
+      !select_lex->having &&
+      !select_lex->with_sum_func &&
+      !(select_lex->next_select()) &&
+      select_lex->table_list.elements)
   {
-    if (!(pushed_cond_guards= (bool*)join->thd->alloc(sizeof(bool))))
-      DBUG_RETURN(RES_ERROR);
-    pushed_cond_guards[0]= TRUE;
-  }
+    Item_sum_hybrid *item;
+    nesting_map save_allow_sum_func;
+    if (func->l_op())
+    {
+      /*
+        (ALL && (> || =>)) || (ANY && (< || =<))
+        for ALL condition is inverted
+      */
+      item= new Item_sum_max(*select_lex->ref_pointer_array);
+    }
+    else
+    {
+      /*
+        (ALL && (< || =<)) || (ANY && (> || =>))
+        for ALL condition is inverted
+      */
+      item= new Item_sum_min(*select_lex->ref_pointer_array);
+    }
+    if (upper_item)
+      upper_item->set_sum_test(item);
+    thd->change_item_tree(select_lex->ref_pointer_array, item);
+    {
+      List_iterator<Item> it(select_lex->item_list);
+      it++;
+      thd->change_item_tree(it.ref(), item);
+    }
+
+    save_allow_sum_func= thd->lex->allow_sum_func;
+    thd->lex->allow_sum_func|= 1 << thd->lex->current_select->nest_level;
+    /*
+      Item_sum_(max|min) can't substitute other item => we can use 0 as
+      reference, also Item_sum_(max|min) can't be fixed after creation, so
+      we do not check item->fixed
+    */
+    if (item->fix_fields(thd, 0))
+      DBUG_RETURN(true);
+    thd->lex->allow_sum_func= save_allow_sum_func; 
+    /* we added aggregate function => we have to change statistic */
+    count_field_types(select_lex, &join->tmp_table_param, join->all_fields, 
+                      0);
+    if (join->prepare_stage2())
+      DBUG_RETURN(true);
+    subs= new Item_singlerow_subselect(select_lex);
 
+    /*
+      Remove other strategies if any (we already changed the query and
+      can't apply other strategy).
+    */
+    in_strategy= SUBS_MAXMIN_INJECTED;
+  }
+  else
+  {
+    Item_maxmin_subselect *item;
+    subs= item= new Item_maxmin_subselect(thd, this, select_lex, func->l_op());
+    if (upper_item)
+      upper_item->set_sub_test(item);
+    /*
+      Remove other strategies if any (we already changed the query and
+      can't apply other strategy).
+    */
+    in_strategy= SUBS_MAXMIN_ENGINE;
+  }
   /*
-    If this IN predicate can be computed via materialization, do not
-    perform the IN -> EXISTS transformation.
+    The swap is needed for expressions of type 'f1 < ALL ( SELECT ....)'
+    where we want to evaluate the sub query even if f1 would be null.
   */
-  if (exec_method == MATERIALIZATION)
-    DBUG_RETURN(RES_OK);
+  subs= func->create_swap(left_expr, subs);
+  thd->change_item_tree(place, subs);
+  if (subs->fix_fields(thd, &subs))
+    DBUG_RETURN(true);
+  DBUG_ASSERT(subs == (*place)); // There was no substitutions
+
+  select_lex->master_unit()->uncacheable&= ~UNCACHEABLE_DEPENDENT_INJECTED;
+  select_lex->uncacheable&= ~UNCACHEABLE_DEPENDENT_INJECTED;
+
+  DBUG_RETURN(false);
+}
+
+
+bool Item_in_subselect::fix_having(Item *having, SELECT_LEX *select_lex)
+{
+  bool fix_res= 0;
+  if (!having->fixed)
+  {
+    select_lex->having_fix_field= 1;
+    fix_res= having->fix_fields(thd, 0);
+    select_lex->having_fix_field= 0;
+  }
+  return fix_res;
+}
 
-  /* Perform the IN=>EXISTS transformation. */
-  DBUG_RETURN(single_value_in_to_exists_transformer(join, func));
+bool Item_allany_subselect::is_maxmin_applicable(JOIN *join)
+{
+  /*
+    Check if the max/min optimization is applicable: it is a top-level item
+    of the WHERE condition.
+  */
+  return (abort_on_null || (upper_item && upper_item->is_top_level_item())) &&
+      !join->select_lex->master_unit()->uncacheable && !func->eqne_op();
 }
 
 
 /**
-  Transofrm an IN predicate into EXISTS via predicate injection.
+  Create the predicates needed to transform a single-column IN/ALL/ANY
+  subselect into a correlated EXISTS via predicate injection.
 
-  @details The transformation injects additional predicates into the subquery
-  (and makes the subquery correlated) as follows.
+  @param join[in]  Join object of the subquery (i.e. 'child' join).
+  @param where_item[out]   the in-to-exists addition to the where clause
+  @param having_item[out]  the in-to-exists addition to the having clause
+
+  @details
+  The correlated predicates are created as follows:
 
   - If the subquery has aggregates, GROUP BY, or HAVING, convert to
 
@@ -1499,34 +1740,38 @@ Item_in_subselect::single_value_transformer(JOIN *join,
 
     = If we don't need to distinguish between NULL and FALSE subquery:
         
-      SELECT 1 FROM ... WHERE (oe $cmp$ ie) AND subq_where
+      SELECT ie FROM ... WHERE subq_where AND (oe $cmp$ ie)
 
     = If we need to distinguish between those:
 
-      SELECT 1 FROM ...
+      SELECT ie FROM ...
         WHERE  subq_where AND trigcond((oe $cmp$ ie) OR (ie IS NULL))
         HAVING trigcond(<is_not_null_test>(ie))
 
-    @param join  Join object of the subquery (i.e. 'child' join).
-    @param func  Subquery comparison creator
-
-    @retval RES_OK     Either subquery was transformed, or appopriate
-                       predicates where injected into it.
-    @retval RES_REDUCE The subquery was reduced to non-subquery
-    @retval RES_ERROR  Error
+  @retval false If the new conditions were created successfully
+  @retval true  Error
 */
 
-Item_subselect::trans_res
-Item_in_subselect::single_value_in_to_exists_transformer(JOIN * join, Comp_creator *func)
+bool
+Item_in_subselect::create_single_in_to_exists_cond(JOIN * join,
+                                                   Item **where_item,
+                                                   Item **having_item)
 {
   SELECT_LEX *select_lex= join->select_lex;
-  DBUG_ENTER("Item_in_subselect::single_value_in_to_exists_transformer");
+  /*
+    The non-transformed HAVING clause of 'join' may be stored in two ways
+    during JOIN::optimize: this->tmp_having= this->having; this->having= 0;
+  */
+  Item* join_having= join->having ? join->having : join->tmp_having;
+
+  DBUG_ENTER("Item_in_subselect::create_single_in_to_exists_cond");
 
-  select_lex->uncacheable|= UNCACHEABLE_DEPENDENT;
-  if (join->having || select_lex->with_sum_func ||
+  *where_item= NULL;
+  *having_item= NULL;
+
+  if (join_having || select_lex->with_sum_func ||
       select_lex->group_list.elements)
   {
-    bool tmp;
     Item *item= func->create(expr,
                              new Item_ref_null_helper(&select_lex->context,
                                                       this,
@@ -1542,25 +1787,12 @@ Item_in_subselect::single_value_in_to_exists_transformer(JOIN * join, Comp_creat
       */
       item= new Item_func_trig_cond(item, get_cond_guard(0));
     }
-    
-    /*
-      AND and comparison functions can't be changed during fix_fields()
-      we can assign select_lex->having here, and pass 0 as last
-      argument (reference) to fix_fields()
-    */
-    select_lex->having= join->having= and_items(join->having, item);
-    if (join->having == item)
-      item->name= (char*)in_having_cond;
-    select_lex->having->top_level_item();
-    select_lex->having_fix_field= 1;
-    /*
-      we do not check join->having->fixed, because Item_and (from and_items)
-      or comparison function (from func->create) can't be fixed after creation
-    */
-    tmp= join->having->fix_fields(thd, 0);
-    select_lex->having_fix_field= 0;
-    if (tmp)
-      DBUG_RETURN(RES_ERROR);
+
+    if (!join_having)
+      item->name= (char*) in_having_cond;
+    if (fix_having(item, select_lex))
+      DBUG_RETURN(true);
+    *having_item= item;
   }
   else
   {
@@ -1568,13 +1800,8 @@ Item_in_subselect::single_value_in_to_exists_transformer(JOIN * join, Comp_creat
 
     if (select_lex->table_list.elements)
     {
-      bool tmp;
-      Item *having= item, *orig_item= item;
-      select_lex->item_list.empty();
-      select_lex->item_list.push_back(new Item_int("Not_used",
-                                                   (longlong) 1,
-                                                   MY_INT64_NUM_DECIMAL_DIGITS));
-      select_lex->ref_pointer_array[0]= select_lex->item_list.head();
+      Item *having= item;
+      Item *orig_item= item;
        
       item= func->create(expr, item);
       if (!abort_on_null && orig_item->maybe_null)
@@ -1584,25 +1811,13 @@ Item_in_subselect::single_value_in_to_exists_transformer(JOIN * join, Comp_creat
         {
           if (!(having= new Item_func_trig_cond(having,
                                                 get_cond_guard(0))))
-            DBUG_RETURN(RES_ERROR);
+            DBUG_RETURN(true);
         }
-	/*
-	  Item_is_not_null_test can't be changed during fix_fields()
-	  we can assign select_lex->having here, and pass 0 as last
-	  argument (reference) to fix_fields()
-	*/
-        having->name= (char*)in_having_cond;
-	select_lex->having= join->having= having;
-	select_lex->having_fix_field= 1;
-        /*
-          we do not check join->having->fixed, because Item_and (from
-          and_items) or comparison function (from func->create) can't be
-          fixed after creation
-        */
-	tmp= join->having->fix_fields(thd, 0);
-        select_lex->having_fix_field= 0;
-        if (tmp)
-	  DBUG_RETURN(RES_ERROR);
+        having->name= (char*) in_having_cond;
+        if (fix_having(having, select_lex))
+          DBUG_RETURN(true);
+        *having_item= having;
+
 	item= new Item_cond_or(item,
 			       new Item_func_isnull(orig_item));
       }
@@ -1613,39 +1828,23 @@ Item_in_subselect::single_value_in_to_exists_transformer(JOIN * join, Comp_creat
       if (!abort_on_null && left_expr->maybe_null)
       {
         if (!(item= new Item_func_trig_cond(item, get_cond_guard(0))))
-          DBUG_RETURN(RES_ERROR);
+          DBUG_RETURN(true);
       }
+
       /*
         TODO: figure out why the following is done here in 
         single_value_transformer but there is no corresponding action in
         row_value_transformer?
       */
-      item->name= (char *)in_additional_cond;
-
-      /*
-	AND can't be changed during fix_fields()
-	we can assign select_lex->having here, and pass 0 as last
-	argument (reference) to fix_fields()
-      */
-      select_lex->where= join->conds= and_items(join->conds, item);
-      select_lex->where->top_level_item();
-      /*
-        we do not check join->conds->fixed, because Item_and can't be fixed
-        after creation
-      */
-      if (join->conds->fix_fields(thd, 0))
-	DBUG_RETURN(RES_ERROR);
+      item->name= (char *) in_additional_cond;
+      if (!item->fixed && item->fix_fields(thd, 0))
+        DBUG_RETURN(true);
+      *where_item= item;
     }
     else
     {
-      bool tmp;
       if (select_lex->master_unit()->is_union())
       {
-	/*
-	  comparison functions can't be changed during fix_fields()
-	  we can assign select_lex->having here, and pass 0 as last
-	  argument (reference) to fix_fields()
-	*/
         Item *new_having=
           func->create(expr,
                        new Item_ref_null_helper(&select_lex->context, this,
@@ -1656,49 +1855,40 @@ Item_in_subselect::single_value_in_to_exists_transformer(JOIN * join, Comp_creat
         {
           if (!(new_having= new Item_func_trig_cond(new_having,
                                                     get_cond_guard(0))))
-            DBUG_RETURN(RES_ERROR);
+            DBUG_RETURN(true);
         }
-        new_having->name= (char*)in_having_cond;
-	select_lex->having= join->having= new_having;
-	select_lex->having_fix_field= 1;
-        
-        /*
-          we do not check join->having->fixed, because comparison function
-          (from func->create) can't be fixed after creation
-        */
-	tmp= join->having->fix_fields(thd, 0);
-        select_lex->having_fix_field= 0;
-        if (tmp)
-	  DBUG_RETURN(RES_ERROR);
+
+        new_having->name= (char*) in_having_cond;
+        if (fix_having(new_having, select_lex))
+          DBUG_RETURN(true);
+        *having_item= new_having;
       }
       else
-      {
-	// it is single select without tables => possible optimization
-        // remove the dependence mark since the item is moved to upper
-        // select and is not outer anymore.
-        item->walk(&Item::remove_dependence_processor, 0,
-                           (uchar *) select_lex->outer_select());
-	item= func->create(left_expr, item);
-	// fix_field of item will be done in time of substituting
-	substitution= item;
-	have_to_be_excluded= 1;
-	if (thd->lex->describe)
-	{
-	  char warn_buff[MYSQL_ERRMSG_SIZE];
-	  sprintf(warn_buff, ER(ER_SELECT_REDUCED), select_lex->select_number);
-	  push_warning(thd, MYSQL_ERROR::WARN_LEVEL_NOTE,
-		       ER_SELECT_REDUCED, warn_buff);
-	}
-	DBUG_RETURN(RES_REDUCE);
-      }
+        DBUG_ASSERT(false);
     }
   }
 
-  DBUG_RETURN(RES_OK);
+  DBUG_RETURN(false);
 }
 
 
-Item_subselect::trans_res
+/**
+  Wrap a multi-column IN/ALL/ANY subselect into an Item_in_optimizer.
+
+  @param join  Join object of the subquery (i.e. 'child' join).
+
+  @details
+  The subquery predicate is wrapped into an Item_in_optimizer. Later the query
+  optimization phase chooses whether the subquery under the Item_in_optimizer
+  will be further transformed into an equivalent correlated EXISTS by injecting
+  additional predicates, or will be executed via subquery materialization in its
+  unmodified form.
+
+  @retval false  The subquery was transformed
+  @retval true   Error
+*/
+
+bool
 Item_in_subselect::row_value_transformer(JOIN *join)
 {
   SELECT_LEX *select_lex= join->select_lex;
@@ -1710,7 +1900,7 @@ Item_in_subselect::row_value_transformer(JOIN *join)
   if (select_lex->item_list.elements != cols_num)
   {
     my_error(ER_OPERAND_COLUMNS, MYF(0), cols_num);
-    DBUG_RETURN(RES_ERROR);
+    DBUG_RETURN(true);
   }
 
   /*
@@ -1729,94 +1919,113 @@ Item_in_subselect::row_value_transformer(JOIN *join)
     if (!optimizer || optimizer->fix_left(thd, 0))
     {
       thd->lex->current_select= current;
-      DBUG_RETURN(RES_ERROR);
+      DBUG_RETURN(true);
     }
 
     // we will refer to upper level cache array => we have to save it in PS
     optimizer->keep_top_level_cache();
 
     thd->lex->current_select= current;
-    master_unit->uncacheable|= UNCACHEABLE_DEPENDENT;
-
-    if (!abort_on_null && left_expr->maybe_null && !pushed_cond_guards)
-    {
-      if (!(pushed_cond_guards= (bool*)join->thd->alloc(sizeof(bool) *
-                                                        left_expr->cols())))
-        DBUG_RETURN(RES_ERROR);
-      for (uint i= 0; i < cols_num; i++)
-        pushed_cond_guards[i]= TRUE;
-    }
+    /*
+      The uncacheable property controls a number of actions, e.g. whether to
+      save/restore (via init_save_join_tab/restore_tmp) the original JOIN for
+      plans with a temp table where the original JOIN was overridden by
+      make_simple_join. The UNCACHEABLE_EXPLAIN is ignored by EXPLAIN, thus
+      non-correlated subqueries will not appear as such to EXPLAIN.
+    */
+    master_unit->uncacheable|= UNCACHEABLE_EXPLAIN;
+    select_lex->uncacheable|= UNCACHEABLE_EXPLAIN;
   }
 
-  /*
-    If this IN predicate can be computed via materialization, do not
-    perform the IN -> EXISTS transformation.
-  */
-  if (exec_method == MATERIALIZATION)
-    DBUG_RETURN(RES_OK);
-
-  /* Perform the IN=>EXISTS transformation. */
-  DBUG_RETURN(row_value_in_to_exists_transformer(join));
+  DBUG_RETURN(false);
 }
 
 
 /**
-  Tranform a (possibly non-correlated) IN subquery into a correlated EXISTS.
+  Create the predicates needed to transform a multi-column IN/ALL/ANY
+  subselect into a correlated EXISTS via predicate injection.
 
-  @todo
-  The IF-ELSE below can be refactored so that there is no duplication of the
-  statements that create the new conditions. For this we have to invert the IF
-  and the FOR statements as this:
-  for (each left operand)
-    create the equi-join condition
-    if (is_having_used || !abort_on_null)
-      create the "is null" and is_not_null_test items
-    if (is_having_used)
-      add the equi-join and the null tests to HAVING
-    else
-      add the equi-join and the "is null" to WHERE
-      add the is_not_null_test to HAVING
+  @details
+  The correlated predicates are created as follows:
+
+  - If the subquery has aggregates, GROUP BY, or HAVING, convert to
+
+    (l1, l2, l3) IN (SELECT v1, v2, v3 ... HAVING having)
+    =>
+    EXISTS (SELECT ... HAVING having and
+                              (l1 = v1 or is null v1) and
+                              (l2 = v2 or is null v2) and
+                              (l3 = v3 or is null v3) and
+                              is_not_null_test(v1) and
+                              is_not_null_test(v2) and
+                              is_not_null_test(v3))
+
+    where is_not_null_test is used to register NULLs so that, if no match is
+    found, the correct NULL value can be returned.
+
+  - Otherwise (no aggregates/GROUP BY/HAVING) convert the subquery as follows:
+
+    (l1, l2, l3) IN (SELECT v1, v2, v3 ... WHERE where)
+    =>
+    EXISTS (SELECT ... WHERE where and
+                             (l1 = v1 or is null v1) and
+                             (l2 = v2 or is null v2) and
+                             (l3 = v3 or is null v3)
+                       HAVING is_not_null_test(v1) and
+                              is_not_null_test(v2) and
+                              is_not_null_test(v3))
+    where is_not_null_test registers NULL values but rejects rows.
+
+    In case we do not need a correct NULL, we have a simpler construction:
+    EXISTS (SELECT ... WHERE where and
+                             (l1 = v1) and
+                             (l2 = v2) and
+                             (l3 = v3))
+
+  @param join[in]  Join object of the subquery (i.e. 'child' join).
+  @param where_item[out]   the in-to-exists addition to the where clause
+  @param having_item[out]  the in-to-exists addition to the having clause
+
+  @retval false  If the new conditions were created successfully
+  @retval true   Error
 */
 
-Item_subselect::trans_res
-Item_in_subselect::row_value_in_to_exists_transformer(JOIN * join)
+bool
+Item_in_subselect::create_row_in_to_exists_cond(JOIN * join,
+                                                Item **where_item,
+                                                Item **having_item)
 {
   SELECT_LEX *select_lex= join->select_lex;
-  Item *having_item= 0;
   uint cols_num= left_expr->cols();
-  bool is_having_used= (join->having || select_lex->with_sum_func ||
+  /*
+    The non-transformed HAVING clause of 'join' may be stored in two ways
+    during JOIN::optimize: this->tmp_having= this->having; this->having= 0;
+  */
+  Item* join_having= join->having ? join->having : join->tmp_having;
+  bool is_having_used= (join_having || select_lex->with_sum_func ||
                         select_lex->group_list.first ||
                         !select_lex->table_list.elements);
 
-  DBUG_ENTER("Item_in_subselect::row_value_in_to_exists_transformer");
+  DBUG_ENTER("Item_in_subselect::create_row_in_to_exists_cond");
+
+  *where_item= NULL;
+  *having_item= NULL;
 
-  select_lex->uncacheable|= UNCACHEABLE_DEPENDENT;
   if (is_having_used)
   {
-    /*
-      (l1, l2, l3) IN (SELECT v1, v2, v3 ... HAVING having) =>
-      EXISTS (SELECT ... HAVING having and
-                                (l1 = v1 or is null v1) and
-                                (l2 = v2 or is null v2) and
-                                (l3 = v3 or is null v3) and
-                                is_not_null_test(v1) and
-                                is_not_null_test(v2) and
-                                is_not_null_test(v3))
-      where is_not_null_test used to register nulls in case if we have
-      not found matching to return correct NULL value
-      TODO: say here explicitly if the order of AND parts matters or not.
-    */
+    /* TODO: say here explicitly if the order of AND parts matters or not. */
     Item *item_having_part2= 0;
     for (uint i= 0; i < cols_num; i++)
     {
       DBUG_ASSERT((left_expr->fixed &&
+
                   select_lex->ref_pointer_array[i]->fixed) ||
                   (select_lex->ref_pointer_array[i]->type() == REF_ITEM &&
                    ((Item_ref*)(select_lex->ref_pointer_array[i]))->ref_type() ==
                     Item_ref::OUTER_REF));
       if (select_lex->ref_pointer_array[i]->
           check_cols(left_expr->element_index(i)->cols()))
-        DBUG_RETURN(RES_ERROR);
+        DBUG_RETURN(true);
       Item *item_eq=
         new Item_func_eq(new
                          Item_ref(&select_lex->context,
@@ -1828,23 +2037,21 @@ Item_in_subselect::row_value_in_to_exists_transformer(JOIN * join)
                          Item_ref(&select_lex->context,
                                   select_lex->ref_pointer_array + i,
                                   (char *)"<no matter>",
-                                  (char *)"<list ref>")
-                        );
+                                  (char *)"<list ref>"));
       Item *item_isnull=
         new Item_func_isnull(new
                              Item_ref(&select_lex->context,
                                       select_lex->ref_pointer_array+i,
                                       (char *)"<no matter>",
-                                      (char *)"<list ref>")
-                            );
+                                      (char *)"<list ref>"));
       Item *col_item= new Item_cond_or(item_eq, item_isnull);
       if (!abort_on_null && left_expr->element_index(i)->maybe_null)
       {
         if (!(col_item= new Item_func_trig_cond(col_item, get_cond_guard(i))))
-          DBUG_RETURN(RES_ERROR);
+          DBUG_RETURN(true);
       }
-      having_item= and_items(having_item, col_item);
-      
+      *having_item= and_items(*having_item, col_item);
+
       Item *item_nnull_test= 
          new Item_is_not_null_test(this,
                                    new Item_ref(&select_lex->context,
@@ -1856,34 +2063,15 @@ Item_in_subselect::row_value_in_to_exists_transformer(JOIN * join)
       {
         if (!(item_nnull_test= 
               new Item_func_trig_cond(item_nnull_test, get_cond_guard(i))))
-          DBUG_RETURN(RES_ERROR);
+          DBUG_RETURN(true);
       }
       item_having_part2= and_items(item_having_part2, item_nnull_test);
       item_having_part2->top_level_item();
     }
-    having_item= and_items(having_item, item_having_part2);
-    having_item->top_level_item();
+    *having_item= and_items(*having_item, item_having_part2);
   }
   else
   {
-    /*
-      (l1, l2, l3) IN (SELECT v1, v2, v3 ... WHERE where) =>
-      EXISTS (SELECT ... WHERE where and
-                               (l1 = v1 or is null v1) and
-                               (l2 = v2 or is null v2) and
-                               (l3 = v3 or is null v3)
-                         HAVING is_not_null_test(v1) and
-                                is_not_null_test(v2) and
-                                is_not_null_test(v3))
-      where is_not_null_test register NULLs values but reject rows
-
-      in case when we do not need correct NULL, we have simplier construction:
-      EXISTS (SELECT ... WHERE where and
-                               (l1 = v1) and
-                               (l2 = v2) and
-                               (l3 = v3)
-    */
-    Item *where_item= 0;
     for (uint i= 0; i < cols_num; i++)
     {
       Item *item, *item_isnull;
@@ -1894,7 +2082,7 @@ Item_in_subselect::row_value_in_to_exists_transformer(JOIN * join)
                     Item_ref::OUTER_REF));
       if (select_lex->ref_pointer_array[i]->
           check_cols(left_expr->element_index(i)->cols()))
-        DBUG_RETURN(RES_ERROR);
+        DBUG_RETURN(true);
       item=
         new Item_func_eq(new
                          Item_direct_ref(&select_lex->context,
@@ -1907,8 +2095,7 @@ Item_in_subselect::row_value_in_to_exists_transformer(JOIN * join)
                                          select_lex->
                                          ref_pointer_array+i,
                                          (char *)"<no matter>",
-                                         (char *)"<list ref>")
-                        );
+                                         (char *)"<list ref>"));
       if (!abort_on_null)
       {
         Item *having_col_item=
@@ -1926,8 +2113,7 @@ Item_in_subselect::row_value_in_to_exists_transformer(JOIN * join)
                                            select_lex->
                                            ref_pointer_array+i,
                                            (char *)"<no matter>",
-                                           (char *)"<list ref>")
-                          );
+                                           (char *)"<list ref>"));
         item= new Item_cond_or(item, item_isnull);
         /* 
           TODO: why we create the above for cases where the right part
@@ -1936,104 +2122,211 @@ Item_in_subselect::row_value_in_to_exists_transformer(JOIN * join)
         if (left_expr->element_index(i)->maybe_null)
         {
           if (!(item= new Item_func_trig_cond(item, get_cond_guard(i))))
-            DBUG_RETURN(RES_ERROR);
+            DBUG_RETURN(true);
           if (!(having_col_item= 
                   new Item_func_trig_cond(having_col_item, get_cond_guard(i))))
-            DBUG_RETURN(RES_ERROR);
+            DBUG_RETURN(true);
         }
-        having_item= and_items(having_item, having_col_item);
+        *having_item= and_items(*having_item, having_col_item);
       }
-      where_item= and_items(where_item, item);
+      *where_item= and_items(*where_item, item);
     }
-    /*
-      AND can't be changed during fix_fields()
-      we can assign select_lex->where here, and pass 0 as last
-      argument (reference) to fix_fields()
-    */
-    select_lex->where= join->conds= and_items(join->conds, where_item);
-    select_lex->where->top_level_item();
-    if (join->conds->fix_fields(thd, 0))
-      DBUG_RETURN(RES_ERROR);
   }
-  if (having_item)
+
+  if (*where_item)
   {
-    bool res;
-    select_lex->having= join->having= and_items(join->having, having_item);
-    if (having_item == select_lex->having)
-      having_item->name= (char*)in_having_cond;
-    select_lex->having->top_level_item();
-    /*
-      AND can't be changed during fix_fields()
-      we can assign select_lex->having here, and pass 0 as last
-      argument (reference) to fix_fields()
-    */
-    select_lex->having_fix_field= 1;
-    res= join->having->fix_fields(thd, 0);
-    select_lex->having_fix_field= 0;
-    if (res)
-    {
-      DBUG_RETURN(RES_ERROR);
-    }
+    if (!(*where_item)->fixed && (*where_item)->fix_fields(thd, 0))
+      DBUG_RETURN(true);
+    (*where_item)->top_level_item();
+  }
+
+  if (*having_item)
+  {
+    if (!join_having)
+      (*having_item)->name= (char*) in_having_cond;
+    if (fix_having(*having_item, select_lex))
+      DBUG_RETURN(true);
+    (*having_item)->top_level_item();
   }
 
-  DBUG_RETURN(RES_OK);
+  DBUG_RETURN(false);
 }
 
 
-Item_subselect::trans_res
+bool
 Item_in_subselect::select_transformer(JOIN *join)
 {
-  return select_in_like_transformer(join, &eq_creator);
+  return select_in_like_transformer(join);
 }
 
 
 /**
-  Prepare IN/ALL/ANY/SOME subquery transformation and call appropriate
-  transformation function.
+  Create the predicates needed to transform an IN/ALL/ANY subselect into a
+  correlated EXISTS via predicate injection.
 
-    To decide which transformation procedure (scalar or row) applicable here
-    we have to call fix_fields() for left expression to be able to call
-    cols() method on it. Also this method make arena management for
-    underlying transformation methods.
+  @param join_arg  Join object of the subquery.
+
+  @retval FALSE  ok
+  @retval TRUE   error
+*/
+
+bool Item_in_subselect::create_in_to_exists_cond(JOIN *join_arg)
+{
+  bool res;
+
+  DBUG_ASSERT(engine->engine_type() == subselect_engine::SINGLE_SELECT_ENGINE ||
+              engine->engine_type() == subselect_engine::UNION_ENGINE);
+  /*
+    TODO: the call to init_cond_guards allocates and initializes an
+    array of booleans that may not be used later because we may choose
+    materialization.
+    The two calls below to create_XYZ_cond depend on this boolean array.
+    If the dependency is removed, the call can be moved to a later phase.
+  */
+  init_cond_guards();
+  if (left_expr->cols() == 1)
+    res= create_single_in_to_exists_cond(join_arg,
+                                         &(join_arg->in_to_exists_where),
+                                         &(join_arg->in_to_exists_having));
+  else
+    res= create_row_in_to_exists_cond(join_arg,
+                                      &(join_arg->in_to_exists_where),
+                                      &(join_arg->in_to_exists_having));
+
+  /*
+    The IN=>EXISTS transformation makes non-correlated subqueries correlated.
+  */
+  join_arg->select_lex->uncacheable|= UNCACHEABLE_DEPENDENT_INJECTED;
+  /*
+    The uncacheable property controls a number of actions, e.g. whether to
+    save/restore (via init_save_join_tab/restore_tmp) the original JOIN for
+    plans with a temp table where the original JOIN was overridden by
+    make_simple_join. The UNCACHEABLE_EXPLAIN is ignored by EXPLAIN, thus
+    non-correlated subqueries will not appear as such to EXPLAIN.
+  */
+  join_arg->select_lex->master_unit()->uncacheable|= UNCACHEABLE_EXPLAIN;
+  join_arg->select_lex->uncacheable|= UNCACHEABLE_EXPLAIN;
+  return (res);
+}
+
+
+/**
+  Transform an IN/ALL/ANY subselect into a correlated EXISTS via injecting
+  correlated in-to-exists predicates.
+
+  @param join_arg  Join object of the subquery.
+
+  @retval FALSE  ok
+  @retval TRUE   error
+*/
+
+bool Item_in_subselect::inject_in_to_exists_cond(JOIN *join_arg)
+{
+  SELECT_LEX *select_lex= join_arg->select_lex;
+  Item *where_item= join_arg->in_to_exists_where;
+  Item *having_item= join_arg->in_to_exists_having;
+
+  DBUG_ENTER("Item_in_subselect::inject_in_to_exists_cond");
+
+  if (where_item)
+  {
+    List<Item> *and_args= NULL;
+    /*
+      If the top-level Item of the WHERE clause is an AND, detach the multiple
+      equality list that was attached to the end of the AND argument list by
+      build_equal_items_for_cond(). The multiple equalities must be detached
+      because fix_fields merges lower level AND arguments into the upper AND.
+      As a result, the arguments from lower-level ANDs are concatenated after
+      the multiple equalities. When the multiple equality list is treated as
+      such, it turns out to contain non-Item_equal objects, which is wrong.
+    */
+    if (join_arg->conds && join_arg->conds->type() == Item::COND_ITEM &&
+        ((Item_cond*) join_arg->conds)->functype() == Item_func::COND_AND_FUNC)
+    {
+      and_args= ((Item_cond*) join_arg->conds)->argument_list();
+      if (join_arg->cond_equal)
+        and_args->disjoin((List<Item> *) &join_arg->cond_equal->current_level);
+    }
+
+    where_item= and_items(join_arg->conds, where_item);
+    if (!where_item->fixed && where_item->fix_fields(thd, 0))
+      DBUG_RETURN(true);
+    // TIMOUR TODO: call optimize_cond() for the new where clause
+    thd->change_item_tree(&select_lex->where, where_item);
+    select_lex->where->top_level_item();
+    join_arg->conds= select_lex->where;
+
+    /* Attach back the list of multiple equalities to the new top-level AND. */
+    if (and_args && join_arg->cond_equal)
+    {
+      /* The argument list of the top-level AND may change after fix fields. */
+      and_args= ((Item_cond*) join_arg->conds)->argument_list();
+      and_args->concat((List<Item> *) &join_arg->cond_equal->current_level);
+    }
+  }
+
+  if (having_item)
+  {
+    Item* join_having= join_arg->having ? join_arg->having:join_arg->tmp_having;
+    having_item= and_items(join_having, having_item);
+    if (fix_having(having_item, select_lex))
+      DBUG_RETURN(true);
+    // TIMOUR TODO: call optimize_cond() for the new having clause
+    thd->change_item_tree(&select_lex->having, having_item);
+    select_lex->having->top_level_item();
+    join_arg->having= select_lex->having;
+  }
+  join_arg->thd->change_item_tree(&unit->global_parameters->select_limit,
+                                  new Item_int((int32) 1));
+  unit->select_limit_cnt= 1;
+
+  DBUG_RETURN(false);
+}
+
+
+/**
+  Prepare IN/ALL/ANY/SOME subquery transformation and call the appropriate
+  transformation function.
 
   @param join    JOIN object of transforming subquery
-  @param func    creator of condition function of subquery
 
-  @retval
-    RES_OK      OK
-  @retval
-    RES_REDUCE  OK, and current subquery was reduced during
-    transformation
-  @retval
-    RES_ERROR   Error
+  @notes
+  To decide which transformation procedure (scalar or row) is applicable here
+  we have to call fix_fields() for the left expression to be able to call
+  the cols() method on it. This method also performs arena management for
+  the underlying transformation methods.
+
+  @retval  false  OK
+  @retval  true   Error
 */
 
-Item_subselect::trans_res
-Item_in_subselect::select_in_like_transformer(JOIN *join, Comp_creator *func)
+bool
+Item_in_subselect::select_in_like_transformer(JOIN *join)
 {
   Query_arena *arena, backup;
   SELECT_LEX *current= thd->lex->current_select;
   const char *save_where= thd->where;
-  Item_subselect::trans_res res= RES_ERROR;
+  bool trans_res= true;
   bool result;
 
   DBUG_ENTER("Item_in_subselect::select_in_like_transformer");
 
+  /*
+    IN/SOME/ALL/ANY subqueries don't support the LIMIT clause. Without it
+    the ORDER BY clause becomes meaningless, thus we drop it here.
+  */
+  for (SELECT_LEX *sl= current->master_unit()->first_select();
+       sl; sl= sl->next_select())
   {
-    /*
-      IN/SOME/ALL/ANY subqueries aren't support LIMIT clause. Without it
-      ORDER BY clause becomes meaningless thus we drop it here.
-    */
-    SELECT_LEX *sl= current->master_unit()->first_select();
-    for (; sl; sl= sl->next_select())
+    if (sl->join)
     {
-      if (sl->join)
-        sl->join->order= 0;
+      sl->join->order= 0;
+      sl->join->skip_sort_order= 1;
     }
   }
 
   if (changed)
-    DBUG_RETURN(RES_OK);
+    DBUG_RETURN(false);
 
   thd->where= "IN/ALL/ANY subquery";
 
@@ -2065,22 +2358,15 @@ Item_in_subselect::select_in_like_transformer(JOIN *join, Comp_creator *func)
     goto err;
 
   /*
-    If we didn't choose an execution method up to this point, we choose
-    the IN=>EXISTS transformation.
-  */
-  if (exec_method == NOT_TRANSFORMED)
-    exec_method= IN_TO_EXISTS;
-  arena= thd->activate_stmt_arena_if_needed(&backup);
-
-  /*
     Both transformers call fix_fields() only for Items created inside them,
     and all that items do not make permanent changes in current item arena
     which allow to us call them with changed arena (if we do not know nature
     of Item, we have to call fix_fields() for it only with original arena to
     avoid memory leack)
   */
+  arena= thd->activate_stmt_arena_if_needed(&backup);
   if (left_expr->cols() == 1)
-    res= single_value_transformer(join, func);
+    trans_res= single_value_transformer(join);
   else
   {
     /* we do not support row operation for ALL/ANY/SOME */
@@ -2089,21 +2375,21 @@ Item_in_subselect::select_in_like_transformer(JOIN *join, Comp_creator *func)
       if (arena)
         thd->restore_active_arena(arena, &backup);
       my_error(ER_OPERAND_COLUMNS, MYF(0), 1);
-      DBUG_RETURN(RES_ERROR);
+      DBUG_RETURN(true);
     }
-    res= row_value_transformer(join);
+    trans_res= row_value_transformer(join);
   }
   if (arena)
     thd->restore_active_arena(arena, &backup);
 err:
   thd->where= save_where;
-  DBUG_RETURN(res);
+  DBUG_RETURN(trans_res);
 }
 
 
 void Item_in_subselect::print(String *str, enum_query_type query_type)
 {
-  if (exec_method == IN_TO_EXISTS)
+  if (in_strategy & SUBS_IN_TO_EXISTS)
     str->append(STRING_WITH_LEN("<exists>"));
   else
   {
@@ -2119,7 +2405,7 @@ bool Item_in_subselect::fix_fields(THD *thd_arg, Item **ref)
   uint outer_cols_num;
   List<Item> *inner_cols;
 
-  if (exec_method == SEMI_JOIN)
+  if (in_strategy & SUBS_SEMI_JOIN)
     return !( (*ref)= new Item_int(1));
 
   /*
@@ -2175,7 +2461,6 @@ bool Item_in_subselect::fix_fields(THD *thd_arg, Item **ref)
     return TRUE;
 
   fixed= TRUE;
-
   return FALSE;
 }
 
@@ -2193,99 +2478,48 @@ void Item_in_subselect::update_used_tables()
   used_tables_cache |= left_expr->used_tables();
 }
 
+
 /**
-  Try to create an engine to compute the subselect via materialization,
-  and if this fails, revert to execution via the IN=>EXISTS transformation.
+  Try to create and initialize an engine to compute a subselect via
+  materialization.
 
   @details
-    The purpose of this method is to hide the implementation details
-    of this Item's execution. The method creates a new engine for
-    materialized execution, and initializes the engine.
-
-    If this initialization fails
-    - either because it wasn't possible to create the needed temporary table
-      and its index,
-    - or because of a memory allocation error,
-    then we revert back to execution via the IN=>EXISTS tranformation.
-
-    The initialization of the new engine is divided in two parts - a permanent
-    one that lives across prepared statements, and one that is repeated for each
-    execution.
+  The method creates a new engine for materialized execution, and initializes
+  the engine. The initialization may fail
+  - either because it wasn't possible to create the needed temporary table
+    and its index,
+  - or because of a memory allocation error.
 
   @returns
     @retval TRUE  memory allocation error occurred
     @retval FALSE an execution method was chosen successfully
 */
 
-bool Item_in_subselect::setup_engine()
+bool Item_in_subselect::setup_mat_engine()
 {
-  subselect_hash_sj_engine *new_engine= NULL;
-  bool res= FALSE;
-
-  DBUG_ENTER("Item_in_subselect::setup_engine");
+  subselect_hash_sj_engine       *mat_engine= NULL;
+  subselect_single_select_engine *select_engine;
 
-  if (engine->engine_type() == subselect_engine::SINGLE_SELECT_ENGINE)
-  {
-    /* Create/initialize objects in permanent memory. */
-    subselect_single_select_engine *old_engine;
-    Query_arena *arena= thd->stmt_arena, backup;
-
-    old_engine= (subselect_single_select_engine*) engine;
-
-    if (arena->is_conventional())
-      arena= 0;
-    else
-      thd->set_n_backup_active_arena(arena, &backup);
+  DBUG_ENTER("Item_in_subselect::setup_mat_engine");
 
-    if (!(new_engine= new subselect_hash_sj_engine(thd, this,
-                                                   old_engine)) ||
-        new_engine->init_permanent(unit->get_unit_column_types()))
-    {
-      Item_subselect::trans_res trans_res;
-      /*
-        If for some reason we cannot use materialization for this IN predicate,
-        delete all materialization-related objects, and apply the IN=>EXISTS
-        transformation.
-      */
-      delete new_engine;
-      new_engine= NULL;
-      exec_method= NOT_TRANSFORMED;
-      if (left_expr->cols() == 1)
-        trans_res= single_value_in_to_exists_transformer(old_engine->join,
-                                                         &eq_creator);
-      else
-        trans_res= row_value_in_to_exists_transformer(old_engine->join);
-      res= (trans_res != Item_subselect::RES_OK);
-    }
-    if (new_engine)
-      engine= new_engine;
+  /*
+    The select_engine (that executes transformed IN=>EXISTS subselects) is
+    pre-created at parse time, and is stored in statement memory (preserved
+    across PS executions).
+  */
+  DBUG_ASSERT(engine->engine_type() == subselect_engine::SINGLE_SELECT_ENGINE);
+  select_engine= (subselect_single_select_engine*) engine;
 
-    if (arena)
-      thd->restore_active_arena(arena, &backup);
-  }
-  else
-  {
-    DBUG_ASSERT(engine->engine_type() == subselect_engine::HASH_SJ_ENGINE);
-    new_engine= (subselect_hash_sj_engine*) engine;
-  }
+  /* Create/initialize execution objects. */
+  if (!(mat_engine= new subselect_hash_sj_engine(thd, this, select_engine)))
+    DBUG_RETURN(TRUE);
 
-  /* Initilizations done in runtime memory, repeated for each execution. */
-  if (new_engine)
-  {
-    /*
-      Reset the LIMIT 1 set in Item_exists_subselect::fix_length_and_dec.
-      TODO:
-      Currently we set the subquery LIMIT to infinity, and this is correct
-      because we forbid at parse time LIMIT inside IN subqueries (see
-      Item_in_subselect::test_limit). However, once we allow this, here
-      we should set the correct limit if given in the query.
-    */
-    unit->global_parameters->select_limit= NULL;
-    if ((res= new_engine->init_runtime()))
-      DBUG_RETURN(res);
-  }
+  if (mat_engine->init(&select_engine->join->fields_list,
+                       engine->get_identifier()))
+    DBUG_RETURN(TRUE);
 
-  DBUG_RETURN(res);
+  engine= mat_engine;
+  DBUG_RETURN(FALSE);
 }
 
 
@@ -2310,7 +2544,7 @@ bool Item_in_subselect::init_left_expr_cache()
     An IN predicate might be evaluated in a query for which all tables have
     been optimzied away.
   */ 
-  if (!outer_join || !outer_join->tables || !outer_join->tables_list)
+  if (!outer_join || !outer_join->table_count || !outer_join->tables_list)
     return TRUE;
 
   if (!(left_expr_cache= new List<Cached_item>))
@@ -2328,39 +2562,36 @@ bool Item_in_subselect::init_left_expr_cache()
 }
 
 
-/*
-  Callback to test if an IN predicate is expensive.
-
-  @details
-    IN predicates are considered expensive only if they will be executed via
-    materialization. The return value affects the behavior of
-    make_cond_for_table() in such a way that it is unchanged when we use
-    the IN=>EXISTS transformation to compute IN.
-
-  @retval TRUE  if the predicate is expensive
-  @retval FALSE otherwise
-*/
-
-bool Item_in_subselect::is_expensive_processor(uchar *arg)
+bool Item_in_subselect::init_cond_guards()
 {
-  return exec_method == MATERIALIZATION;
+  uint cols_num= left_expr->cols();
+  if (!abort_on_null && left_expr->maybe_null && !pushed_cond_guards)
+  {
+    if (!(pushed_cond_guards= (bool*)thd->alloc(sizeof(bool) * cols_num)))
+        return TRUE;
+    for (uint i= 0; i < cols_num; i++)
+      pushed_cond_guards[i]= TRUE;
+  }
+  return FALSE;
 }
 
 
-Item_subselect::trans_res
+bool
 Item_allany_subselect::select_transformer(JOIN *join)
 {
   DBUG_ENTER("Item_allany_subselect::select_transformer");
-  exec_method= IN_TO_EXISTS;
+  DBUG_ASSERT((in_strategy & ~(SUBS_MAXMIN_INJECTED | SUBS_MAXMIN_ENGINE |
+                               SUBS_IN_TO_EXISTS)) == 0);
+  in_strategy|= SUBS_IN_TO_EXISTS;
   if (upper_item)
     upper_item->show= 1;
-  DBUG_RETURN(select_in_like_transformer(join, func));
+  DBUG_RETURN(select_in_like_transformer(join));
 }
 
 
 void Item_allany_subselect::print(String *str, enum_query_type query_type)
 {
-  if (exec_method == IN_TO_EXISTS)
+  if (in_strategy & SUBS_IN_TO_EXISTS)
     str->append(STRING_WITH_LEN("<exists>"));
   else
   {
@@ -2382,10 +2613,10 @@ void subselect_engine::set_thd(THD *thd_arg)
 
 
 subselect_single_select_engine::
-subselect_single_select_engine(st_select_lex *select,
+subselect_single_select_engine(THD *thd_arg, st_select_lex *select,
 			       select_result_interceptor *result_arg,
 			       Item_subselect *item_arg)
-  :subselect_engine(item_arg, result_arg),
+  :subselect_engine(thd_arg, item_arg, result_arg),
    prepared(0), executed(0), select_lex(select), join(0)
 {
   select_lex->master_unit()->item= item_arg;
@@ -2402,6 +2633,7 @@ void subselect_single_select_engine::cleanup()
   prepared= executed= 0;
   join= 0;
   result->cleanup();
+  select_lex->uncacheable&= ~UNCACHEABLE_DEPENDENT_INJECTED;
   DBUG_VOID_RETURN;
 }
 
@@ -2411,6 +2643,9 @@ void subselect_union_engine::cleanup()
   DBUG_ENTER("subselect_union_engine::cleanup");
   unit->reinit_exec_mechanism();
   result->cleanup();
+  unit->uncacheable&= ~UNCACHEABLE_DEPENDENT_INJECTED;
+  for (SELECT_LEX *sl= unit->first_select(); sl; sl= sl->next_select())
+    sl->uncacheable&= ~UNCACHEABLE_DEPENDENT_INJECTED;
   DBUG_VOID_RETURN;
 }
 
@@ -2453,10 +2688,10 @@ void subselect_uniquesubquery_engine::cleanup()
 }
 
 
-subselect_union_engine::subselect_union_engine(st_select_lex_unit *u,
+subselect_union_engine::subselect_union_engine(THD *thd_arg, st_select_lex_unit *u,
 					       select_result_interceptor *result_arg,
 					       Item_subselect *item_arg)
-  :subselect_engine(item_arg, result_arg)
+  :subselect_engine(thd_arg, item_arg, result_arg)
 {
   unit= u;
   unit->item= item_arg;
@@ -2691,9 +2926,9 @@ int subselect_single_select_engine::exec()
         pushed down into the subquery. Those optimizations are ref[_or_null]
         acceses. Change them to be full table scans.
       */
-      for (uint i=join->const_tables ; i < join->tables ; i++)
+      for (JOIN_TAB *tab= first_linear_tab(join, WITHOUT_CONST_TABLES); tab;
+           tab= next_linear_tab(join, tab, WITH_BUSH_ROOTS))
       {
-        JOIN_TAB *tab=join->join_tab+i;
         if (tab && tab->keyuse)
         {
           for (uint i= 0; i < tab->ref.key_parts; i++)
@@ -3016,6 +3251,9 @@ int subselect_uniquesubquery_engine::exec()
     DBUG_RETURN(0);
   }
 
+  if (!tab->preread_init_done && tab->preread_init())
+    DBUG_RETURN(1);
+
   if (null_keypart)
     DBUG_RETURN(scan_table());
  
@@ -3148,7 +3386,7 @@ subselect_uniquesubquery_engine::~subselect_uniquesubquery_engine()
 
 int subselect_indexsubquery_engine::exec()
 {
-  DBUG_ENTER("subselect_indexsubquery_engine::exec");
+  DBUG_ENTER("subselect_indexsubquery_engine");
   int error;
   bool null_finding= 0;
   TABLE *table= tab->table;
@@ -3179,6 +3417,9 @@ int subselect_indexsubquery_engine::exec()
     DBUG_RETURN(0);
   }
 
+  if (!tab->preread_init_done && tab->preread_init())
+    DBUG_RETURN(1);
+
   if (null_keypart)
     DBUG_RETURN(scan_table());
 
@@ -3280,10 +3521,13 @@ void subselect_uniquesubquery_engine::exclude()
 }
 
 
-table_map subselect_engine::calc_const_tables(TABLE_LIST *table)
+table_map subselect_engine::calc_const_tables(List<TABLE_LIST> &list)
 {
   table_map map= 0;
-  for (; table; table= table->next_leaf)
+  List_iterator<TABLE_LIST> ti(list);
+  TABLE_LIST *table;
+  //for (; table; table= table->next_leaf)
+  while ((table= ti++))
   {
     TABLE *tbl= table->table;
     if (tbl && tbl->const_table)
@@ -3400,6 +3644,7 @@ void subselect_indexsubquery_engine::print(String *str,
 
   @param si		new subselect Item
   @param res		new select_result object
+  @param temp           temporary assignment
 
   @retval
     FALSE OK
@@ -3407,12 +3652,32 @@ void subselect_indexsubquery_engine::print(String *str,
     TRUE  error
 */
 
-bool subselect_single_select_engine::change_result(Item_subselect *si,
-                                                 select_result_interceptor *res)
+bool
+subselect_single_select_engine::change_result(Item_subselect *si,
+                                              select_result_interceptor *res,
+                                              bool temp)
 {
   item= si;
-  result= res;
-  return select_lex->join->change_result(result);
+  if (temp)
+  {
+    /*
+      Here we reuse change_item_tree to roll back the assignment. It does
+      nothing specific to Item* pointers, so the conversion is safe. We do
+      not change the interface, to stay compatible with MySQL.
+    */
+    thd->change_item_tree((Item**) &result, (Item*)res);
+  }
+  else
+    result= res;
+
+  /*
+    We can't use 'result' below as gcc 4.2.4's alias optimization
+    assumes that result was not changed by thd->change_item_tree().
+    I tried to find a solution to make gcc happy, but could not find anything
+    that would not require a lot of extra code that would be harder to manage
+    than the current code.
+  */
+  return select_lex->join->change_result(res);
 }
 
 
@@ -3429,11 +3694,15 @@ bool subselect_single_select_engine::change_result(Item_subselect *si,
 */
 
 bool subselect_union_engine::change_result(Item_subselect *si,
-                                           select_result_interceptor *res)
+                                           select_result_interceptor *res,
+                                           bool temp)
 {
   item= si;
   int rc= unit->change_result(res, result);
-  result= res;
+  if (temp)
+    thd->change_item_tree((Item**) &result, (Item*)res);
+  else
+    result= res;
   return rc;
 }
 
@@ -3450,8 +3719,11 @@ bool subselect_union_engine::change_result(Item_subselect *si,
     TRUE  error
 */
 
-bool subselect_uniquesubquery_engine::change_result(Item_subselect *si,
-                                                    select_result_interceptor *res)
+bool
+subselect_uniquesubquery_engine::change_result(Item_subselect *si,
+                                               select_result_interceptor *res,
+                                               bool temp
+                                               __attribute__((unused)))
 {
   DBUG_ASSERT(0);
   return TRUE;
@@ -3631,8 +3903,7 @@ subselect_hash_sj_engine::get_strategy_using_data()
       bitmap_set_bit(&non_null_key_parts, i);
       --count_partial_match_columns;
     }
-    if (result_sink->get_null_count_of_col(i) ==
-               tmp_table->file->stats.records)
+    if (result_sink->get_null_count_of_col(i) == tmp_table->file->stats.records)
       ++count_null_only_columns;
   }
 
@@ -3648,7 +3919,7 @@ subselect_hash_sj_engine::choose_partial_match_strategy(
   bool has_non_null_key, bool has_covering_null_row,
   MY_BITMAP *partial_match_key_parts)
 {
-  size_t pm_buff_size;
+  ulonglong pm_buff_size;
 
   DBUG_ASSERT(strategy == PARTIAL_MATCH);
   /*
@@ -3713,11 +3984,12 @@ subselect_hash_sj_engine::choose_partial_match_strategy(
   matching via merging is not applicable.
 */
 
-size_t subselect_hash_sj_engine::rowid_merge_buff_size(
+ulonglong subselect_hash_sj_engine::rowid_merge_buff_size(
   bool has_non_null_key, bool has_covering_null_row,
   MY_BITMAP *partial_match_key_parts)
 {
-  size_t buff_size; /* Total size of all buffers used by partial matching. */
+  /* Total size of all buffers used by partial matching. */
+  ulonglong buff_size;
   ha_rows row_count= tmp_table->file->stats.records;
   uint rowid_length= tmp_table->file->ref_length;
   select_materialize_with_stats *result_sink=
@@ -3777,6 +4049,8 @@ bitmap_init_memroot(MY_BITMAP *map, uint n_bits, MEM_ROOT *mem_root)
   reexecution.
 
   @param tmp_columns the items that produce the data for the temp table
+  @param subquery_id subquery's identifier (to make "<subquery%d>" name for
+                                            EXPLAIN)
 
   @details
   - Create a temporary table to store the result of the IN subquery. The
@@ -3792,13 +4066,14 @@ bitmap_init_memroot(MY_BITMAP *map, uint n_bits, MEM_ROOT *mem_root)
   @retval FALSE otherwise
 */
 
-bool subselect_hash_sj_engine::init_permanent(List<Item> *tmp_columns)
+bool subselect_hash_sj_engine::init(List<Item> *tmp_columns, uint subquery_id)
 {
+  select_union *result_sink;
   /* Options to create_tmp_table. */
   ulonglong tmp_create_options= thd->variables.option_bits | TMP_TABLE_ALL_COLUMNS;
                              /* | TMP_TABLE_FORCE_MYISAM; TIMOUR: force MYISAM */
 
-  DBUG_ENTER("subselect_hash_sj_engine::init_permanent");
+  DBUG_ENTER("subselect_hash_sj_engine::init");
 
   if (bitmap_init_memroot(&non_null_key_parts, tmp_columns->elements,
                             thd->mem_root) ||
@@ -3827,15 +4102,24 @@ bool subselect_hash_sj_engine::init_permanent(List<Item> *tmp_columns)
     DBUG_RETURN(TRUE);
   }
 */
-  if (!(result= new select_materialize_with_stats))
+  if (!(result_sink= new select_materialize_with_stats))
+    DBUG_RETURN(TRUE);
+    
+  char buf[32];
+  uint len= my_snprintf(buf, sizeof(buf), "<subquery%d>", subquery_id);
+  char *name;
+  if (!(name= (char*)thd->alloc(len + 1)))
     DBUG_RETURN(TRUE);
+  memcpy(name, buf, len+1);
 
-  if (((select_union*) result)->create_result_table(
-                         thd, tmp_columns, TRUE, tmp_create_options,
-                         "materialized subselect", TRUE))
+  result_sink->get_tmp_table_param()->materialized_subquery= true;
+  if (result_sink->create_result_table(thd, tmp_columns, TRUE,
+                                       tmp_create_options,
+				       name, TRUE, TRUE))
     DBUG_RETURN(TRUE);
 
-  tmp_table= ((select_union*) result)->table;
+  tmp_table= result_sink->table;
+  result= result_sink;
 
   /*
     If the subquery has blobs, or the total key lenght is bigger than
@@ -3872,6 +4156,17 @@ bool subselect_hash_sj_engine::init_permanent(List<Item> *tmp_columns)
       !(lookup_engine= make_unique_engine()))
     DBUG_RETURN(TRUE);
 
+  /*
+    Repeat name resolution for 'cond' since cond is not part of any
+    clause of the query, and it is not 'fixed' during JOIN::prepare.
+  */
+  if (semi_join_conds && !semi_join_conds->fixed &&
+      semi_join_conds->fix_fields(thd, (Item**)&semi_join_conds))
+    DBUG_RETURN(TRUE);
+  /* Let our engine reuse this query plan for materialization. */
+  materialize_join= materialize_engine->join;
+  materialize_join->change_result(result);
+
   DBUG_RETURN(FALSE);
 }
 
@@ -3914,7 +4209,8 @@ bool subselect_hash_sj_engine::make_semi_join_conds()
     DBUG_RETURN(TRUE);
 
   tmp_table_ref->init_one_table(STRING_WITH_LEN(""),
-                                STRING_WITH_LEN("materialized subselect"),
+                                tmp_table->alias.c_ptr(),
+                                tmp_table->alias.length(),
                                 NULL, TL_READ);
   tmp_table_ref->table= tmp_table;
 
@@ -3922,6 +4218,7 @@ bool subselect_hash_sj_engine::make_semi_join_conds()
   context->init();
   context->first_name_resolution_table=
     context->last_name_resolution_table= tmp_table_ref;
+  semi_join_conds_context= context;
   
   for (uint i= 0; i < item_in->left_expr->cols(); i++)
   {
@@ -3979,6 +4276,7 @@ subselect_hash_sj_engine::make_unique_engine()
     DBUG_RETURN(NULL);
 
   tab->table= tmp_table;
+  tab->preread_init_done= FALSE;
   tab->ref.tmp_table_index_lookup_init(thd, tmp_key, it, FALSE);
 
   DBUG_RETURN(new subselect_uniquesubquery_engine(thd, tab, item,
@@ -3986,41 +4284,22 @@ subselect_hash_sj_engine::make_unique_engine()
 }
 
 
-/**
-  Initialize members of the engine that need to be re-initilized at each
-  execution.
+subselect_hash_sj_engine::~subselect_hash_sj_engine()
+{
+  delete lookup_engine;
+  delete result;
+  if (tmp_table) /* cleanup() frees the table and resets this to NULL */
+    free_tmp_table(thd, tmp_table);
+}
 
-  @retval TRUE  if a memory allocation error occurred
-  @retval FALSE if success
-*/
 
-bool subselect_hash_sj_engine::init_runtime()
+int subselect_hash_sj_engine::prepare()
 {
   /*
     Create and optimize the JOIN that will be used to materialize
     the subquery if not yet created.
   */
-  materialize_engine->prepare();
-  /*
-    Repeat name resolution for 'cond' since cond is not part of any
-    clause of the query, and it is not 'fixed' during JOIN::prepare.
-  */
-  if (semi_join_conds && !semi_join_conds->fixed &&
-      semi_join_conds->fix_fields(thd, (Item**)&semi_join_conds))
-    return TRUE;
-  /* Let our engine reuse this query plan for materialization. */
-  materialize_join= materialize_engine->join;
-  materialize_join->change_result(result);
-  return FALSE;
-}
-
-
-subselect_hash_sj_engine::~subselect_hash_sj_engine()
-{
-  delete lookup_engine;
-  delete result;
-  if (tmp_table)
-    free_tmp_table(thd, tmp_table);
+  return materialize_engine->prepare();
 }
 
 
@@ -4041,6 +4320,12 @@ void subselect_hash_sj_engine::cleanup()
   count_null_only_columns= 0;
   strategy= UNDEFINED;
   materialize_engine->cleanup();
+  /*
+    Restore the original Item_in_subselect engine. This engine is created once
+    at parse time and stored across executions, while all other materialization
+    related engines are created and chosen for each execution.
+  */
+  ((Item_in_subselect *) item)->engine= materialize_engine;
   if (lookup_engine_type == TABLE_SCAN_ENGINE ||
       lookup_engine_type == ROWID_MERGE_ENGINE)
   {
@@ -4057,6 +4342,206 @@ void subselect_hash_sj_engine::cleanup()
   DBUG_ASSERT(lookup_engine->engine_type() == UNIQUESUBQUERY_ENGINE);
   lookup_engine->cleanup();
   result->cleanup(); /* Resets the temp table as well. */
+  DBUG_ASSERT(tmp_table);
+  free_tmp_table(thd, tmp_table);
+  tmp_table= NULL;
+}
+
+
+/*
+  Multiply the records_read estimates of the tables in 'tset' and of all
+  tables they depend on (directly or transitively) via ref.depend_map.
+
+  @return the resulting fanout; 0.0 if the join has no tables
+*/
+
+double get_fanout_with_deps(JOIN *join, table_map tset)
+{
+  /* "Impossible WHERE": there are no tables, hence no fanout. */
+  if (!join->table_count)
+    return 0.0;
+
+  /* Step 1: collect the transitive closure of the dependencies. */
+  table_map closure= 0;
+  table_map frontier= tset;
+  table_map discovered;
+  do
+  {
+    discovered= 0;
+    Table_map_iterator frontier_it(frontier);
+    for (int idx= frontier_it.next_bit();
+         idx != Table_map_iterator::BITMAP_END; idx= frontier_it.next_bit())
+    {
+      /* Dependencies of table idx that are not in the closure yet. */
+      discovered|= join->map2table[idx]->ref.depend_map & ~closure;
+    }
+    closure|= frontier;
+    frontier= discovered;
+  } while (discovered);
+
+  /* Step 2: walk the join order and multiply the row estimates. */
+  double result= 1;
+  for (JOIN_TAB *jt= first_top_level_tab(join, WITHOUT_CONST_TABLES); jt;
+       jt= next_top_level_tab(join, jt))
+  {
+    /* Skip tables outside the closure, with emb_sj_nest set, or 0 rows. */
+    if (!(jt->table->map & closure) || jt->emb_sj_nest || !jt->records_read)
+      continue;
+    result*= rows2double(jt->records_read);
+  }
+  return result;
+}
+
+
+#if 0
+/* Disabled draft: gather index usage of the GROUP BY fields (unfinished). */
+void check_out_index_stats(JOIN *join)
+{
+  ORDER *order;
+
+  /*
+    First, collect the keys that we can use in each table.
+    For each GROUP BY field we record the keys that start with it
+    (key_start_use) and the keys that contain it later (key_infix_use).
+  */
+  key_map key_start_use[MAX_TABLES];
+  key_map key_infix_use[MAX_TABLES];
+  table_map key_used=0;
+  table_map non_key_used= 0;
+  
+  bzero(&key_start_use, sizeof(key_start_use)); //psergey-todo: safe initialization!
+  bzero(&key_infix_use, sizeof(key_infix_use));
+  
+  for (order= join->group_list; order; order= order->next)
+  {
+    Item *item= order->item[0];
+
+    if (item->real_type() == Item::FIELD_ITEM)
+    {
+      if (item->used_tables() & OUTER_REF_TABLE_BIT)
+        continue; /* outside references are like constants for us */
+
+      Field *field= ((Item_field*)item->real_item())->field;
+      uint table_no= field->table->tablenr;
+      if (!(non_key_used & (table_map(1) << table_no)) && 
+          !field->part_of_key.is_clear_all())
+      {
+        key_map infix_map= field->part_of_key;
+        infix_map.subtract(field->key_start);
+        key_start_use[table_no].merge(field->key_start);
+        key_infix_use[table_no].merge(infix_map);
+        key_used |= table_map(1) << table_no; /* set the table's bit */
+      }
+      continue;
+    }
+    /* 
+      Note: the below will cause clauses like GROUP BY YEAR(date) not to be
+      handled. 
+    */
+    non_key_used |= item->used_tables();
+  }
+  
+  Table_map_iterator tm_it(key_used & ~non_key_used);
+  int tableno;
+  while ((tableno = tm_it.next_bit()) != Table_map_iterator::BITMAP_END)
+  {
+    key_map::iterator key_it(key_start_use[tableno]);
+    int keyno;
+    while ((keyno = key_it.next_bit()) != key_map::iterator::BITMAP_END)
+    {
+      for (order= join->group_list; order; order= order->next)
+      {
+        Item *item= order->item[0];
+        if (item->used_tables() & (table_map(1) << tableno))
+        {
+          DBUG_ASSERT(item->real_type() == Item::FIELD_ITEM);
+        }
+      }
+      /*
+      if (continuation)
+      {
+        walk through list and find which key parts are occupied;
+        // note that the above can't be made any faster.
+      }
+      else
+        use rec_per_key[0];
+      
+      find out the cardinality.
+      check if cardinality decreases if we use it;
+      */
+    }
+  }
+}
+#endif
+
+
+/*
+  Get an estimate of how many records will be produced after the GROUP BY
+  operation.
+
+  @param join           Join we're operating on 
+  @param join_op_rows   How many records will be produced by the join
+                        operations (this is what join optimizer produces)
+  
+  @seealso
+     See also optimize_semijoin_nests(), grep for "Adjust output cardinality 
+     estimates".  Very similar code there that is not joined with this one
+     because we operate on different data structs and too much effort is
+     needed to abstract them out.
+
+  @return
+     Number of records we expect to get after the GROUP BY operation
+*/
+
+double get_post_group_estimate(JOIN* join, double join_op_rows)
+{
+  table_map tables_in_group_list= table_map(0);
+
+  /* Find out which tables are used in GROUP BY list */
+  for (ORDER *order= join->group_list; order; order= order->next)
+  {
+    Item *item= order->item[0];
+    if (item->used_tables() & RAND_TABLE_BIT)
+    {
+      /* Each join output record will be in its own group */
+      return join_op_rows;
+    }
+    tables_in_group_list|= item->used_tables();
+  }
+  tables_in_group_list &= ~PSEUDO_TABLE_BITS;
+
+  /*
+    Use join fanouts to calculate the max. number of records in the group-list
+  */
+  double out_rows;
+  
+  out_rows= get_fanout_with_deps(join, tables_in_group_list);
+
+#if 0
+  /* The following will be needed when making use of index stats: */
+  /* 
+    Also generate max. number of records for each of the tables mentioned 
+    in the group-list. We'll use that as a baseline number that we'll try to 
+    reduce by using
+     - #table-records 
+     - index statistics.
+  */
+  double fanout_rows[MAX_TABLES]; /* indexed by table number */
+  bzero(&fanout_rows, sizeof(fanout_rows));
+  Table_map_iterator tm_it(tables_in_group_list);
+  int tableno;
+  while ((tableno = tm_it.next_bit()) != Table_map_iterator::BITMAP_END)
+  {
+    fanout_rows[tableno]= get_fanout_with_deps(join, table_map(1) << tableno);
+  }
+  
+  /*
+    Try to bring down estimates using index statistics.
+  */
+  //check_out_index_stats(join);
+#endif
+
+  return out_rows;
 }
 
 
@@ -4085,9 +4570,8 @@ int subselect_hash_sj_engine::exec()
     the subquery predicate.
   */
   thd->lex->current_select= materialize_engine->select_lex;
-  if ((res= materialize_join->optimize()))
-    goto err; /* purecov: inspected */
-  DBUG_ASSERT(!is_materialized); /* We should materialize only once. */
+  /* The subquery should be optimized, and materialized only once. */
+  DBUG_ASSERT(materialize_join->optimized && !is_materialized);
   materialize_join->exec();
   if ((res= test(materialize_join->error || thd->is_fatal_error)))
     goto err;
@@ -4111,11 +4595,10 @@ int subselect_hash_sj_engine::exec()
   tmp_table->file->info(HA_STATUS_VARIABLE);
   if (!tmp_table->file->stats.records)
   {
-    item_in->value= FALSE;
     /* The value of IN will not change during this execution. */
-    item_in->is_constant= TRUE;
+    item_in->reset();
+    item_in->make_const();
     item_in->set_first_execution();
-    /* TIMOUR: check if we need this: item_in->null_value= FALSE; */
     DBUG_RETURN(FALSE);
   }
 
@@ -4131,39 +4614,61 @@ int subselect_hash_sj_engine::exec()
   if (strategy == PARTIAL_MATCH)
   {
     uint count_pm_keys; /* Total number of keys needed for partial matching. */
-    MY_BITMAP *nn_key_parts; /* The key parts of the only non-NULL index. */
-    uint covering_null_row_width;
+    MY_BITMAP *nn_key_parts= NULL; /* Key parts of the only non-NULL index. */
+    uint count_non_null_columns= 0; /* Number of columns in nn_key_parts. */
+    bool has_covering_null_row;
+    bool has_covering_null_columns;
     select_materialize_with_stats *result_sink=
       (select_materialize_with_stats *) result;
+    uint field_count= tmp_table->s->fields;
 
-    nn_key_parts= (count_partial_match_columns < tmp_table->s->fields) ?
-                  &non_null_key_parts : NULL;
+    if (count_partial_match_columns < field_count)
+    {
+      nn_key_parts= &non_null_key_parts;
+      count_non_null_columns= bitmap_bits_set(nn_key_parts);
+    }
+    has_covering_null_row= (result_sink->get_max_nulls_in_row() == field_count);
+    has_covering_null_columns= (count_non_null_columns +
+                                count_null_only_columns == field_count);
 
-    if (result_sink->get_max_nulls_in_row() ==
-        tmp_table->s->fields -
-        (nn_key_parts ? bitmap_bits_set(nn_key_parts) : 0))
-      covering_null_row_width= result_sink->get_max_nulls_in_row();
-    else
-      covering_null_row_width= 0;
+    if (has_covering_null_row && has_covering_null_columns)
+    {
+      /*
+        The whole table consists of only NULL values. The result of IN is
+        a constant UNKNOWN.
+      */
+      DBUG_ASSERT(tmp_table->file->stats.records == 1);
+      item_in->value= 0;
+      item_in->null_value= 1;
+      item_in->make_const();
+      item_in->set_first_execution();
+      DBUG_RETURN(FALSE);
+    }
 
-    if (covering_null_row_width)
-      count_pm_keys= nn_key_parts ? 1 : 0;
+    if (has_covering_null_row)
+    {
+      DBUG_ASSERT(count_partial_match_columns == field_count);
+      count_pm_keys= 0; /* A covering NULL row needs no merge keys. */
+    }
+    else if (has_covering_null_columns)
+      count_pm_keys= 1;
     else
       count_pm_keys= count_partial_match_columns - count_null_only_columns +
-        (nn_key_parts ? 1 : 0);
+                     (nn_key_parts ? 1 : 0);
 
     choose_partial_match_strategy(test(nn_key_parts),
-                                  test(covering_null_row_width),
+                                  has_covering_null_row,
                                   &partial_match_key_parts);
     DBUG_ASSERT(strategy == PARTIAL_MATCH_MERGE ||
                 strategy == PARTIAL_MATCH_SCAN);
     if (strategy == PARTIAL_MATCH_MERGE)
     {
       pm_engine=
-        new subselect_rowid_merge_engine((subselect_uniquesubquery_engine*)
+        new subselect_rowid_merge_engine(thd, (subselect_uniquesubquery_engine*)
                                          lookup_engine, tmp_table,
                                          count_pm_keys,
-                                         covering_null_row_width,
+                                         has_covering_null_row,
+                                         has_covering_null_columns,
                                          item, result,
                                          semi_join_conds->argument_list());
       if (!pm_engine ||
@@ -4184,11 +4689,12 @@ int subselect_hash_sj_engine::exec()
     if (strategy == PARTIAL_MATCH_SCAN)
     {
       if (!(pm_engine=
-            new subselect_table_scan_engine((subselect_uniquesubquery_engine*)
+            new subselect_table_scan_engine(thd, (subselect_uniquesubquery_engine*)
                                             lookup_engine, tmp_table,
                                             item, result,
                                             semi_join_conds->argument_list(),
-                                            covering_null_row_width)))
+                                            has_covering_null_row,
+                                            has_covering_null_columns)))
       {
         /* This is an irrecoverable error. */
         res= 1;
@@ -4242,7 +4748,8 @@ bool subselect_hash_sj_engine::no_tables()
 }
 
 bool subselect_hash_sj_engine::change_result(Item_subselect *si,
-                                             select_result_interceptor *res)
+                                             select_result_interceptor *res,
+                                             bool temp __attribute__((unused)))
 {
   DBUG_ASSERT(FALSE);
   return TRUE;
@@ -4375,8 +4882,8 @@ bool Ordered_key::alloc_keys_buffers()
 {
   DBUG_ASSERT(key_buff_elements > 0);
 
-  if (!(key_buff= (rownum_t*) my_malloc(key_buff_elements * sizeof(rownum_t),
-                                        MYF(MY_WME))))
+  if (!(key_buff= (rownum_t*) my_malloc((size_t)(key_buff_elements * 
+    sizeof(rownum_t)), MYF(MY_WME))))
     return TRUE;
 
   /*
@@ -4385,7 +4892,7 @@ bool Ordered_key::alloc_keys_buffers()
     lookup offset.
   */
   /* Notice that max_null_row is max array index, we need count, so +1. */
-  if (bitmap_init(&null_key, NULL, max_null_row + 1, FALSE))
+  if (bitmap_init(&null_key, NULL, (uint)(max_null_row + 1), FALSE))
     return TRUE;
 
   cur_key_idx= HA_POS_ERROR;
@@ -4449,7 +4956,7 @@ Ordered_key::cmp_keys_by_row_data_and_rownum(Ordered_key *key,
 
 void Ordered_key::sort_keys()
 {
-  my_qsort2(key_buff, key_buff_elements, sizeof(rownum_t),
+  my_qsort2(key_buff, (size_t) key_buff_elements, sizeof(rownum_t),
             (qsort2_cmp) &cmp_keys_by_row_data_and_rownum, (void*) this);
   /* Invalidate the current row position. */
   cur_key_idx= HA_POS_ERROR;
@@ -4619,15 +5126,17 @@ void Ordered_key::print(String *str)
 
 
 subselect_partial_match_engine::subselect_partial_match_engine(
-  subselect_uniquesubquery_engine *engine_arg,
+  THD *thd_arg, subselect_uniquesubquery_engine *engine_arg,
   TABLE *tmp_table_arg, Item_subselect *item_arg,
   select_result_interceptor *result_arg,
   List<Item> *equi_join_conds_arg,
-  uint covering_null_row_width_arg)
-  :subselect_engine(item_arg, result_arg),
+  bool has_covering_null_row_arg,
+  bool has_covering_null_columns_arg)
+  :subselect_engine(thd_arg, item_arg, result_arg),
    tmp_table(tmp_table_arg), lookup_engine(engine_arg),
    equi_join_conds(equi_join_conds_arg),
-   covering_null_row_width(covering_null_row_width_arg)
+   has_covering_null_row(has_covering_null_row_arg),
+   has_covering_null_columns(has_covering_null_columns_arg)
 {}
 
 
@@ -4665,7 +5174,7 @@ int subselect_partial_match_engine::exec()
     }
   }
 
-  if (covering_null_row_width == tmp_table->s->fields)
+  if (has_covering_null_row)
   {
     /*
       If there is a NULL-only row that coveres all columns the result of IN
@@ -4729,7 +5238,6 @@ void subselect_partial_match_engine::print(String *str,
 /*
   @param non_null_key_parts  
   @param partial_match_key_parts  A union of all single-column NULL key parts.
-  @param count_partial_match_columns Number of NULL keyparts (set bits above).
 
   @retval FALSE  the engine was initialized successfully
   @retval TRUE   there was some (memory allocation) error during initialization,
@@ -4750,23 +5258,29 @@ subselect_rowid_merge_engine::init(MY_BITMAP *non_null_key_parts,
   Item_in_subselect *item_in= (Item_in_subselect*) item;
   int error;
 
-  if (keys_count == 0)
+  if (merge_keys_count == 0)
   {
+    DBUG_ASSERT(bitmap_bits_set(partial_match_key_parts) == 0 ||
+                has_covering_null_row);
     /* There is nothing to initialize, we will only do regular lookups. */
     return FALSE;
   }
 
-  DBUG_ASSERT(!covering_null_row_width || (covering_null_row_width &&
-                                           keys_count == 1 &&
-                                           non_null_key_parts));
+  /*
+    If all nullable columns contain only NULLs, there must be one index
+    over all non-null columns.
+  */
+  DBUG_ASSERT(!has_covering_null_columns ||
+              (has_covering_null_columns &&
+               merge_keys_count == 1 && non_null_key_parts));
   /*
     Allocate buffers to hold the merged keys and the mapping between rowids and
     row numbers.
   */
-  if (!(merge_keys= (Ordered_key**) thd->alloc(keys_count *
+  if (!(merge_keys= (Ordered_key**) thd->alloc(merge_keys_count *
                                                sizeof(Ordered_key*))) ||
-      !(row_num_to_rowid= (uchar*) my_malloc(row_count * rowid_length *
-                                             sizeof(uchar), MYF(MY_WME))))
+      !(row_num_to_rowid= (uchar*) my_malloc((size_t)(row_count * rowid_length),
+        MYF(MY_WME))))
     return TRUE;
 
   /* Create the only non-NULL key if there is any. */
@@ -4782,15 +5296,13 @@ subselect_rowid_merge_engine::init(MY_BITMAP *non_null_key_parts,
   }
 
   /*
-    If there is a covering NULL row, the only key that is needed is the
-    only non-NULL key that is already created above. We create keys on
-    NULL-able columns only if there is no covering NULL row.
+    If all nullable columns contain NULLs, the only key that is needed is the
+    only non-NULL key that is already created above.
   */
-  if (!covering_null_row_width)
+  if (!has_covering_null_columns)
   {
-    if (bitmap_init_memroot(&matching_keys, keys_count, thd->mem_root) ||
-        bitmap_init_memroot(&matching_outer_cols, keys_count, thd->mem_root) ||
-        bitmap_init_memroot(&null_only_columns, keys_count, thd->mem_root))
+    if (bitmap_init_memroot(&matching_keys, merge_keys_count, thd->mem_root) ||
+        bitmap_init_memroot(&matching_outer_cols, merge_keys_count, thd->mem_root))
       return TRUE;
 
     /*
@@ -4799,31 +5311,25 @@ subselect_rowid_merge_engine::init(MY_BITMAP *non_null_key_parts,
     */
     for (uint i= 0; i < partial_match_key_parts->n_bits; i++)
     {
-      if (!bitmap_is_set(partial_match_key_parts, i))
+      /* Skip columns that have no NULLs, or contain only NULLs. */
+      if (!bitmap_is_set(partial_match_key_parts, i) ||
+          result_sink->get_null_count_of_col(i) == row_count)
         continue;
 
-      if (result_sink->get_null_count_of_col(i) == row_count)
-      {
-        bitmap_set_bit(&null_only_columns, cur_keyid);
-        continue;
-      }
-      else
-      {
-        merge_keys[cur_keyid]= new Ordered_key(
+      merge_keys[cur_keyid]= new Ordered_key(
                                      cur_keyid, tmp_table,
                                      item_in->left_expr->element_index(i),
                                      result_sink->get_null_count_of_col(i),
                                      result_sink->get_min_null_of_col(i),
                                      result_sink->get_max_null_of_col(i),
                                      row_num_to_rowid);
-        if (merge_keys[cur_keyid]->init(i))
-          return TRUE;
-        merge_keys[cur_keyid]->first();
-      }
+      if (merge_keys[cur_keyid]->init(i))
+        return TRUE;
+      merge_keys[cur_keyid]->first();
       ++cur_keyid;
     }
   }
-  DBUG_ASSERT(cur_keyid == keys_count);
+  DBUG_ASSERT(cur_keyid == merge_keys_count);
 
   /* Populate the indexes with data from the temporary table. */
   if (tmp_table->file->ha_rnd_init_with_error(1))
@@ -4864,7 +5370,7 @@ subselect_rowid_merge_engine::init(MY_BITMAP *non_null_key_parts,
       non_null_key->add_key(cur_rownum);
     }
 
-    for (uint i= (non_null_key ? 1 : 0); i < keys_count; i++)
+    for (uint i= (non_null_key ? 1 : 0); i < merge_keys_count; i++)
     {
       /*
         Check if the first and only indexed column contains NULL in the curent
@@ -4881,14 +5387,14 @@ subselect_rowid_merge_engine::init(MY_BITMAP *non_null_key_parts,
   tmp_table->file->ha_rnd_end();
 
   /* Sort all the keys by their NULL selectivity. */
-  my_qsort(merge_keys, keys_count, sizeof(Ordered_key*),
+  my_qsort(merge_keys, merge_keys_count, sizeof(Ordered_key*),
            (qsort_cmp) cmp_keys_by_null_selectivity);
 
   /* Sort the keys in each of the indexes. */
-  for (uint i= 0; i < keys_count; i++)
+  for (uint i= 0; i < merge_keys_count; i++)
     merge_keys[i]->sort_keys();
 
-  if (init_queue(&pq, keys_count, 0, FALSE,
+  if (init_queue(&pq, merge_keys_count, 0, FALSE,
                  subselect_rowid_merge_engine::cmp_keys_by_cur_rownum, NULL,
                  0, 0))
     return TRUE;
@@ -4900,10 +5406,10 @@ subselect_rowid_merge_engine::init(MY_BITMAP *non_null_key_parts,
 subselect_rowid_merge_engine::~subselect_rowid_merge_engine()
 {
   /* None of the resources below is allocated if there are no ordered keys. */
-  if (keys_count)
+  if (merge_keys_count)
   {
     my_free(row_num_to_rowid);
-    for (uint i= 0; i < keys_count; i++)
+    for (uint i= 0; i < merge_keys_count; i++)
       delete merge_keys[i];
     delete_queue(&pq);
     if (tmp_table->file->inited == handler::RND)
@@ -4961,6 +5467,10 @@ subselect_rowid_merge_engine::cmp_keys_by_cur_rownum(void *arg,
   Check if certain table row contains a NULL in all columns for which there is
   no match in the corresponding value index.
 
+  @note
+  There is no need to check the columns that contain only NULLs, because
+  those are guaranteed to match.
+
   @retval TRUE if a NULL row exists
   @retval FALSE otherwise
 */
@@ -4968,16 +5478,14 @@ subselect_rowid_merge_engine::cmp_keys_by_cur_rownum(void *arg,
 bool subselect_rowid_merge_engine::test_null_row(rownum_t row_num)
 {
   Ordered_key *cur_key;
-  uint cur_id;
-  for (uint i = 0; i < keys_count; i++)
+  for (uint i = 0; i < merge_keys_count; i++)
   {
     cur_key= merge_keys[i];
-    cur_id= cur_key->get_keyid();
-    if (bitmap_is_set(&matching_keys, cur_id))
+    if (bitmap_is_set(&matching_keys, cur_key->get_keyid()))
     {
       /*
-        The key 'i' (with id 'cur_keyid') already matches a value in row 'row_num',
-        thus we skip it as it can't possibly match a NULL.
+        The key 'i' (with id 'cur_keyid') already matches a value in row
+        'row_num', thus we skip it as it can't possibly match a NULL.
       */
       continue;
     }
@@ -5022,11 +5530,10 @@ bool subselect_rowid_merge_engine::partial_match()
   }
 
   /*
-    If there is a NULL (sub)row that covers all NULL-able columns,
-    then there is a guranteed partial match, and we don't need to search
-    for the matching row.
-   */
-  if (covering_null_row_width)
+    If all nullable columns contain only NULLs, then there is a guaranteed
+    partial match, and we don't need to search for a matching row.
+  */
+  if (has_covering_null_columns)
   {
     res= TRUE;
     goto end;
@@ -5038,10 +5545,10 @@ bool subselect_rowid_merge_engine::partial_match()
     Do not add the non_null_key, since it was already processed above.
   */
   bitmap_clear_all(&matching_outer_cols);
-  for (uint i= test(non_null_key); i < keys_count; i++)
+  for (uint i= test(non_null_key); i < merge_keys_count; i++)
   {
     DBUG_ASSERT(merge_keys[i]->get_column_count() == 1);
-    if (merge_keys[i]->get_search_key(0)->is_null())
+    if (merge_keys[i]->get_search_key(0)->null_value)
     {
       ++count_nulls_in_search_key;
       bitmap_set_bit(&matching_outer_cols, merge_keys[i]->get_keyid());
@@ -5076,7 +5583,6 @@ bool subselect_rowid_merge_engine::partial_match()
 
   min_key= (Ordered_key*) queue_remove_top(&pq);
   min_row_num= min_key->current();
-  bitmap_copy(&matching_keys, &null_only_columns);
   bitmap_set_bit(&matching_keys, min_key->get_keyid());
   bitmap_union(&matching_keys, &matching_outer_cols);
   if (min_key->next_same())
@@ -5112,7 +5618,6 @@ bool subselect_rowid_merge_engine::partial_match()
       {
         min_key= cur_key;
         min_row_num= cur_row_num;
-        bitmap_copy(&matching_keys, &null_only_columns);
         bitmap_set_bit(&matching_keys, min_key->get_keyid());
         bitmap_union(&matching_keys, &matching_outer_cols);
       }
@@ -5140,15 +5645,17 @@ end:
 
 
 subselect_table_scan_engine::subselect_table_scan_engine(
-  subselect_uniquesubquery_engine *engine_arg,
+  THD *thd_arg, subselect_uniquesubquery_engine *engine_arg,
   TABLE *tmp_table_arg,
   Item_subselect *item_arg,
   select_result_interceptor *result_arg,
   List<Item> *equi_join_conds_arg,
-  uint covering_null_row_width_arg)
-  :subselect_partial_match_engine(engine_arg, tmp_table_arg, item_arg,
+  bool has_covering_null_row_arg,
+  bool has_covering_null_columns_arg)
+  :subselect_partial_match_engine(thd_arg, engine_arg, tmp_table_arg, item_arg,
                                   result_arg, equi_join_conds_arg,
-                                  covering_null_row_width_arg)
+                                  has_covering_null_row_arg,
+                                  has_covering_null_columns_arg)
 {}
 
 
@@ -5187,10 +5694,6 @@ bool subselect_table_scan_engine::partial_match()
 
   tmp_table->file->extra_opt(HA_EXTRA_CACHE,
                              current_thd->variables.read_buff_size);
-  /*
-  TIMOUR:
-  scan_table() also calls "table->null_row= 0;", why, do we need it?
-  */
   for (;;)
   {
     error= tmp_table->file->ha_rnd_next(tmp_table->record[0]);
@@ -5239,3 +5742,4 @@ end:
 void subselect_table_scan_engine::cleanup()
 {
 }
+
diff --git a/sql/item_subselect.h b/sql/item_subselect.h
index f85067c56fa..de3279aeeef 100644
--- a/sql/item_subselect.h
+++ b/sql/item_subselect.h
@@ -46,30 +46,11 @@ class Cached_item;
 
 class Item_subselect :public Item_result_field
 {
-  bool value_assigned; 		/* value already assigned to subselect */
+  bool value_assigned;   /* value already assigned to subselect */
+  bool own_engine;  /* the engine was not taken from other Item_subselect */
 protected:
   /* thread handler, will be assigned in fix_fields only */
   THD *thd;
-  /* 
-    Used inside Item_subselect::fix_fields() according to this scenario:
-      > Item_subselect::fix_fields
-        > engine->prepare
-          > child_join->prepare
-            (Here we realize we need to do the rewrite and set
-             substitution= some new Item, eg. Item_in_optimizer )
-          < child_join->prepare
-        < engine->prepare
-        *ref= substitution;
-      < Item_subselect::fix_fields
-  */
-  Item *substitution;
-public:
-  /* unit of subquery */
-  st_select_lex_unit *unit;
-protected:
-  Item *expr_cache;
-  /* engine that perform execution of subselect (single select or union) */
-  subselect_engine *engine;
   /* old engine if engine was changed */
   subselect_engine *old_engine;
   /* cache of used external tables */
@@ -85,7 +66,38 @@ protected:
   
   bool inside_first_fix_fields;
   bool done_first_fix_fields;
+  Item *expr_cache;
+  /*
+    Set to TRUE if at optimization or execution time we determine that this
+    item's value is a constant. We need this member because it is not possible
+    to substitute 'this' with a constant item.
+  */
+  bool forced_const;
+#ifndef DBUG_OFF
+  /* Count the number of times this subquery predicate has been executed. */
+  uint exec_counter;
+#endif
 public:
+  /* 
+    Used inside Item_subselect::fix_fields() according to this scenario:
+      > Item_subselect::fix_fields
+        > engine->prepare
+          > child_join->prepare
+            (Here we realize we need to do the rewrite and set
+             substitution= some new Item, eg. Item_in_optimizer )
+          < child_join->prepare
+        < engine->prepare
+        *ref= substitution;
+        substitution= NULL;
+      < Item_subselect::fix_fields
+  */
+  /* TODO make this protected member again. */
+  Item *substitution;
+  /* engine that performs execution of subselect (single select or union) */
+  /* TODO make this protected member again. */
+  subselect_engine *engine;
+  /* unit of subquery */
+  st_select_lex_unit *unit;
   /* A reference from inside subquery predicate to somewhere outside of it */
   class Ref_to_outside : public Sql_alloc
   {
@@ -104,14 +116,6 @@ public:
   List<Ref_to_outside> upper_refs;
   st_select_lex *parent_select;
 
-  /**
-     List of references on items subquery depends on (externally resolved);
-
-     @note We can't store direct links on Items because it could be
-           substituted with other item (for example for grouping).
-   */
-  List<Item*> depends_on;
-
   /*
    TRUE<=>Table Elimination has made it redundant to evaluate this select
           (and so it is not part of QEP, etc)
@@ -126,13 +130,18 @@ public:
   /* TRUE <=> The underlying SELECT is correlated w.r.t some ancestor select */
   bool is_correlated; 
 
-  enum trans_res {RES_OK, RES_REDUCE, RES_ERROR};
   enum subs_type {UNKNOWN_SUBS, SINGLEROW_SUBS,
 		  EXISTS_SUBS, IN_SUBS, ALL_SUBS, ANY_SUBS};
 
   Item_subselect();
 
   virtual subs_type substype() { return UNKNOWN_SUBS; }
+  bool is_in_predicate()
+  {
+    /* TRUE for the IN, ALL and ANY kinds of subquery predicates. */
+    subs_type kind= substype();
+    return (kind == IN_SUBS || kind == ALL_SUBS || kind == ANY_SUBS);
+  }
 
   /*
     We need this method, because some compilers do not allow 'this'
@@ -149,7 +158,7 @@ public:
     eliminated= FALSE;
     null_value= 1;
   }
-  virtual trans_res select_transformer(JOIN *join);
+  virtual bool select_transformer(JOIN *join);
   bool assigned() { return value_assigned; }
   void assigned(bool a) { value_assigned= a; }
   enum Type type() const;
@@ -163,12 +172,21 @@ public:
   void fix_after_pullout(st_select_lex *new_parent, Item **ref);
   void recalc_used_tables(st_select_lex *new_parent, bool after_pullout);
   virtual bool exec();
+  /*
+    If subquery optimization or execution determines that the subquery has
+    an empty result, mark the subquery predicate as a constant value.
+  */
+  void make_const()
+  { 
+    used_tables_cache= 0;
+    const_item_cache= 0;
+    forced_const= TRUE; 
+  }
   virtual void fix_length_and_dec();
   table_map used_tables() const;
   table_map not_null_tables() const { return 0; }
   bool const_item() const;
   inline table_map get_used_tables_cache() { return used_tables_cache; }
-  inline bool get_const_item_cache() { return const_item_cache; }
   Item *get_tmp_table_item(THD *thd);
   void update_used_tables();
   virtual void print(String *str, enum_query_type query_type);
@@ -186,6 +204,7 @@ public:
   */
   bool is_evaluated() const;
   bool is_uncacheable() const;
+  bool is_expensive() { return TRUE; }
 
   /*
     Used by max/min subquery to initialize value presence registration
@@ -195,11 +214,23 @@ public:
   enum_parsing_place place() { return parsing_place; }
   bool walk(Item_processor processor, bool walk_subquery, uchar *arg);
   bool mark_as_eliminated_processor(uchar *arg);
+  bool eliminate_subselect_processor(uchar *arg);
+  bool set_fake_select_as_master_processor(uchar *arg);
   bool enumerate_field_refs_processor(uchar *arg);
   bool check_vcol_func_processor(uchar *int_arg) 
   {
     return trace_unsupported_by_check_vcol_func_processor("subselect");
   }
+  /**
+    Callback to test if an IN predicate is expensive.
+
+    @note
+    The return value affects the behavior of make_cond_for_table().
+
+    @retval TRUE  if the predicate is expensive
+    @retval FALSE otherwise
+  */
+  bool is_expensive_processor(uchar *arg) { return TRUE; }
 
   /**
     Get the SELECT_LEX structure associated with this Item.
@@ -208,6 +239,7 @@ public:
   st_select_lex* get_select_lex();
   const char *func_name() const { DBUG_ASSERT(0); return "subselect"; }
   virtual bool expr_cache_is_needed(THD *);
+  virtual void get_cache_parameters(List<Item> &parameters);
 
   friend class select_result_interceptor;
   friend class Item_in_optimizer;
@@ -237,7 +269,7 @@ public:
   subs_type substype() { return SINGLEROW_SUBS; }
 
   void reset();
-  trans_res select_transformer(JOIN *join);
+  bool select_transformer(JOIN *join);
   void store(uint i, Item* item);
   double val_real();
   longlong val_int ();
@@ -298,6 +330,8 @@ class Item_exists_subselect :public Item_subselect
 protected:
   bool value; /* value of this item (boolean: exists/not-exists) */
 
+  void init_length_and_dec();
+
 public:
   Item_exists_subselect(st_select_lex *select_lex);
   Item_exists_subselect(): Item_subselect() {}
@@ -326,6 +360,26 @@ public:
 };
 
 
+TABLE_LIST * const NO_JOIN_NEST=(TABLE_LIST*)0x1;
+
+/*
+  Possible methods to execute an IN predicate. These are set by the optimizer
+  based on user-set optimizer switches, semantic analysis and cost comparison.
+*/
+#define SUBS_NOT_TRANSFORMED 0 /* No execution method was chosen for this IN. */
+#define SUBS_SEMI_JOIN 1       /* IN was converted to semi-join. */
+#define SUBS_IN_TO_EXISTS 2    /* IN was converted to correlated EXISTS. */
+#define SUBS_MATERIALIZATION 4 /* Execute IN via subquery materialization. */
+/* Partial matching substrategies of MATERIALIZATION. */
+#define SUBS_PARTIAL_MATCH_ROWID_MERGE 8
+#define SUBS_PARTIAL_MATCH_TABLE_SCAN 16
+/* ALL/ANY will be transformed with max/min optimization */
+/*   The subquery has no aggregates, transform it into a MAX/MIN query. */
+#define SUBS_MAXMIN_INJECTED 32
+/*   The subquery has aggregates, use a special max/min subselect engine. */
+#define SUBS_MAXMIN_ENGINE 64
+
+
 /**
   Representation of IN subquery predicates of the form
   "left_expr IN (SELECT ...)".
@@ -343,8 +397,6 @@ public:
 
 class Item_in_subselect :public Item_exists_subselect
 {
-public:
-  Item *left_expr;
 protected:
   /*
     Cache of the left operand of the subquery predicate. Allocated in the
@@ -352,43 +404,47 @@ protected:
   */
   List<Cached_item> *left_expr_cache;
   bool first_execution;
-  /*
-    Set to TRUE if at query execution time we determine that this item's
-    value is a constant during this execution. We need this member because
-    it is not possible to substitute 'this' with a constant item.
-  */
-  bool is_constant;
 
   /*
     expr & optimizer used in subselect rewriting to store Item for
     all JOIN in UNION
   */
   Item *expr;
-  Item_in_optimizer *optimizer;
   bool was_null;
   bool abort_on_null;
 public:
+  Item_in_optimizer *optimizer;
+protected:
   /* Used to trigger on/off conditions that were pushed down to subselect */
   bool *pushed_cond_guards;
-  
+  Comp_creator *func;
+
+protected:
+  bool init_cond_guards();
+  bool select_in_like_transformer(JOIN *join);
+  bool single_value_transformer(JOIN *join);
+  bool row_value_transformer(JOIN * join);
+  bool fix_having(Item *having, st_select_lex *select_lex);
+  bool create_single_in_to_exists_cond(JOIN * join,
+                                       Item **where_item,
+                                       Item **having_item);
+  bool create_row_in_to_exists_cond(JOIN * join,
+                                    Item **where_item,
+                                    Item **having_item);
+public:
+  Item *left_expr;
   /* Priority of this predicate in the convert-to-semi-join-nest process. */
   int sj_convert_priority;
   /*
     Used by subquery optimizations to keep track about in which clause this
     subquery predicate is located: 
-      (TABLE_LIST*) 1   - the predicate is an AND-part of the WHERE
+      NO_JOIN_NEST      - the predicate is an AND-part of the WHERE
       join nest pointer - the predicate is an AND-part of ON expression
                           of a join nest   
       NULL              - for all other locations
     See also THD::emb_on_expr_nest.
   */
   TABLE_LIST *emb_on_expr_nest;
-  /* 
-    Location of the subquery predicate. It is either
-     - pointer to join nest if the subquery predicate is in the ON expression
-     - (TABLE_LIST*)1 if the predicate is in the WHERE.
-  */
-  TABLE_LIST *expr_join_nest;
   /*
     Types of left_expr and subquery's select list allow to perform subquery
     materialization. Currently, we set this to FALSE when it as well could
@@ -400,16 +456,36 @@ public:
     Same as above, but they also allow to scan the materialized table. 
   */
   bool sjm_scan_allowed;
+  double jtbm_read_time;
+  double jtbm_record_count;
 
-  /* The method chosen to execute the IN predicate.  */
-  enum enum_exec_method {
-    NOT_TRANSFORMED, /* No execution method was chosen for this IN. */
-    SEMI_JOIN,   /* IN was converted to semi-join nest and should be removed. */
-    IN_TO_EXISTS, /* IN was converted to correlated EXISTS. */
-    MATERIALIZATION /* IN will be executed via subquery materialization. */
-  };
-  enum_exec_method exec_method;
+  /* A bitmap of possible execution strategies for an IN predicate. */
+  uchar in_strategy;
+
+  bool is_jtbm_merged;
 
+  /*
+    TRUE<=>this is a flattenable semi-join, FALSE otherwise.
+  */
+  bool is_flattenable_semijoin;
+
+  /*
+    TRUE<=>registered in the list of semijoins in outer select
+  */
+  bool is_registered_semijoin;
+
+  /*
+    Used to determine how this subselect item is represented in the item tree,
+    in case there is a need to locate it there and replace with something else.
+    Two options are possible:
+      1. This item is there 'as-is'.
+      2. This item is wrapped within Item_in_optimizer.
+  */
+  Item *original_item()
+  {
+    return is_flattenable_semijoin ? (Item*)this : (Item*)optimizer;
+  }
+  
   bool *get_cond_guard(int i)
   {
     return pushed_cond_guards ? pushed_cond_guards + i : NULL;
@@ -426,9 +502,11 @@ public:
   Item_in_subselect(Item * left_expr, st_select_lex *select_lex);
   Item_in_subselect()
     :Item_exists_subselect(), left_expr_cache(0), first_execution(TRUE),
-    is_constant(FALSE), optimizer(0), abort_on_null(0),
-    pushed_cond_guards(NULL), exec_method(NOT_TRANSFORMED), upper_item(0)
-  {}
+     abort_on_null(0), optimizer(0),
+    pushed_cond_guards(NULL), func(NULL), in_strategy(SUBS_NOT_TRANSFORMED),
+    is_jtbm_merged(FALSE),
+    upper_item(0)
+    {}
   void cleanup();
   subs_type substype() { return IN_SUBS; }
   void reset() 
@@ -438,13 +516,10 @@ public:
     null_value= 0;
     was_null= 0;
   }
-  trans_res select_transformer(JOIN *join);
-  trans_res select_in_like_transformer(JOIN *join, Comp_creator *func);
-  trans_res single_value_transformer(JOIN *join, Comp_creator *func);
-  trans_res row_value_transformer(JOIN * join);
-  trans_res single_value_in_to_exists_transformer(JOIN * join,
-                                                  Comp_creator *func);
-  trans_res row_value_in_to_exists_transformer(JOIN * join);
+  bool select_transformer(JOIN *join);
+  bool create_in_to_exists_cond(JOIN *join_arg);
+  bool inject_in_to_exists_cond(JOIN *join_arg);
+
   virtual bool exec();
   longlong val_int();
   double val_real();
@@ -457,15 +532,16 @@ public:
   bool test_limit(st_select_lex_unit *unit);
   virtual void print(String *str, enum_query_type query_type);
   bool fix_fields(THD *thd, Item **ref);
+  void fix_length_and_dec();
   void fix_after_pullout(st_select_lex *new_parent, Item **ref);
   void update_used_tables();
-  bool setup_engine();
+  bool setup_mat_engine();
   bool init_left_expr_cache();
   /* Inform 'this' that it was computed, and contains a valid result. */
   void set_first_execution() { if (first_execution) first_execution= FALSE; }
-  bool is_expensive_processor(uchar *arg);
   bool expr_cache_is_needed(THD *thd);
   
+  int optimize(double *out_rows, double *cost);
   /* 
     Return the identifier that we could use to identify the subquery for the
     user.
@@ -485,16 +561,19 @@ class Item_allany_subselect :public Item_in_subselect
 {
 public:
   chooser_compare_func_creator func_creator;
-  Comp_creator *func;
   bool all;
 
   Item_allany_subselect(Item * left_expr, chooser_compare_func_creator fc,
                         st_select_lex *select_lex, bool all);
 
+  void cleanup();
   // only ALL subquery has upper not
   subs_type substype() { return all?ALL_SUBS:ANY_SUBS; }
-  trans_res select_transformer(JOIN *join);
+  bool select_transformer(JOIN *join);
+  void create_comp_func(bool invert) { func= func_creator(invert); }
   virtual void print(String *str, enum_query_type query_type);
+  bool is_maxmin_applicable(JOIN *join);
+  bool transform_into_max_min(JOIN *join);
 };
 
 
@@ -514,14 +593,15 @@ public:
                          INDEXSUBQUERY_ENGINE, HASH_SJ_ENGINE,
                          ROWID_MERGE_ENGINE, TABLE_SCAN_ENGINE};
 
-  subselect_engine(Item_subselect *si, select_result_interceptor *res)
-    :thd(0)
+  subselect_engine(THD *thd_arg, Item_subselect *si,
+                   select_result_interceptor *res)
   {
     result= res;
     item= si;
     res_type= STRING_RESULT;
     res_field_type= MYSQL_TYPE_VAR_STRING;
     maybe_null= 0;
+    set_thd(thd_arg);
   }
   virtual ~subselect_engine() {}; // to satisfy compiler
   virtual void cleanup()= 0;
@@ -563,9 +643,11 @@ public:
   virtual bool may_be_null() { return maybe_null; };
   virtual table_map upper_select_const_tables()= 0;
   static table_map calc_const_tables(TABLE_LIST *);
+  static table_map calc_const_tables(List<TABLE_LIST> &list);
   virtual void print(String *str, enum_query_type query_type)= 0;
   virtual bool change_result(Item_subselect *si,
-                             select_result_interceptor *result)= 0;
+                             select_result_interceptor *result,
+                             bool temp= FALSE)= 0;
   virtual bool no_tables()= 0;
   virtual bool is_executed() const { return FALSE; }
   /* Check if subquery produced any rows during last query execution */
@@ -580,12 +662,11 @@ protected:
 class subselect_single_select_engine: public subselect_engine
 {
   bool prepared; /* simple subselect is prepared */
-  bool optimized; /* simple subselect is optimized */
   bool executed; /* simple subselect is executed */
   st_select_lex *select_lex; /* corresponding select_lex */
   JOIN * join; /* corresponding JOIN structure */
 public:
-  subselect_single_select_engine(st_select_lex *select,
+  subselect_single_select_engine(THD *thd_arg, st_select_lex *select,
 				 select_result_interceptor *result,
 				 Item_subselect *item);
   void cleanup();
@@ -597,7 +678,9 @@ public:
   void exclude();
   table_map upper_select_const_tables();
   virtual void print (String *str, enum_query_type query_type);
-  bool change_result(Item_subselect *si, select_result_interceptor *result);
+  bool change_result(Item_subselect *si,
+                     select_result_interceptor *result,
+                     bool temp);
   bool no_tables();
   bool may_be_null();
   bool is_executed() const { return executed; }
@@ -614,7 +697,7 @@ class subselect_union_engine: public subselect_engine
 {
   st_select_lex_unit *unit;  /* corresponding unit structure */
 public:
-  subselect_union_engine(st_select_lex_unit *u,
+  subselect_union_engine(THD *thd_arg, st_select_lex_unit *u,
 			 select_result_interceptor *result,
 			 Item_subselect *item);
   void cleanup();
@@ -626,7 +709,9 @@ public:
   void exclude();
   table_map upper_select_const_tables();
   virtual void print (String *str, enum_query_type query_type);
-  bool change_result(Item_subselect *si, select_result_interceptor *result);
+  bool change_result(Item_subselect *si,
+                     select_result_interceptor *result,
+                     bool temp= FALSE);
   bool no_tables();
   bool is_executed() const;
   bool no_rows();
@@ -670,21 +755,21 @@ public:
   // constructor can assign THD because it will be called after JOIN::prepare
   subselect_uniquesubquery_engine(THD *thd_arg, st_join_table *tab_arg,
 				  Item_subselect *subs, Item *where)
-    :subselect_engine(subs, 0), tab(tab_arg), cond(where)
-  {
-    set_thd(thd_arg);
-  }
+    :subselect_engine(thd_arg, subs, 0), tab(tab_arg), cond(where)
+  {}
   ~subselect_uniquesubquery_engine();
   void cleanup();
   int prepare();
   void fix_length_and_dec(Item_cache** row);
   int exec();
   uint cols() { return 1; }
-  uint8 uncacheable() { return UNCACHEABLE_DEPENDENT; }
+  uint8 uncacheable() { return UNCACHEABLE_DEPENDENT_INJECTED; }
   void exclude();
   table_map upper_select_const_tables() { return 0; }
   virtual void print (String *str, enum_query_type query_type);
-  bool change_result(Item_subselect *si, select_result_interceptor *result);
+  bool change_result(Item_subselect *si,
+                     select_result_interceptor *result,
+                     bool temp= FALSE);
   bool no_tables();
   int index_lookup(); /* TIMOUR: this method needs refactoring. */
   int scan_table();
@@ -774,7 +859,7 @@ inline bool Item_subselect::is_uncacheable() const
 
 class subselect_hash_sj_engine : public subselect_engine
 {
-protected:
+public:
   /* The table into which the subquery is materialized. */
   TABLE *tmp_table;
   /* TRUE if the subquery was materialized into a temp table. */
@@ -786,66 +871,34 @@ protected:
     of subselect_single_select_engine::[prepare | cols].
   */
   subselect_single_select_engine *materialize_engine;
-  /* The engine used to compute the IN predicate. */
-  subselect_engine *lookup_engine;
   /*
     QEP to execute the subquery and materialize its result into a
     temporary table. Created during the first call to exec().
   */
   JOIN *materialize_join;
-
-  /* Keyparts of the only non-NULL composite index in a rowid merge. */
-  MY_BITMAP non_null_key_parts;
-  /* Keyparts of the single column indexes with NULL, one keypart per index. */
-  MY_BITMAP partial_match_key_parts;
-  uint count_partial_match_columns;
-  uint count_null_only_columns;
   /*
     A conjunction of all the equality condtions between all pairs of expressions
     that are arguments of an IN predicate. We need these to post-filter some
     IN results because index lookups sometimes match values that are actually
     not equal to the search key in SQL terms.
- */
+  */
   Item_cond_and *semi_join_conds;
-  /* Possible execution strategies that can be used to compute hash semi-join.*/
-  enum exec_strategy {
-    UNDEFINED,
-    COMPLETE_MATCH, /* Use regular index lookups. */
-    PARTIAL_MATCH,  /* Use some partial matching strategy. */
-    PARTIAL_MATCH_MERGE, /* Use partial matching through index merging. */
-    PARTIAL_MATCH_SCAN,  /* Use partial matching through table scan. */
-    IMPOSSIBLE      /* Subquery materialization is not applicable. */
-  };
-  /* The chosen execution strategy. Computed after materialization. */
-  exec_strategy strategy;
-protected:
-  exec_strategy get_strategy_using_schema();
-  exec_strategy get_strategy_using_data();
-  size_t rowid_merge_buff_size(bool has_non_null_key,
-                               bool has_covering_null_row,
-                               MY_BITMAP *partial_match_key_parts);
-  void choose_partial_match_strategy(bool has_non_null_key,
-                                     bool has_covering_null_row,
-                                     MY_BITMAP *partial_match_key_parts);
-  bool make_semi_join_conds();
-  subselect_uniquesubquery_engine* make_unique_engine();
+  Name_resolution_context *semi_join_conds_context;
+
 
-public:
   subselect_hash_sj_engine(THD *thd, Item_subselect *in_predicate,
                            subselect_single_select_engine *old_engine)
-    :subselect_engine(in_predicate, NULL), tmp_table(NULL),
-    is_materialized(FALSE), materialize_engine(old_engine), lookup_engine(NULL),
-    materialize_join(NULL), count_partial_match_columns(0),
-    count_null_only_columns(0), semi_join_conds(NULL), strategy(UNDEFINED)
-  {
-    set_thd(thd);
-  }
+    : subselect_engine(thd, in_predicate, NULL), 
+      tmp_table(NULL), is_materialized(FALSE), materialize_engine(old_engine),
+      materialize_join(NULL),  semi_join_conds(NULL), lookup_engine(NULL),
+      count_partial_match_columns(0), count_null_only_columns(0),
+      strategy(UNDEFINED)
+  {}
   ~subselect_hash_sj_engine();
 
-  bool init_permanent(List<Item> *tmp_columns);
-  bool init_runtime();
+  bool init(List<Item> *tmp_columns, uint subquery_id);
   void cleanup();
-  int prepare() { return 0; } /* Override virtual function in base class. */
+  int prepare();
   int exec();
   virtual void print(String *str, enum_query_type query_type);
   uint cols()
@@ -863,8 +916,42 @@ public:
   void fix_length_and_dec(Item_cache** row);//=>base class
   void exclude(); //=>base class
   //=>base class
-  bool change_result(Item_subselect *si, select_result_interceptor *result);
+  bool change_result(Item_subselect *si,
+                     select_result_interceptor *result,
+                     bool temp= FALSE);
   bool no_tables();//=>base class
+
+protected:
+  /* The engine used to compute the IN predicate. */
+  subselect_engine *lookup_engine;
+  /* Keyparts of the only non-NULL composite index in a rowid merge. */
+  MY_BITMAP non_null_key_parts;
+  /* Keyparts of the single column indexes with NULL, one keypart per index. */
+  MY_BITMAP partial_match_key_parts;
+  uint count_partial_match_columns;
+  uint count_null_only_columns;
+  /* Possible execution strategies that can be used to compute hash semi-join.*/
+  enum exec_strategy {
+    UNDEFINED,
+    COMPLETE_MATCH, /* Use regular index lookups. */
+    PARTIAL_MATCH,  /* Use some partial matching strategy. */
+    PARTIAL_MATCH_MERGE, /* Use partial matching through index merging. */
+    PARTIAL_MATCH_SCAN,  /* Use partial matching through table scan. */
+    IMPOSSIBLE      /* Subquery materialization is not applicable. */
+  };
+  /* The chosen execution strategy. Computed after materialization. */
+  exec_strategy strategy;
+  exec_strategy get_strategy_using_schema();
+  exec_strategy get_strategy_using_data();
+  ulonglong rowid_merge_buff_size(bool has_non_null_key,
+                                  bool has_covering_null_row,
+                                  MY_BITMAP *partial_match_key_parts);
+  void choose_partial_match_strategy(bool has_non_null_key,
+                                     bool has_covering_null_row,
+                                     MY_BITMAP *partial_match_key_parts);
+  bool make_semi_join_conds();
+  subselect_uniquesubquery_engine* make_unique_engine();
+
 };
 
 
@@ -1033,7 +1120,7 @@ public:
 
   void set_null(rownum_t row_num)
   {
-    bitmap_set_bit(&null_key, row_num);
+    bitmap_set_bit(&null_key, (uint)row_num);
   }
   bool is_null(rownum_t row_num)
   {
@@ -1049,7 +1136,7 @@ public:
     }
     if (row_num > max_null_row || row_num < min_null_row)
       return FALSE;
-    return bitmap_is_set(&null_key, row_num);
+    return bitmap_is_set(&null_key, (uint)row_num);
   }
   void print(String *str);
 };
@@ -1069,19 +1156,28 @@ protected:
   /* A list of equalities between each pair of IN operands. */
   List<Item> *equi_join_conds;
   /*
-    If there is a row, such that all its NULL-able components are NULL, this
-    member is set to the number of covered columns. If there is no covering
-    row, then this is 0.
+    True if there is an all NULL row in tmp_table. If so, then if there is
+    no complete match, there is a guaranteed partial match.
   */
-  uint covering_null_row_width;
+  bool has_covering_null_row;
+
+  /*
+    True if all nullable columns of tmp_table consist of only NULL values.
+    If so, then if there is a match in the non-null columns, there is a
+    guaranteed partial match.
+  */
+  bool has_covering_null_columns;
+
 protected:
   virtual bool partial_match()= 0;
 public:
-  subselect_partial_match_engine(subselect_uniquesubquery_engine *engine_arg,
+  subselect_partial_match_engine(THD *thd_arg,
+                                 subselect_uniquesubquery_engine *engine_arg,
                                  TABLE *tmp_table_arg, Item_subselect *item_arg,
                                  select_result_interceptor *result_arg,
                                  List<Item> *equi_join_conds_arg,
-                                 uint covering_null_row_width_arg);
+                                 bool has_covering_null_row_arg,
+                                 bool has_covering_null_columns_arg);
   int prepare() { return 0; }
   int exec();
   void fix_length_and_dec(Item_cache**) {}
@@ -1089,7 +1185,9 @@ public:
   uint8 uncacheable() { return UNCACHEABLE_DEPENDENT; }
   void exclude() {}
   table_map upper_select_const_tables() { return 0; }
-  bool change_result(Item_subselect*, select_result_interceptor*)
+  bool change_result(Item_subselect*,
+                     select_result_interceptor*,
+                     bool temp= FALSE)
   { DBUG_ASSERT(FALSE); return false; }
   bool no_tables() { return false; }
   bool no_rows()
@@ -1127,11 +1225,6 @@ protected:
   */
   MY_BITMAP matching_outer_cols;
   /*
-    Columns that consist of only NULLs. Such columns match any value.
-    Computed once per query execution.
-  */
-  MY_BITMAP null_only_columns;
-  /*
     Indexes of row numbers, sorted by <column_value, row_number>. If an
     index may contain NULLs, the NULLs are stored efficiently in a bitmap.
 
@@ -1140,13 +1233,13 @@ protected:
     non-NULL columns, it is contained in keys[0].
   */
   Ordered_key **merge_keys;
-  /* The number of elements in keys. */
-  uint keys_count;
+  /* The number of elements in merge_keys. */
+  uint merge_keys_count;
   /*
     An index on all non-NULL columns of 'tmp_table'. The index has the
     logical form: <[v_i1 | ... | v_ik], rownum>. It allows to find the row
     number where the columns c_i1,...,c1_k contain the values v_i1,...,v_ik.
-    If such an index exists, it is always the first element of 'keys'.
+    If such an index exists, it is always the first element of 'merge_keys'.
   */
   Ordered_key *non_null_key;
   /*
@@ -1169,19 +1262,20 @@ protected:
   bool test_null_row(rownum_t row_num);
   bool partial_match();
 public:
-  subselect_rowid_merge_engine(subselect_uniquesubquery_engine *engine_arg,
-                               TABLE *tmp_table_arg, uint keys_count_arg,
-                               uint covering_null_row_width_arg,
+  subselect_rowid_merge_engine(THD *thd_arg,
+                               subselect_uniquesubquery_engine *engine_arg,
+                               TABLE *tmp_table_arg, uint merge_keys_count_arg,
+                               bool has_covering_null_row_arg,
+                               bool has_covering_null_columns_arg,
                                Item_subselect *item_arg,
                                select_result_interceptor *result_arg,
                                List<Item> *equi_join_conds_arg)
-    :subselect_partial_match_engine(engine_arg, tmp_table_arg, item_arg,
-                                    result_arg, equi_join_conds_arg,
-                                    covering_null_row_width_arg),
-    keys_count(keys_count_arg), non_null_key(NULL)
-  {
-    thd= lookup_engine->get_thd();
-  }
+    :subselect_partial_match_engine(thd_arg, engine_arg, tmp_table_arg,
+                                    item_arg, result_arg, equi_join_conds_arg,
+                                    has_covering_null_row_arg,
+                                    has_covering_null_columns_arg),
+    merge_keys_count(merge_keys_count_arg), non_null_key(NULL)
+  {}
   ~subselect_rowid_merge_engine();
   bool init(MY_BITMAP *non_null_key_parts, MY_BITMAP *partial_match_key_parts);
   void cleanup();
@@ -1194,11 +1288,13 @@ class subselect_table_scan_engine: public subselect_partial_match_engine
 protected:
   bool partial_match();
 public:
-  subselect_table_scan_engine(subselect_uniquesubquery_engine *engine_arg,
+  subselect_table_scan_engine(THD *thd_arg,
+                              subselect_uniquesubquery_engine *engine_arg,
                               TABLE *tmp_table_arg, Item_subselect *item_arg,
                               select_result_interceptor *result_arg,
                               List<Item> *equi_join_conds_arg,
-                              uint covering_null_row_width_arg);
+                              bool has_covering_null_row_arg,
+                              bool has_covering_null_columns_arg);
   void cleanup();
   virtual enum_engine_type engine_type() { return TABLE_SCAN_ENGINE; }
 };
diff --git a/sql/item_sum.cc b/sql/item_sum.cc
index c3835c7536c..d60eb1415b9 100644
--- a/sql/item_sum.cc
+++ b/sql/item_sum.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2010, Oracle and/or its affiliates.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -331,7 +331,6 @@ bool Item_sum::register_sum_func(THD *thd, Item **ref)
   if (aggr_level >= 0)
   {
     ref_by= ref;
-    thd->lex->current_select->register_dependency_item(aggr_sel, ref);
     /* Add the object to the list of registered objects assigned to aggr_sel */
     if (!aggr_sel->inner_sum_func_list)
       next= this;
@@ -368,6 +367,16 @@ bool Item_sum::register_sum_func(THD *thd, Item **ref)
 }
 
 
+bool Item_sum::collect_outer_ref_processor(uchar *param)
+{
+  Collect_deps_prm *prm= (Collect_deps_prm *)param;
+  SELECT_LEX *ds;
+  if ((ds= depended_from()) && ds->nest_level < prm->nest_level)
+    prm->parameters->add_unique(this, &cmp_items);
+  return FALSE;
+}
+
+
 Item_sum::Item_sum(List<Item> &list) :arg_count(list.elements), 
   forced_const(FALSE)
 {
@@ -432,6 +441,7 @@ void Item_sum::mark_as_sum_func()
   cur_select->n_sum_items++;
   cur_select->with_sum_func= 1;
   with_sum_func= 1;
+  with_field= 0;
 }
 
 
@@ -498,7 +508,7 @@ bool Item_sum::walk (Item_processor processor, bool walk_subquery,
 Field *Item_sum::create_tmp_field(bool group, TABLE *table,
                                   uint convert_blob_length)
 {
-  Field *field;
+  Field *UNINIT_VAR(field);
   switch (result_type()) {
   case REAL_RESULT:
     field= new Field_double(max_length, maybe_null, name, decimals, TRUE);
@@ -518,7 +528,8 @@ Field *Item_sum::create_tmp_field(bool group, TABLE *table,
     field= Field_new_decimal::create_from_item(this);
     break;
   case ROW_RESULT:
-  default:
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
     // This case should never be choosen
     DBUG_ASSERT(0);
     return 0;
@@ -543,7 +554,7 @@ void Item_sum::update_used_tables ()
     used_tables_cache&= PSEUDO_TABLE_BITS;
 
     /* the aggregate function is aggregated into its local context */
-    used_tables_cache |=  (1 << aggr_sel->join->tables) - 1;
+    used_tables_cache |=  (1 << aggr_sel->join->table_count) - 1;
   }
 }
 
@@ -985,7 +996,7 @@ bool Aggregator_distinct::add()
       */
       return tree->unique_add(table->record[0] + table->s->null_bytes);
     }
-    if ((error= table->file->ha_write_row(table->record[0])) &&
+    if ((error= table->file->ha_write_tmp_row(table->record[0])) &&
         table->file->is_fatal_error(error, HA_CHECK_DUP))
       return TRUE;
     return FALSE;
@@ -1154,7 +1165,8 @@ Item_sum_hybrid::fix_fields(THD *thd, Item **ref)
     max_length= float_length(decimals);
     break;
   case ROW_RESULT:
-  default:
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);
   };
   setup_hybrid(args[0], NULL);
@@ -1189,17 +1201,31 @@ Item_sum_hybrid::fix_fields(THD *thd, Item **ref)
     Setup cache/comparator of MIN/MAX functions. When called by the
     copy_or_same function value_arg parameter contains calculated value
     of the original MIN/MAX object and it is saved in this object's cache.
+
+    We mark the value and arg_cache with 'RAND_TABLE_BIT' to ensure
+    that Arg_comparator::compare_datetime() doesn't allocate new
+    item inside of Arg_comparator.  This would cause compare_datetime()
+    and Item_sum_min::add() to use different values!
 */
 
 void Item_sum_hybrid::setup_hybrid(Item *item, Item *value_arg)
 {
-  value= Item_cache::get_cache(item);
+  if (!(value= Item_cache::get_cache(item)))
+    return;
   value->setup(item);
   value->store(value_arg);
-  arg_cache= Item_cache::get_cache(item);
+  /* Don't cache value, as it will change */
+  if (!item->const_item())
+    value->set_used_tables(RAND_TABLE_BIT);
+  if (!(arg_cache= Item_cache::get_cache(item, item->cmp_type())))
+    return;
   arg_cache->setup(item);
+  /* Don't cache value, as it will change */
+  if (!item->const_item())
+    arg_cache->set_used_tables(RAND_TABLE_BIT);
   cmp= new Arg_comparator();
-  cmp->set_cmp_func(this, (Item**)&arg_cache, (Item**)&value, FALSE);
+  if (cmp)
+    cmp->set_cmp_func(this, (Item**)&arg_cache, (Item**)&value, FALSE);
   collation.set(item->collation);
 }
 
@@ -1224,14 +1250,17 @@ Field *Item_sum_hybrid::create_tmp_field(bool group, TABLE *table,
   */
   switch (args[0]->field_type()) {
   case MYSQL_TYPE_DATE:
-    field= new Field_newdate(maybe_null, name, collation.collation);
+    field= new Field_newdate(0, maybe_null ? (uchar*)"" : 0, 0, Field::NONE,
+                             name, collation.collation);
     break;
   case MYSQL_TYPE_TIME:
-    field= new Field_time(maybe_null, name, collation.collation);
+    field= new_Field_time(0, maybe_null ? (uchar*)"" : 0, 0, Field::NONE,
+                          name, decimals, collation.collation);
     break;
   case MYSQL_TYPE_TIMESTAMP:
   case MYSQL_TYPE_DATETIME:
-    field= new Field_datetime(maybe_null, name, collation.collation);
+    field= new_Field_datetime(0, maybe_null ? (uchar*)"" : 0, 0, Field::NONE,
+                              name, decimals, collation.collation);
     break;
   default:
     return Item_sum::create_tmp_field(group, table, convert_blob_length);
@@ -1290,13 +1319,14 @@ void Item_sum_sum::fix_length_and_dec()
   DBUG_ENTER("Item_sum_sum::fix_length_and_dec");
   maybe_null=null_value=1;
   decimals= args[0]->decimals;
-  switch (args[0]->result_type()) {
+  switch (args[0]->cast_to_int_type()) {
   case REAL_RESULT:
   case STRING_RESULT:
     hybrid_type= REAL_RESULT;
     sum= 0.0;
     break;
   case INT_RESULT:
+  case TIME_RESULT:
   case DECIMAL_RESULT:
   {
     /* SUM result can't be longer than length(arg) + length(MAX_ROWS) */
@@ -1310,7 +1340,7 @@ void Item_sum_sum::fix_length_and_dec()
     break;
   }
   case ROW_RESULT:
-  default:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);
   }
   DBUG_PRINT("info", ("Type: %s (%d, %d)",
@@ -1747,7 +1777,8 @@ void Item_sum_variance::fix_length_and_dec()
     break;
   }
   case ROW_RESULT:
-  default:
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);
   }
   DBUG_PRINT("info", ("Type: REAL_RESULT (%d, %d)", max_length, (int)decimals));
@@ -2184,7 +2215,8 @@ void Item_sum_hybrid::reset_field()
     break;
   }
   case ROW_RESULT:
-  default:
+  case TIME_RESULT:
+  case IMPOSSIBLE_RESULT:
     DBUG_ASSERT(0);
   }
 }
diff --git a/sql/item_sum.h b/sql/item_sum.h
index cf5a08ca271..cd5ce3df9fc 100644
--- a/sql/item_sum.h
+++ b/sql/item_sum.h
@@ -1,6 +1,5 @@
 #ifndef ITEM_SUM_INCLUDED
 #define ITEM_SUM_INCLUDED
-
 /* Copyright (c) 2000, 2010 Oracle and/or its affiliates. All rights reserved.
 
    This program is free software; you can redistribute it and/or modify
@@ -486,6 +485,7 @@ public:
   virtual Field *create_tmp_field(bool group, TABLE *table,
                                   uint convert_blob_length);
   bool walk(Item_processor processor, bool walk_subquery, uchar *argument);
+  virtual bool collect_outer_ref_processor(uchar *param);
   bool init_sum_func_check(THD *thd);
   bool check_sum_func(THD *thd, Item **ref);
   bool register_sum_func(THD *thd, Item **ref);
@@ -546,6 +546,7 @@ public:
   {
     return trace_unsupported_by_check_vcol_func_processor(func_name()); 
   }
+  bool clear_sum_processor(uchar *arg) { clear(); return 0; }
 };
 
 
@@ -1061,11 +1062,6 @@ protected:
   void restore_to_before_no_rows_in_result();
   Field *create_tmp_field(bool group, TABLE *table,
 			  uint convert_blob_length);
-  /*
-    MIN/MAX uses Item_cache_datetime for storing DATETIME values, thus
-    in this case a correct INT value can be provided.
-  */
-  bool result_as_longlong() { return args[0]->result_as_longlong(); }
 };
 
 
@@ -1456,8 +1452,13 @@ public:
   void make_unique();
   double val_real()
   {
-    String *res;  res=val_str(&str_value);
-    return res ? my_atof(res->c_ptr()) : 0.0;
+    int error;
+    const char *end;
+    String *res;
+    if (!(res= val_str(&str_value)))
+      return 0.0;
+    end= res->ptr() + res->length();
+    return (my_strtod(res->ptr(), (char**) &end, &error));
   }
   longlong val_int()
   {
diff --git a/sql/item_timefunc.cc b/sql/item_timefunc.cc
index 5aa4db219e8..2bda3bbab00 100644
--- a/sql/item_timefunc.cc
+++ b/sql/item_timefunc.cc
@@ -1,4 +1,5 @@
 /* Copyright (C) 2000-2003 MySQL AB
+   Copyright (c) 2009-2011, Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -39,14 +40,13 @@
 #include "sql_locale.h"          // MY_LOCALE my_locale_en_US
 #include "strfunc.h"             // check_word
 #include "sql_time.h"            // make_truncated_value_warning,
-                                 // make_time, get_date_from_daynr,
+                                 // get_date_from_daynr,
                                  // calc_weekday, calc_week,
                                  // convert_month_to_period,
                                  // convert_period_to_month,
-                                 // TIME_to_timestamp, make_date,
+                                 // TIME_to_timestamp,
                                  // calc_time_diff,
                                  // calc_time_from_sec,
-                                 // known_date_time_format,
                                  // get_date_time_format_str
 #include "tztime.h"              // struct Time_zone
 #include "sql_class.h"           // THD
@@ -56,193 +56,6 @@
 /** Day number for Dec 31st, 9999. */
 #define MAX_DAY_NUMBER 3652424L
 
-/**
-  @todo
-  OPTIMIZATION
-  - Replace the switch with a function that should be called for each
-  date type.
-  - Remove sprintf and opencode the conversion, like we do in
-  Field_datetime.
-
-  The reason for this functions existence is that as we don't have a
-  way to know if a datetime/time value has microseconds in them
-  we are now only adding microseconds to the output if the
-  value has microseconds.
-
-  We can't use a standard make_date_time() for this as we don't know
-  if someone will use %f in the format specifier in which case we would get
-  the microseconds twice.
-*/
-
-static bool make_datetime(date_time_format_types format, MYSQL_TIME *ltime,
-			  String *str)
-{
-  char *buff;
-  CHARSET_INFO *cs= &my_charset_numeric;
-  uint length= MAX_DATE_STRING_REP_LENGTH;
-
-  if (str->alloc(length))
-    return 1;
-  buff= (char*) str->ptr();
-
-  switch (format) {
-  case TIME_ONLY:
-    length= cs->cset->snprintf(cs, buff, length, "%s%02d:%02d:%02d",
-			       ltime->neg ? "-" : "",
-			       ltime->hour, ltime->minute, ltime->second);
-    break;
-  case TIME_MICROSECOND:
-    length= cs->cset->snprintf(cs, buff, length, "%s%02d:%02d:%02d.%06ld",
-			       ltime->neg ? "-" : "",
-			       ltime->hour, ltime->minute, ltime->second,
-			       ltime->second_part);
-    break;
-  case DATE_ONLY:
-    length= cs->cset->snprintf(cs, buff, length, "%04d-%02d-%02d",
-			       ltime->year, ltime->month, ltime->day);
-    break;
-  case DATE_TIME:
-    length= cs->cset->snprintf(cs, buff, length,
-			       "%04d-%02d-%02d %02d:%02d:%02d",
-			       ltime->year, ltime->month, ltime->day,
-			       ltime->hour, ltime->minute, ltime->second);
-    break;
-  case DATE_TIME_MICROSECOND:
-    length= cs->cset->snprintf(cs, buff, length,
-			       "%04d-%02d-%02d %02d:%02d:%02d.%06ld",
-			       ltime->year, ltime->month, ltime->day,
-			       ltime->hour, ltime->minute, ltime->second,
-			       ltime->second_part);
-    break;
-  }
-
-  str->length(length);
-  str->set_charset(cs);
-  return 0;
-}
-
-
-/*
-  Wrapper over make_datetime() with validation of the input MYSQL_TIME value
-
-  NOTE
-    see make_datetime() for more information
-
-  RETURN
-    1    if there was an error during converion
-    0    otherwise
-*/
-
-static bool make_datetime_with_warn(date_time_format_types format, MYSQL_TIME *ltime,
-                                    String *str)
-{
-  int warning= 0;
-
-  if (make_datetime(format, ltime, str))
-    return 1;
-  if (check_time_range(ltime, &warning))
-    return 1;
-  if (!warning)
-    return 0;
-
-  make_truncated_value_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-                               str->ptr(), str->length(),
-                               MYSQL_TIMESTAMP_TIME, NullS);
-  return make_datetime(format, ltime, str);
-}
-
-
-/*
-  Wrapper over make_time() with validation of the input MYSQL_TIME value
-
-  NOTE
-    see make_time() for more info
-
-  RETURN
-    1    if there was an error during conversion
-    0    otherwise
-*/
-
-static bool make_time_with_warn(const DATE_TIME_FORMAT *format,
-                                MYSQL_TIME *l_time, String *str)
-{
-  int warning= 0;
-  make_time(format, l_time, str);
-  if (check_time_range(l_time, &warning))
-    return 1;
-  if (warning)
-  {
-    make_truncated_value_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-                                 str->ptr(), str->length(),
-                                 MYSQL_TIMESTAMP_TIME, NullS);
-    make_time(format, l_time, str);
-  }
-
-  return 0;
-}
-
-
-/*
-  Convert seconds to MYSQL_TIME value with overflow checking
-
-  SYNOPSIS:
-    sec_to_time()
-    seconds          number of seconds
-    unsigned_flag    1, if 'seconds' is unsigned, 0, otherwise
-    ltime            output MYSQL_TIME value
-
-  DESCRIPTION
-    If the 'seconds' argument is inside MYSQL_TIME data range, convert it to a
-    corresponding value.
-    Otherwise, truncate the resulting value to the nearest endpoint, and
-    produce a warning message.
-
-  RETURN
-    1                if the value was truncated during conversion
-    0                otherwise
-*/
-  
-static bool sec_to_time(longlong seconds, bool unsigned_flag, MYSQL_TIME *ltime)
-{
-  uint sec;
-
-  bzero((char *)ltime, sizeof(*ltime));
-  
-  if (seconds < 0)
-  {
-    if (unsigned_flag)
-      goto overflow;
-    ltime->neg= 1;
-    if (seconds < -3020399)
-      goto overflow;
-    seconds= -seconds;
-  }
-  else if (seconds > 3020399)
-    goto overflow;
-  
-  sec= (uint) ((ulonglong) seconds % 3600);
-  ltime->hour= (uint) (seconds/3600);
-  ltime->minute= sec/60;
-  ltime->second= sec % 60;
-
-  return 0;
-
-overflow:
-  ltime->hour= TIME_MAX_HOUR;
-  ltime->minute= TIME_MAX_MINUTE;
-  ltime->second= TIME_MAX_SECOND;
-
-  char buf[22];
-  int len= (int)(longlong10_to_str(seconds, buf, unsigned_flag ? 10 : -10)
-                 - buf);
-  make_truncated_value_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-                               buf, len, MYSQL_TIMESTAMP_TIME,
-                               NullS);
-  
-  return 1;
-}
-
-
 /*
   Date formats corresponding to compound %r and %T conversion specifiers
 
@@ -291,7 +104,8 @@ static bool extract_date_time(DATE_TIME_FORMAT *format,
 			      const char *val, uint length, MYSQL_TIME *l_time,
                               timestamp_type cached_timestamp_type,
                               const char **sub_pattern_end,
-                              const char *date_time_type)
+                              const char *date_time_type,
+                              uint fuzzy_date)
 {
   int weekday= 0, yearday= 0, daypart= 0;
   int week_number= -1;
@@ -312,6 +126,8 @@ static bool extract_date_time(DATE_TIME_FORMAT *format,
   if (!sub_pattern_end)
     bzero((char*) l_time, sizeof(*l_time));
 
+  l_time->time_type= cached_timestamp_type;
+
   for (; ptr != end && val != val_end; ptr++)
   {
     /* Skip pre-space between each argument */
@@ -485,7 +301,7 @@ static bool extract_date_time(DATE_TIME_FORMAT *format,
         */
         if (extract_date_time(&time_ampm_format, val,
                               (uint)(val_end - val), l_time,
-                              cached_timestamp_type, &val, "time"))
+                              cached_timestamp_type, &val, "time", fuzzy_date))
           DBUG_RETURN(1);
         break;
 
@@ -493,7 +309,7 @@ static bool extract_date_time(DATE_TIME_FORMAT *format,
       case 'T':
         if (extract_date_time(&time_24hrs_format, val,
                               (uint)(val_end - val), l_time,
-                              cached_timestamp_type, &val, "time"))
+                              cached_timestamp_type, &val, "time", fuzzy_date))
           DBUG_RETURN(1);
         break;
 
@@ -600,6 +416,10 @@ static bool extract_date_time(DATE_TIME_FORMAT *format,
       l_time->minute > 59 || l_time->second > 59)
     goto err;
 
+  if ((fuzzy_date & TIME_NO_ZERO_DATE) &&
+       (l_time->year == 0 || l_time->month == 0 || l_time->day == 0))
+    goto err;
+
   if (val != val_end)
   {
     do
@@ -776,8 +596,7 @@ bool make_date_time(DATE_TIME_FORMAT *format, MYSQL_TIME *l_time,
 	str->append(hours_i < 12 ? "AM" : "PM",2);
 	break;
       case 'r':
-	length= sprintf(intbuff, 
-		    ((l_time->hour % 24) < 12) ?
+	length= sprintf(intbuff, ((l_time->hour % 24) < 12) ?
                     "%02d:%02d:%02d AM" : "%02d:%02d:%02d PM",
 		    (l_time->hour+11)%12+1,
 		    l_time->minute,
@@ -790,11 +609,8 @@ bool make_date_time(DATE_TIME_FORMAT *format, MYSQL_TIME *l_time,
 	str->append_with_prefill(intbuff, length, 2, '0');
 	break;
       case 'T':
-	length= sprintf(intbuff, 
-		    "%02d:%02d:%02d", 
-		    l_time->hour, 
-		    l_time->minute,
-		    l_time->second);
+	length= sprintf(intbuff, "%02d:%02d:%02d",
+		    l_time->hour, l_time->minute, l_time->second);
 	str->append(intbuff, length);
 	break;
       case 'U':
@@ -1098,7 +914,7 @@ longlong Item_func_dayofyear::val_int()
 {
   DBUG_ASSERT(fixed == 1);
   MYSQL_TIME ltime;
-  if (get_arg0_date(&ltime,TIME_NO_ZERO_DATE))
+  if (get_arg0_date(&ltime, TIME_NO_ZERO_IN_DATE | TIME_NO_ZERO_DATE))
     return 0;
   return (longlong) calc_daynr(ltime.year,ltime.month,ltime.day) -
     calc_daynr(ltime.year,1,1) + 1;
@@ -1364,21 +1180,23 @@ longlong Item_func_year::val_int_endpoint(bool left_endp, bool *incl_endp)
 }
 
 
-longlong Item_func_unix_timestamp::val_int()
+bool Item_func_unix_timestamp::get_timestamp_value(my_time_t *seconds,
+                                                   ulong *second_part)
 {
-  MYSQL_TIME ltime;
-  my_bool not_used;
-  
   DBUG_ASSERT(fixed == 1);
-  if (arg_count == 0)
-    return (longlong) current_thd->query_start();
   if (args[0]->type() == FIELD_ITEM)
   {						// Optimize timestamp field
     Field *field=((Item_field*) args[0])->field;
     if (field->type() == MYSQL_TYPE_TIMESTAMP)
-      return ((Field_timestamp*) field)->get_timestamp(&null_value);
+    {
+      if ((null_value= field->is_null()))
+        return 1;
+      *seconds= ((Field_timestamp*)field)->get_timestamp(second_part);
+      return 0;
+    }
   }
-  
+
+  MYSQL_TIME ltime;
   if (get_arg0_date(&ltime, 0))
   {
     /*
@@ -1387,12 +1205,42 @@ longlong Item_func_unix_timestamp::val_int()
       this case).
     */
     null_value= args[0]->null_value;
-    return 0;
+    return 1;
   }
+
+  uint error_code;
+  *seconds= TIME_to_timestamp(current_thd, &ltime, &error_code);
+  *second_part= ltime.second_part;
+  return (null_value= (error_code == ER_WARN_DATA_OUT_OF_RANGE));
+}
+
+
+longlong Item_func_unix_timestamp::int_op()
+{
+  if (arg_count == 0)
+    return (longlong) current_thd->query_start();
   
-  return (longlong) TIME_to_timestamp(current_thd, &ltime, &not_used);
+  ulong second_part;
+  my_time_t seconds;
+  if (get_timestamp_value(&seconds, &second_part))
+    return 0;
+
+  return seconds;
 }
 
+
+my_decimal *Item_func_unix_timestamp::decimal_op(my_decimal* buf)
+{
+  ulong second_part;
+  my_time_t seconds;
+  if (get_timestamp_value(&seconds, &second_part))
+    return 0;
+
+  return seconds2my_decimal(seconds < 0, seconds < 0 ? -seconds : seconds,
+                            second_part, buf);
+}
+
+
 enum_monotonicity_info Item_func_unix_timestamp::get_monotonicity_info() const
 {
   if (args[0]->type() == Item::FIELD_ITEM &&
@@ -1408,37 +1256,54 @@ longlong Item_func_unix_timestamp::val_int_endpoint(bool left_endp, bool *incl_e
   DBUG_ASSERT(arg_count == 1 &&
               args[0]->type() == Item::FIELD_ITEM &&
               args[0]->field_type() == MYSQL_TYPE_TIMESTAMP);
-  Field *field=((Item_field*) args[0])->field;
+  Field_timestamp *field=(Field_timestamp *)(((Item_field*)args[0])->field);
   /* Leave the incl_endp intact */
-  return ((Field_timestamp*) field)->get_timestamp(&null_value);
+  ulong unused;
+  my_time_t ts= field->get_timestamp(&unused);
+  null_value= field->is_null();
+  return ts;
 }
 
 
-longlong Item_func_time_to_sec::val_int()
+longlong Item_func_time_to_sec::int_op()
 {
   DBUG_ASSERT(fixed == 1);
   MYSQL_TIME ltime;
-  longlong seconds;
-  (void) get_arg0_time(&ltime);
-  seconds=ltime.hour*3600L+ltime.minute*60+ltime.second;
+  if (get_arg0_time(&ltime))
+    return 0;
+
+  longlong seconds=ltime.hour*3600L+ltime.minute*60+ltime.second;
   return ltime.neg ? -seconds : seconds;
 }
 
 
+my_decimal *Item_func_time_to_sec::decimal_op(my_decimal* buf)
+{
+  DBUG_ASSERT(fixed == 1);
+  MYSQL_TIME ltime;
+  if (get_arg0_time(&ltime))
+    return 0;
+
+  longlong seconds= ltime.hour*3600L+ltime.minute*60+ltime.second;
+  return seconds2my_decimal(ltime.neg, seconds, ltime.second_part, buf);
+}
+
+
 /**
   Convert a string to a interval value.
 
   To make code easy, allow interval objects without separators.
 */
 
-bool get_interval_value(Item *args,interval_type int_type,
-			       String *str_value, INTERVAL *interval)
+bool get_interval_value(Item *args,interval_type int_type, INTERVAL *interval)
 {
   ulonglong array[5];
   longlong UNINIT_VAR(value);
   const char *UNINIT_VAR(str);
   size_t UNINIT_VAR(length);
-  CHARSET_INFO *cs=str_value->charset();
+  CHARSET_INFO *UNINIT_VAR(cs);
+  char buf[100];
+  String str_value(buf, sizeof(buf), &my_charset_bin);
 
   bzero((char*) interval,sizeof(*interval));
   if ((int) int_type <= INTERVAL_MICROSECOND)
@@ -1455,11 +1320,12 @@ bool get_interval_value(Item *args,interval_type int_type,
   else
   {
     String *res;
-    if (!(res=args->val_str(str_value)))
+    if (!(res=args->val_str(&str_value)))
       return (1);
 
     /* record negative intervalls in interval->neg */
     str=res->ptr();
+    cs= res->charset();
     const char *end=str+res->length();
     while (str != end && my_isspace(cs,*str))
       str++;
@@ -1583,71 +1449,84 @@ bool get_interval_value(Item *args,interval_type int_type,
 }
 
 
-String *Item_date::val_str(String *str)
+void Item_temporal_func::fix_length_and_dec()
+{ 
+  static const uint max_time_type_width[5]=
+  { MAX_DATETIME_WIDTH, MAX_DATETIME_WIDTH, MAX_DATE_WIDTH,
+    MAX_DATETIME_WIDTH, MIN_TIME_WIDTH };
+
+  maybe_null= true;
+  max_length= max_time_type_width[mysql_type_to_time_type(field_type())+2];
+  if (decimals)
+  {
+    if (decimals == NOT_FIXED_DEC)
+      max_length+= TIME_SECOND_PART_DIGITS + 1;
+    else
+    {
+      set_if_smaller(decimals, TIME_SECOND_PART_DIGITS);
+      max_length+= decimals + 1;
+    }
+  }
+  sql_mode= current_thd->variables.sql_mode &
+                 (MODE_NO_ZERO_IN_DATE | MODE_NO_ZERO_DATE);
+  collation.set(&my_charset_numeric, DERIVATION_NUMERIC, MY_REPERTOIRE_ASCII);
+}
+
+String *Item_temporal_func::val_str(String *str)
+{
+  DBUG_ASSERT(fixed == 1);
+  return val_string_from_date(str);
+}
+
+
+longlong Item_temporal_func::val_int()
 {
   DBUG_ASSERT(fixed == 1);
   MYSQL_TIME ltime;
-  if (get_date(&ltime, TIME_FUZZY_DATE))
-    return (String *) 0;
-  if (str->alloc(MAX_DATE_STRING_REP_LENGTH))
-  {
-    null_value= 1;
-    return (String *) 0;
-  }
-  make_date((DATE_TIME_FORMAT *) 0, &ltime, str);
-  return str;
+  if (get_date(&ltime, TIME_FUZZY_DATE | sql_mode))
+    return 0;
+  return (longlong)TIME_to_ulonglong(&ltime);
 }
 
 
-longlong Item_date::val_int()
+double Item_temporal_func::val_real()
 {
   DBUG_ASSERT(fixed == 1);
   MYSQL_TIME ltime;
-  if (get_date(&ltime, TIME_FUZZY_DATE))
+  if (get_date(&ltime, TIME_FUZZY_DATE | sql_mode))
     return 0;
-  return (longlong) (ltime.year*10000L+ltime.month*100+ltime.day);
+  return TIME_to_double(&ltime);
 }
 
 
 bool Item_func_from_days::get_date(MYSQL_TIME *ltime, uint fuzzy_date)
 {
   longlong value=args[0]->val_int();
-  if ((null_value=args[0]->null_value))
-    return 1;
+  if (args[0]->null_value)
+    return (null_value= 1);
+  if ((fuzzy_date & TIME_NO_ZERO_DATE) && value == 0)
+    return (null_value= 1);
   bzero(ltime, sizeof(MYSQL_TIME));
   get_date_from_daynr((long) value, &ltime->year, &ltime->month, &ltime->day);
 
-  if ((null_value= (fuzzy_date & TIME_NO_ZERO_DATE) &&
-       (ltime->year == 0 || ltime->month == 0 || ltime->day == 0)))
-    return TRUE;
+  if ((fuzzy_date & TIME_NO_ZERO_DATE) &&
+       (ltime->year == 0 || ltime->month == 0 || ltime->day == 0))
+    return (null_value= 1);
 
   ltime->time_type= MYSQL_TIMESTAMP_DATE;
-  return 0;
+  return (null_value= 0);
 }
 
 
 void Item_func_curdate::fix_length_and_dec()
 {
-  Item_date::fix_length_and_dec();
-
   store_now_in_TIME(&ltime);
   
   /* We don't need to set second_part and neg because they already 0 */
   ltime.hour= ltime.minute= ltime.second= 0;
   ltime.time_type= MYSQL_TIMESTAMP_DATE;
-  value= (longlong) TIME_to_ulonglong_date(&ltime);
-}
-
-String *Item_func_curdate::val_str(String *str)
-{
-  DBUG_ASSERT(fixed == 1);
-  if (str->alloc(MAX_DATE_STRING_REP_LENGTH))
-  {
-    null_value= 1;
-    return (String *) 0;
-  }
-  make_date((DATE_TIME_FORMAT *) 0, &ltime, str);
-  return str;
+  Item_datefunc::fix_length_and_dec();
+  maybe_null= false;
 }
 
 /**
@@ -1657,8 +1536,7 @@ String *Item_func_curdate::val_str(String *str)
 void Item_func_curdate_local::store_now_in_TIME(MYSQL_TIME *now_time)
 {
   THD *thd= current_thd;
-  thd->variables.time_zone->gmt_sec_to_TIME(now_time, 
-                                             (my_time_t)thd->query_start());
+  thd->variables.time_zone->gmt_sec_to_TIME(now_time, thd->query_start());
   thd->time_zone_used= 1;
 }
 
@@ -1669,8 +1547,8 @@ void Item_func_curdate_local::store_now_in_TIME(MYSQL_TIME *now_time)
 */
 void Item_func_curdate_utc::store_now_in_TIME(MYSQL_TIME *now_time)
 {
-  my_tz_UTC->gmt_sec_to_TIME(now_time, 
-                             (my_time_t)(current_thd->query_start()));
+  THD *thd= current_thd;
+  my_tz_UTC->gmt_sec_to_TIME(now_time, thd->query_start());
   /* 
     We are not flagging this query as using time zone, since it uses fixed
     UTC-SYSTEM time-zone.
@@ -1686,25 +1564,35 @@ bool Item_func_curdate::get_date(MYSQL_TIME *res,
 }
 
 
-String *Item_func_curtime::val_str(String *str)
+bool Item_func_curtime::fix_fields(THD *thd, Item **items)
 {
-  DBUG_ASSERT(fixed == 1);
-  str_value.set(buff, buff_length, &my_charset_latin1);
-  return &str_value;
+  if (decimals > TIME_SECOND_PART_DIGITS)
+  {
+    my_error(ER_TOO_BIG_PRECISION, MYF(0), decimals, func_name(),
+             TIME_SECOND_PART_DIGITS);
+    return 1;
+  }
+  return Item_timefunc::fix_fields(thd, items);
 }
 
-
-void Item_func_curtime::fix_length_and_dec()
+bool Item_func_curtime::get_date(MYSQL_TIME *res,
+                                 uint fuzzy_date __attribute__((unused)))
 {
-  MYSQL_TIME ltime;
-
-  decimals= DATETIME_DEC;
-  store_now_in_TIME(&ltime);
-  value= TIME_to_ulonglong_time(&ltime);
-  buff_length= (uint) my_time_to_str(&ltime, buff);
-  fix_length_and_charset_datetime(buff_length);
+  *res= ltime;
+  return 0;
 }
 
+static void set_sec_part(ulong sec_part, MYSQL_TIME *ltime, Item *item)
+{
+  DBUG_ASSERT(item->decimals == AUTO_SEC_PART_DIGITS ||
+              item->decimals <= TIME_SECOND_PART_DIGITS);
+  if (item->decimals)
+  {
+    ltime->second_part= sec_part;
+    if (item->decimals < TIME_SECOND_PART_DIGITS)
+      ltime->second_part= sec_part_truncate(ltime->second_part, item->decimals);
+  }
+}
 
 /**
     Converts current time in my_time_t to MYSQL_TIME represenatation for local
@@ -1713,8 +1601,10 @@ void Item_func_curtime::fix_length_and_dec()
 void Item_func_curtime_local::store_now_in_TIME(MYSQL_TIME *now_time)
 {
   THD *thd= current_thd;
-  thd->variables.time_zone->gmt_sec_to_TIME(now_time, 
-                                             (my_time_t)thd->query_start());
+  thd->variables.time_zone->gmt_sec_to_TIME(now_time, thd->query_start());
+  now_time->year= now_time->month= now_time->day= 0;
+  now_time->time_type= MYSQL_TIMESTAMP_TIME;
+  set_sec_part(thd->query_start_sec_part(), now_time, this);
   thd->time_zone_used= 1;
 }
 
@@ -1725,35 +1615,28 @@ void Item_func_curtime_local::store_now_in_TIME(MYSQL_TIME *now_time)
 */
 void Item_func_curtime_utc::store_now_in_TIME(MYSQL_TIME *now_time)
 {
-  my_tz_UTC->gmt_sec_to_TIME(now_time, 
-                             (my_time_t)(current_thd->query_start()));
+  THD *thd= current_thd;
+  my_tz_UTC->gmt_sec_to_TIME(now_time, thd->query_start());
+  now_time->year= now_time->month= now_time->day= 0;
+  now_time->time_type= MYSQL_TIMESTAMP_TIME;
+  set_sec_part(thd->query_start_sec_part(), now_time, this);
   /* 
     We are not flagging this query as using time zone, since it uses fixed
     UTC-SYSTEM time-zone.
   */
 }
 
-
-String *Item_func_now::val_str(String *str)
-{
-  DBUG_ASSERT(fixed == 1);
-  str_value.set(buff, buff_length, &my_charset_numeric);
-  return &str_value;
-}
-
-
-void Item_func_now::fix_length_and_dec()
+bool Item_func_now::fix_fields(THD *thd, Item **items)
 {
-  decimals= DATETIME_DEC;
-
-  store_now_in_TIME(&ltime);
-  value= (longlong) TIME_to_ulonglong_datetime(&ltime);
-
-  buff_length= (uint) my_datetime_to_str(&ltime, buff);
-  fix_length_and_charset_datetime(buff_length);
+  if (decimals > TIME_SECOND_PART_DIGITS)
+  {
+    my_error(ER_TOO_BIG_PRECISION, MYF(0), decimals, func_name(),
+             TIME_SECOND_PART_DIGITS);
+    return 1;
+  }
+  return Item_temporal_func::fix_fields(thd, items);
 }
 
-
 /**
     Converts current time in my_time_t to MYSQL_TIME represenatation for local
     time zone. Defines time zone (local) used for whole NOW function.
@@ -1761,8 +1644,8 @@ void Item_func_now::fix_length_and_dec()
 void Item_func_now_local::store_now_in_TIME(MYSQL_TIME *now_time)
 {
   THD *thd= current_thd;
-  thd->variables.time_zone->gmt_sec_to_TIME(now_time, 
-                                             (my_time_t)thd->query_start());
+  thd->variables.time_zone->gmt_sec_to_TIME(now_time, thd->query_start());
+  set_sec_part(thd->query_start_sec_part(), now_time, this);
   thd->time_zone_used= 1;
 }
 
@@ -1773,8 +1656,9 @@ void Item_func_now_local::store_now_in_TIME(MYSQL_TIME *now_time)
 */
 void Item_func_now_utc::store_now_in_TIME(MYSQL_TIME *now_time)
 {
-  my_tz_UTC->gmt_sec_to_TIME(now_time, 
-                             (my_time_t)(current_thd->query_start()));
+  THD *thd= current_thd;
+  my_tz_UTC->gmt_sec_to_TIME(now_time, thd->query_start());
+  set_sec_part(thd->query_start_sec_part(), now_time, this);
   /* 
     We are not flagging this query as using time zone, since it uses fixed
     UTC-SYSTEM time-zone.
@@ -1790,13 +1674,6 @@ bool Item_func_now::get_date(MYSQL_TIME *res,
 }
 
 
-int Item_func_now::save_in_field(Field *to, bool no_conversions)
-{
-  to->set_notnull();
-  return to->store_time(&ltime, MYSQL_TIMESTAMP_DATETIME);
-}
-
-
 /**
     Converts current time in my_time_t to MYSQL_TIME represenatation for local
     time zone. Defines time zone (local) used for whole SYSDATE function.
@@ -1804,97 +1681,61 @@ int Item_func_now::save_in_field(Field *to, bool no_conversions)
 void Item_func_sysdate_local::store_now_in_TIME(MYSQL_TIME *now_time)
 {
   THD *thd= current_thd;
-  thd->variables.time_zone->gmt_sec_to_TIME(now_time, (my_time_t) my_time(0));
+  my_hrtime_t now= my_hrtime();
+  thd->variables.time_zone->gmt_sec_to_TIME(now_time, hrtime_to_my_time(now));
+  set_sec_part(hrtime_sec_part(now), now_time, this);
   thd->time_zone_used= 1;
 }
 
 
-String *Item_func_sysdate_local::val_str(String *str)
-{
-  DBUG_ASSERT(fixed == 1);
-  store_now_in_TIME(&ltime);
-  buff_length= (uint) my_datetime_to_str(&ltime, buff);
-  str_value.set(buff, buff_length, &my_charset_numeric);
-  return &str_value;
-}
-
-
-longlong Item_func_sysdate_local::val_int()
-{
-  DBUG_ASSERT(fixed == 1);
-  store_now_in_TIME(&ltime);
-  return (longlong) TIME_to_ulonglong_datetime(&ltime);
-}
-
-
-double Item_func_sysdate_local::val_real()
-{
-  DBUG_ASSERT(fixed == 1);
-  store_now_in_TIME(&ltime);
-  return ulonglong2double(TIME_to_ulonglong_datetime(&ltime));
-}
-
-
-void Item_func_sysdate_local::fix_length_and_dec()
-{
-  decimals= 0;
-  fix_length_and_charset_datetime(MAX_DATETIME_WIDTH);
-}
-
-
 bool Item_func_sysdate_local::get_date(MYSQL_TIME *res,
                                        uint fuzzy_date __attribute__((unused)))
 {
-  store_now_in_TIME(&ltime);
-  *res= ltime;
+  store_now_in_TIME(res);
   return 0;
 }
 
-
-int Item_func_sysdate_local::save_in_field(Field *to, bool no_conversions)
-{
-  store_now_in_TIME(&ltime);
-  to->set_notnull();
-  to->store_time(&ltime, MYSQL_TIMESTAMP_DATETIME);
-  return 0;
-}
-
-
-String *Item_func_sec_to_time::val_str(String *str)
+bool Item_func_sec_to_time::get_date(MYSQL_TIME *ltime, uint fuzzy_date)
 {
   DBUG_ASSERT(fixed == 1);
-  MYSQL_TIME ltime;
-  longlong arg_val= args[0]->val_int(); 
+  bool sign;
+  ulonglong sec;
+  ulong sec_part;
 
-  if ((null_value=args[0]->null_value) ||
-      str->alloc(MAX_DATE_STRING_REP_LENGTH))
-  {
-    null_value= 1;
-    return (String*) 0;
-  }
+  bzero((char *)ltime, sizeof(*ltime));
+  ltime->time_type= MYSQL_TIMESTAMP_TIME;
 
-  sec_to_time(arg_val, args[0]->unsigned_flag, &ltime);
-  
-  make_time((DATE_TIME_FORMAT *) 0, &ltime, str);
-  return str;
-}
+  sign= args[0]->get_seconds(&sec, &sec_part);
+
+  if ((null_value= args[0]->null_value))
+    return 1;
 
+  ltime->neg= sign;
+  if (sec > TIME_MAX_VALUE_SECONDS)
+    goto overflow;
 
-longlong Item_func_sec_to_time::val_int()
-{
-  DBUG_ASSERT(fixed == 1);
-  MYSQL_TIME ltime;
-  longlong arg_val= args[0]->val_int(); 
+  DBUG_ASSERT(sec_part <= TIME_MAX_SECOND_PART);
   
-  if ((null_value=args[0]->null_value))
-    return 0;
+  ltime->hour=   (uint) (sec/3600);
+  ltime->minute= (uint) (sec % 3600) /60;
+  ltime->second= (uint) sec % 60;
+  ltime->second_part= sec_part;
 
-  sec_to_time(arg_val, args[0]->unsigned_flag, &ltime);
+  return 0;
 
-  return (ltime.neg ? -1 : 1) *
-    (longlong) ((ltime.hour)*10000 + ltime.minute*100 + ltime.second);
-}
+overflow:
+  /* use check_time_range() to set ltime to the max value depending on dec */
+  int unused;
+  char buf[100];
+  String tmp(buf, sizeof(buf), &my_charset_bin), *err= args[0]->val_str(&tmp);
 
+  ltime->hour= TIME_MAX_HOUR+1;
+  check_time_range(ltime, decimals, &unused);
+  make_truncated_value_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+                               err->ptr(), err->length(),
+                               MYSQL_TIMESTAMP_TIME, NullS);
+  return 0;
+}
 
 void Item_func_date_format::fix_length_and_dec()
 {
@@ -2031,25 +1872,12 @@ String *Item_func_date_format::val_str(String *str)
   String *format;
   MYSQL_TIME l_time;
   uint size;
+  int is_time_flag = is_time_format ? TIME_TIME_ONLY : 0;
   DBUG_ASSERT(fixed == 1);
-
-  if (!is_time_format)
-  {
-    if (get_arg0_date(&l_time, TIME_FUZZY_DATE))
-      return 0;
-  }
-  else
-  {
-    String *res;
-    if (!(res=args[0]->val_str(str)) ||
-	(str_to_time_with_warn(res->charset(), res->ptr(), res->length(),
-	                       &l_time)))
-      goto null_date;
-
-    l_time.year=l_time.month=l_time.day=0;
-    null_value=0;
-  }
-
+  
+  if (get_arg0_date(&l_time, TIME_FUZZY_DATE | is_time_flag))
+    return 0;
+  
   if (!(format = args[1]->val_str(str)) || !format->length())
     goto null_date;
 
@@ -2087,98 +1915,39 @@ null_date:
 void Item_func_from_unixtime::fix_length_and_dec()
 { 
   thd= current_thd;
-  decimals= DATETIME_DEC;
-  fix_length_and_charset_datetime(MAX_DATETIME_WIDTH);
-  maybe_null= 1;
   thd->time_zone_used= 1;
+  decimals= args[0]->decimals;
+  Item_temporal_func::fix_length_and_dec();
 }
 
 
-String *Item_func_from_unixtime::val_str(String *str)
-{
-  MYSQL_TIME time_tmp;
-
-  DBUG_ASSERT(fixed == 1);
-
-  if (get_date(&time_tmp, 0))
-    return 0;
-
-  if (str->alloc(MAX_DATE_STRING_REP_LENGTH))
-  {
-    null_value= 1;
-    return 0;
-  }
-
-  make_datetime((DATE_TIME_FORMAT *) 0, &time_tmp, str);
-
-  return str;
-}
-
-
-longlong Item_func_from_unixtime::val_int()
-{
-  MYSQL_TIME time_tmp;
-
-  DBUG_ASSERT(fixed == 1);
-
-  if (get_date(&time_tmp, 0))
-    return 0;
-
-  return (longlong) TIME_to_ulonglong_datetime(&time_tmp);
-}
-
 bool Item_func_from_unixtime::get_date(MYSQL_TIME *ltime,
 				       uint fuzzy_date __attribute__((unused)))
 {
-  ulonglong tmp= (ulonglong)(args[0]->val_int());
-  /*
-    "tmp > TIMESTAMP_MAX_VALUE" check also covers case of negative
-    from_unixtime() argument since tmp is unsigned.
-  */
-  if ((null_value= (args[0]->null_value || tmp > TIMESTAMP_MAX_VALUE)))
-    return 1;
-
-  thd->variables.time_zone->gmt_sec_to_TIME(ltime, (my_time_t)tmp);
-
-  return 0;
-}
-
-
-void Item_func_convert_tz::fix_length_and_dec()
-{
-  decimals= 0;
-  fix_length_and_charset_datetime(MAX_DATETIME_WIDTH);
-  maybe_null= 1;
-}
+  bool sign;
+  ulonglong sec;
+  ulong sec_part;
 
+  bzero((char *)ltime, sizeof(*ltime));
+  ltime->time_type= MYSQL_TIMESTAMP_TIME;
 
-String *Item_func_convert_tz::val_str(String *str)
-{
-  MYSQL_TIME time_tmp;
+  sign= args[0]->get_seconds(&sec, &sec_part);
 
-  if (get_date(&time_tmp, 0))
-    return 0;
+  if (args[0]->null_value || sign || sec > TIMESTAMP_MAX_VALUE)
+    return (null_value= 1);
 
-  if (str->alloc(MAX_DATE_STRING_REP_LENGTH))
-  {
-    null_value= 1;
-    return 0;
-  }
+  thd->variables.time_zone->gmt_sec_to_TIME(ltime, (my_time_t)sec);
 
-  make_datetime((DATE_TIME_FORMAT *) 0, &time_tmp, str);
+  ltime->second_part= sec_part;
 
-  return str;
+  return (null_value= 0);
 }
 
 
-longlong Item_func_convert_tz::val_int()
+void Item_func_convert_tz::fix_length_and_dec()
 {
-  MYSQL_TIME time_tmp;
-
-  if (get_date(&time_tmp, 0))
-    return 0;
-  
-  return (longlong)TIME_to_ulonglong_datetime(&time_tmp);
+  decimals= args[0]->decimals;
+  Item_temporal_func::fix_length_and_dec();
 }
 
 
@@ -2201,29 +1970,29 @@ bool Item_func_convert_tz::get_date(MYSQL_TIME *ltime,
     to_tz_cached= args[2]->const_item();
   }
 
-  if (from_tz==0 || to_tz==0 || get_arg0_date(ltime, TIME_NO_ZERO_DATE))
-  {
-    null_value= 1;
-    return 1;
-  }
+  if (from_tz==0 || to_tz==0 ||
+      get_arg0_date(ltime, TIME_NO_ZERO_DATE | TIME_NO_ZERO_IN_DATE))
+    return (null_value= 1);
 
   {
-    my_bool not_used;
+    uint not_used;
     my_time_tmp= from_tz->TIME_to_gmt_sec(ltime, &not_used);
+    ulong sec_part= ltime->second_part;
     /* my_time_tmp is guranteed to be in the allowed range */
     if (my_time_tmp)
       to_tz->gmt_sec_to_TIME(ltime, my_time_tmp);
+    /* we rely on the fact that no timezone conversion can change sec_part */
+    ltime->second_part= sec_part;
   }
 
-  null_value= 0;
-  return 0;
+  return (null_value= 0);
 }
 
 
 void Item_func_convert_tz::cleanup()
 {
   from_tz_cached= to_tz_cached= 0;
-  Item_date_func::cleanup();
+  Item_temporal_func::cleanup();
 }
 
 
@@ -2231,18 +2000,20 @@ void Item_date_add_interval::fix_length_and_dec()
 {
   enum_field_types arg0_field_type;
 
-  maybe_null=1;
-
   /*
     The field type for the result of an Item_date function is defined as
     follows:
 
     - If first arg is a MYSQL_TYPE_DATETIME result is MYSQL_TYPE_DATETIME
     - If first arg is a MYSQL_TYPE_DATE and the interval type uses hours,
-      minutes or seconds then type is MYSQL_TYPE_DATETIME.
+      minutes or seconds then type is MYSQL_TYPE_DATETIME,
+      otherwise it's MYSQL_TYPE_DATE
+    - If first arg is a MYSQL_TYPE_TIME and the interval type isn't using
+      anything larger than days, then the result is MYSQL_TYPE_TIME,
+      otherwise it's MYSQL_TYPE_DATETIME
     - Otherwise the result is MYSQL_TYPE_STRING
-      (This is because you can't know if the string contains a DATE, MYSQL_TIME or
-      DATETIME argument)
+      (This is because you can't know if the string contains a DATE,
+      MYSQL_TIME or DATETIME argument)
   */
   cached_field_type= MYSQL_TYPE_STRING;
   arg0_field_type= args[0]->field_type();
@@ -2256,21 +2027,19 @@ void Item_date_add_interval::fix_length_and_dec()
     else
       cached_field_type= MYSQL_TYPE_DATETIME;
   }
-
-  if (cached_field_type == MYSQL_TYPE_STRING)
+  else if (arg0_field_type == MYSQL_TYPE_TIME)
   {
-    /* Behave as a usual string function when return type is VARCHAR. */
-    fix_length_and_charset(MAX_DATETIME_FULL_WIDTH, default_charset());
+    if (int_type >= INTERVAL_DAY && int_type != INTERVAL_YEAR_MONTH)
+      cached_field_type= arg0_field_type;
+    else
+      cached_field_type= MYSQL_TYPE_DATETIME;
   }
+  if (int_type == INTERVAL_MICROSECOND || int_type >= INTERVAL_DAY_MICROSECOND)
+    decimals= 6;
   else
-  {
-    /*
-      Follow the "Number-to-string conversion" rules as in WorkLog 2649
-      when return type is DATE or DATETIME.
-    */
-    fix_length_and_charset_datetime(MAX_DATETIME_FULL_WIDTH);
-  }
-  value.alloc(max_length);
+    decimals= args[0]->decimals;
+
+  Item_temporal_func::fix_length_and_dec();
 }
 
 
@@ -2280,57 +2049,19 @@ bool Item_date_add_interval::get_date(MYSQL_TIME *ltime, uint fuzzy_date)
 {
   INTERVAL interval;
 
-  if (args[0]->get_date(ltime, TIME_NO_ZERO_DATE) ||
-      get_interval_value(args[1], int_type, &value, &interval))
+  if (args[0]->get_date(ltime, TIME_NO_ZERO_DATE | TIME_FUZZY_DATE) ||
+      get_interval_value(args[1], int_type, &interval))
     return (null_value=1);
 
   if (date_sub_interval)
     interval.neg = !interval.neg;
 
-  if ((null_value= date_add_interval(ltime, int_type, interval)))
-    return 1;
-  return 0;
-}
-
-
-String *Item_date_add_interval::val_str_ascii(String *str)
-{
-  DBUG_ASSERT(fixed == 1);
-  MYSQL_TIME ltime;
-  enum date_time_format_types format;
-
-  if (Item_date_add_interval::get_date(&ltime, TIME_NO_ZERO_DATE))
-    return 0;
-
-  if (ltime.time_type == MYSQL_TIMESTAMP_DATE)
-    format= DATE_ONLY;
-  else if (ltime.second_part)
-    format= DATE_TIME_MICROSECOND;
-  else
-    format= DATE_TIME;
-
-  if (!make_datetime(format, &ltime, str))
-    return str;
-
-  null_value=1;
-  return 0;
-}
-
-
-longlong Item_date_add_interval::val_int()
-{
-  DBUG_ASSERT(fixed == 1);
-  MYSQL_TIME ltime;
-  longlong date;
-  if (Item_date_add_interval::get_date(&ltime, TIME_NO_ZERO_DATE))
-    return (longlong) 0;
-  date = (ltime.year*100L + ltime.month)*100L + ltime.day;
-  return ltime.time_type == MYSQL_TIMESTAMP_DATE ? date :
-    ((date*100L + ltime.hour)*100L+ ltime.minute)*100L + ltime.second;
+  if (date_add_interval(ltime, int_type, interval))
+    return (null_value=1);
+  return (null_value= 0);
 }
 
 
-
 bool Item_date_add_interval::eq(const Item *item, bool binary_cmp) const
 {
   Item_date_add_interval *other= (Item_date_add_interval*) item;
@@ -2412,27 +2143,12 @@ longlong Item_extract::val_int()
   uint year;
   ulong week_format;
   long neg;
-  if (date_value)
-  {
-    if (get_arg0_date(&ltime, TIME_FUZZY_DATE))
-      return 0;
-    neg=1;
-  }
-  else
-  {
-    char buf[40];
-    String value(buf, sizeof(buf), &my_charset_bin);;
-    String *res= args[0]->val_str(&value);
-    if (!res ||
-        str_to_time_with_warn(res->charset(), res->ptr(), res->length(),
-                              &ltime))
-    {
-      null_value=1;
-      return 0;
-    }
-    neg= ltime.neg ? -1 : 1;
-    null_value=0;
-  }
+  int is_time_flag = date_value ? 0 : TIME_TIME_ONLY;
+
+  if (get_arg0_date(&ltime, TIME_FUZZY_DATE | is_time_flag))
+    return 0;
+  neg= ltime.neg ? -1 : 1;
+
   switch (int_type) {
   case INTERVAL_YEAR:		return ltime.year;
   case INTERVAL_YEAR_MONTH:	return ltime.year*100L+ltime.month;
@@ -2515,12 +2231,19 @@ bool Item_char_typecast::eq(const Item *item, bool binary_cmp) const
   return 1;
 }
 
-void Item_typecast::print(String *str, enum_query_type query_type)
+void Item_temporal_typecast::print(String *str, enum_query_type query_type)
 {
+  char buf[32];
   str->append(STRING_WITH_LEN("cast("));
   args[0]->print(str, query_type);
   str->append(STRING_WITH_LEN(" as "));
   str->append(cast_type());
+  if (decimals)
+  {
+    str->append('(');
+    str->append(llstr(decimals, buf));
+    str->append(')');
+  }
   str->append(')');
 }
 
@@ -2530,13 +2253,13 @@ void Item_char_typecast::print(String *str, enum_query_type query_type)
   str->append(STRING_WITH_LEN("cast("));
   args[0]->print(str, query_type);
   str->append(STRING_WITH_LEN(" as char"));
-  if (cast_length >= 0)
+  if (cast_length != ~0U)
   {
     str->append('(');
     char buffer[20];
     // my_charset_bin is good enough for numbers
     String st(buffer, sizeof(buffer), &my_charset_bin);
-    st.set((ulonglong)cast_length, &my_charset_bin);
+    st.set(static_cast<ulonglong>(cast_length), &my_charset_bin);
     str->append(st);
     str->append(')');
   }
@@ -2554,8 +2277,8 @@ String *Item_char_typecast::val_str(String *str)
   String *res;
   uint32 length;
 
-  if (cast_length >= 0 &&
-      ((unsigned) cast_length) > current_thd->variables.max_allowed_packet)
+  if (cast_length != ~0U &&
+      cast_length > current_thd->variables.max_allowed_packet)
   {
     push_warning_printf(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
 			ER_WARN_ALLOWED_PACKET_OVERFLOWED,
@@ -2577,10 +2300,15 @@ String *Item_char_typecast::val_str(String *str)
   }
   else
   {
-    // Convert character set if differ
+    /*
+      Convert the character set if it differs.
+      from_cs is 0 when the character set of the result may vary between
+      calls, for example with dynamic columns.
+    */
     uint dummy_errors;
     if (!(res= args[0]->val_str(str)) ||
-        tmp_value.copy(res->ptr(), res->length(), from_cs,
+        tmp_value.copy(res->ptr(), res->length(),
+                       from_cs ? from_cs  : res->charset(),
                        cast_cs, &dummy_errors))
     {
       null_value= 1;
@@ -2596,7 +2324,7 @@ String *Item_char_typecast::val_str(String *str)
     and the result is longer than cast length, e.g.
     CAST('string' AS CHAR(1))
   */
-  if (cast_length >= 0)
+  if (cast_length != ~0U)
   {
     if (res->length() > (length= (uint32) res->charpos(cast_length)))
     {                                           // Safe even if const arg
@@ -2617,16 +2345,15 @@ String *Item_char_typecast::val_str(String *str)
                           err.ptr());
       res->length((uint) length);
     }
-    else if (cast_cs == &my_charset_bin && res->length() < (uint) cast_length)
+    else if (cast_cs == &my_charset_bin && res->length() < cast_length)
     {
-      if (res->alloced_length() < (uint) cast_length)
+      if (res->alloced_length() < cast_length)
       {
         str_value.alloc(cast_length);
         str_value.copy(*res);
         res= &str_value;
       }
-      bzero((char*) res->ptr() + res->length(),
-            (uint) cast_length - res->length());
+      bzero((char*) res->ptr() + res->length(), cast_length - res->length());
       res->length(cast_length);
     }
   }
@@ -2659,194 +2386,117 @@ void Item_char_typecast::fix_length_and_dec()
        and thus avoid unnecessary character set conversion.
      - If the argument is not a number, then from_cs is set to
        the argument's charset.
+     - If argument has a dynamic collation (can change from call to call)
+       we set from_cs to 0 as a marker that we have to take the collation
+       from the result string.
 
        Note (TODO): we could use repertoire technique here.
   */
-  from_cs= (args[0]->result_type() == INT_RESULT || 
-            args[0]->result_type() == DECIMAL_RESULT ||
-            args[0]->result_type() == REAL_RESULT) ?
-           (cast_cs->mbminlen == 1 ? cast_cs : &my_charset_latin1) :
-           args[0]->collation.collation;
-  charset_conversion= (cast_cs->mbmaxlen > 1) ||
+  from_cs= ((args[0]->result_type() == INT_RESULT || 
+             args[0]->result_type() == DECIMAL_RESULT ||
+             args[0]->result_type() == REAL_RESULT) ?
+            (cast_cs->mbminlen == 1 ? cast_cs : &my_charset_latin1) :
+            args[0]->dynamic_result() ? 0 :
+            args[0]->collation.collation);
+  charset_conversion= !from_cs || (cast_cs->mbmaxlen > 1) ||
                       (!my_charset_same(from_cs, cast_cs) &&
                        from_cs != &my_charset_bin &&
                        cast_cs != &my_charset_bin);
   collation.set(cast_cs, DERIVATION_IMPLICIT);
-  char_length= (cast_length >= 0) ? cast_length :
+  char_length= ((cast_length != ~0U) ? cast_length :
                 args[0]->max_length /
-                (cast_cs == &my_charset_bin ? 1 : args[0]->collation.collation->mbmaxlen);
+                (cast_cs == &my_charset_bin ? 1 :
+                 args[0]->collation.collation->mbmaxlen));
   max_length= char_length * cast_cs->mbmaxlen;
 }
 
 
-String *Item_datetime_typecast::val_str(String *str)
-{
-  DBUG_ASSERT(fixed == 1);
-  MYSQL_TIME ltime;
-
-  if (!get_arg0_date(&ltime, TIME_FUZZY_DATE) &&
-      !make_datetime(ltime.second_part ? DATE_TIME_MICROSECOND : DATE_TIME, 
-		     &ltime, str))
-    return str;
-
-  null_value=1;
-  return 0;
-}
-
-
-longlong Item_datetime_typecast::val_int()
-{
-  DBUG_ASSERT(fixed == 1);
-  MYSQL_TIME ltime;
-  if (get_arg0_date(&ltime,1))
-  {
-    null_value= 1;
-    return 0;
-  }
-
-  return TIME_to_ulonglong_datetime(&ltime);
-}
-
-
-bool Item_time_typecast::get_time(MYSQL_TIME *ltime)
+bool Item_time_typecast::get_date(MYSQL_TIME *ltime, uint fuzzy_date)
 {
-  bool res= get_arg0_time(ltime);
+  if (get_arg0_time(ltime))
+    return 1;
+  if (decimals < TIME_SECOND_PART_DIGITS)
+    ltime->second_part= sec_part_truncate(ltime->second_part, decimals);
   /*
-    For MYSQL_TIMESTAMP_TIME value we can have non-zero day part,
+    MYSQL_TIMESTAMP_TIME value can have non-zero day part,
     which we should not lose.
   */
-  if (ltime->time_type == MYSQL_TIMESTAMP_DATETIME)
+  if (ltime->time_type != MYSQL_TIMESTAMP_TIME)
     ltime->year= ltime->month= ltime->day= 0;
   ltime->time_type= MYSQL_TIMESTAMP_TIME;
-  return res;
-}
-
-
-longlong Item_time_typecast::val_int()
-{
-  MYSQL_TIME ltime;
-  if (get_time(&ltime))
-  {
-    null_value= 1;
-    return 0;
-  }
-  return (ltime.neg ? -1 : 1) *
-    (longlong) ((ltime.hour)*10000 + ltime.minute*100 + ltime.second);
-}
-
-String *Item_time_typecast::val_str(String *str)
-{
-  DBUG_ASSERT(fixed == 1);
-  MYSQL_TIME ltime;
-
-  if (!get_arg0_time(&ltime) &&
-      !make_datetime(ltime.second_part ? TIME_MICROSECOND : TIME_ONLY,
-		     &ltime, str))
-    return str;
-
-  null_value=1;
   return 0;
 }
 
 
 bool Item_date_typecast::get_date(MYSQL_TIME *ltime, uint fuzzy_date)
 {
-  bool res= get_arg0_date(ltime, TIME_FUZZY_DATE);
+  if (get_arg0_date(ltime, TIME_FUZZY_DATE))
+    return 1;
+
   ltime->hour= ltime->minute= ltime->second= ltime->second_part= 0;
   ltime->time_type= MYSQL_TIMESTAMP_DATE;
-  return res;
-}
 
-
-bool Item_date_typecast::get_time(MYSQL_TIME *ltime)
-{
-  bzero((char *)ltime, sizeof(MYSQL_TIME));
-  return args[0]->null_value;
-}
-
-
-String *Item_date_typecast::val_str(String *str)
-{
-  DBUG_ASSERT(fixed == 1);
-  MYSQL_TIME ltime;
-
-  if (!get_arg0_date(&ltime, TIME_FUZZY_DATE) &&
-      !str->alloc(MAX_DATE_STRING_REP_LENGTH))
+  int unused;
+  if (check_date(ltime, ltime->year || ltime->month || ltime->day,
+                 fuzzy_date, &unused))
   {
-    make_date((DATE_TIME_FORMAT *) 0, &ltime, str);
-    return str;
+    ErrConvTime str(ltime);
+    make_truncated_value_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+                                 &str, MYSQL_TIMESTAMP_DATE, 0);
+    return (null_value= 1);
   }
-
-  null_value=1;
-  return 0;
+  return (null_value= 0);
 }
 
-longlong Item_date_typecast::val_int()
-{
-  DBUG_ASSERT(fixed == 1);
-  MYSQL_TIME ltime;
-  if ((null_value= args[0]->get_date(&ltime, TIME_FUZZY_DATE)))
-    return 0;
-  return (longlong) (ltime.year * 10000L + ltime.month * 100 + ltime.day);
-}
-
-/**
-  MAKEDATE(a,b) is a date function that creates a date value 
-  from a year and day value.
 
-  NOTES:
-    As arguments are integers, we can't know if the year is a 2 digit or 4 digit year.
-    In this case we treat all years < 100 as 2 digit years. Ie, this is not safe
-    for dates between 0000-01-01 and 0099-12-31
-*/
-
-String *Item_func_makedate::val_str(String *str)
+bool Item_datetime_typecast::get_date(MYSQL_TIME *ltime, uint fuzzy_date)
 {
-  DBUG_ASSERT(fixed == 1);
-  MYSQL_TIME l_time;
-  long daynr=  (long) args[1]->val_int();
-  long year= (long) args[0]->val_int();
-  long days;
+  if (get_arg0_date(ltime, fuzzy_date & ~TIME_TIME_ONLY))
+    return 1;
 
-  if (args[0]->null_value || args[1]->null_value ||
-      year < 0 || year > 9999 || daynr <= 0)
-    goto err;
+  if (decimals < TIME_SECOND_PART_DIGITS)
+    ltime->second_part= sec_part_truncate(ltime->second_part, decimals);
 
-  if (year < 100)
-    year= year_2000_handling(year);
 
-  days= calc_daynr(year,1,1) + daynr - 1;
-  /* Day number from year 0 to 9999-12-31 */
-  if (days >= 0 && days <= MAX_DAY_NUMBER)
+  /*
+    ltime is valid MYSQL_TYPE_TIME (according to fuzzy_date).
+    But not every valid TIME value is a valid DATETIME value!
+  */
+  if (ltime->time_type == MYSQL_TIMESTAMP_TIME)
   {
-    null_value=0;
-    get_date_from_daynr(days,&l_time.year,&l_time.month,&l_time.day);
-    if (str->alloc(MAX_DATE_STRING_REP_LENGTH))
-      goto err;
-    make_date((DATE_TIME_FORMAT *) 0, &l_time, str);
-    return str;
+    if (ltime->neg)
+    {
+      ErrConvTime str(ltime);
+      make_truncated_value_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+                                   &str, MYSQL_TIMESTAMP_DATETIME, 0);
+      return (null_value= 1);
+    }
+    
+    uint day= ltime->hour/24;
+    ltime->hour %= 24;
+    ltime->month= day / 31;
+    ltime->day= day % 31;
   }
 
-err:
-  null_value=1;
+  ltime->time_type= MYSQL_TIMESTAMP_DATETIME;
   return 0;
 }
 
 
-/*
+/**
   MAKEDATE(a,b) is a date function that creates a date value 
   from a year and day value.
 
   NOTES:
-    As arguments are integers, we can't know if the year is a 2 digit or 4 digit year.
-    In this case we treat all years < 100 as 2 digit years. Ie, this is not safe
-    for dates between 0000-01-01 and 0099-12-31
+    As arguments are integers, we can't know if the year is a 2 digit
+    or 4 digit year.  In this case we treat all years < 100 as 2 digit
+    years. I.e., this is not safe for dates between 0000-01-01 and
+    0099-12-31
 */
 
-longlong Item_func_makedate::val_int()
+bool Item_func_makedate::get_date(MYSQL_TIME *ltime, uint fuzzy_date)
 {
   DBUG_ASSERT(fixed == 1);
-  MYSQL_TIME l_time;
   long daynr=  (long) args[1]->val_int();
   long year= (long) args[0]->val_int();
   long days;
@@ -2860,25 +2510,23 @@ longlong Item_func_makedate::val_int()
 
   days= calc_daynr(year,1,1) + daynr - 1;
   /* Day number from year 0 to 9999-12-31 */
-  if (days >= 0 && days < MAX_DAY_NUMBER)
+  if (days >= 0 && days <= MAX_DAY_NUMBER)
   {
-    null_value=0;
-    get_date_from_daynr(days,&l_time.year,&l_time.month,&l_time.day);
-    return (longlong) (l_time.year * 10000L + l_time.month * 100 + l_time.day);
+    bzero(ltime, sizeof(*ltime));
+    ltime->time_type= MYSQL_TIMESTAMP_DATE;
+    get_date_from_daynr(days, &ltime->year, &ltime->month, &ltime->day);
+    return (null_value= 0);
   }
 
 err:
-  null_value= 1;
-  return 0;
+  return (null_value= 1);
 }
 
 
 void Item_func_add_time::fix_length_and_dec()
 {
   enum_field_types arg0_field_type;
-  decimals=0;
-  fix_length_and_charset_datetime(MAX_DATETIME_FULL_WIDTH);
-  maybe_null= 1;
+  decimals= max(args[0]->decimals, args[1]->decimals);
 
   /*
     The field type for the result of an Item_func_add_time function is defined
@@ -2898,6 +2546,7 @@ void Item_func_add_time::fix_length_and_dec()
     cached_field_type= MYSQL_TYPE_DATETIME;
   else if (arg0_field_type == MYSQL_TYPE_TIME)
     cached_field_type= MYSQL_TYPE_TIME;
+  Item_temporal_func::fix_length_and_dec();
 }
 
 /**
@@ -2910,104 +2559,79 @@ void Item_func_add_time::fix_length_and_dec()
   Result: Time value or datetime value
 */
 
-MYSQL_TIME *Item_func_add_time::val_datetime(MYSQL_TIME *time,
-                                             date_time_format_types *format)
+bool Item_func_add_time::get_date(MYSQL_TIME *ltime, uint fuzzy_date)
 {
   DBUG_ASSERT(fixed == 1);
   MYSQL_TIME l_time1, l_time2;
   bool is_time= 0;
   long days, microseconds;
   longlong seconds;
-  int l_sign= sign;
+  int l_sign= sign, was_cut= 0;
+  uint dec= decimals;
 
-  null_value=0;
   if (is_date)                        // TIMESTAMP function
   {
     if (get_arg0_date(&l_time1, TIME_FUZZY_DATE) || 
         args[1]->get_time(&l_time2) ||
         l_time1.time_type == MYSQL_TIMESTAMP_TIME || 
         l_time2.time_type != MYSQL_TIMESTAMP_TIME)
-      goto null_date;
+      return (null_value= 1);
   }
   else                                // ADDTIME function
   {
     if (args[0]->get_time(&l_time1) || 
         args[1]->get_time(&l_time2) ||
         l_time2.time_type == MYSQL_TIMESTAMP_DATETIME)
-      goto null_date;
+      return (null_value= 1);
     is_time= (l_time1.time_type == MYSQL_TIMESTAMP_TIME);
   }
   if (l_time1.neg != l_time2.neg)
     l_sign= -l_sign;
   
-  bzero((char *)time, sizeof(MYSQL_TIME));
+  bzero(ltime, sizeof(*ltime));
   
-  time->neg= calc_time_diff(&l_time1, &l_time2, -l_sign,
-                            &seconds, &microseconds);
+  ltime->neg= calc_time_diff(&l_time1, &l_time2, -l_sign,
+			      &seconds, &microseconds);
 
   /*
     If first argument was negative and diff between arguments
     is non-zero we need to swap sign to get proper result.
   */
   if (l_time1.neg && (seconds || microseconds))
-    time->neg= 1 - time->neg;         // Swap sign of result
+    ltime->neg= 1-ltime->neg;         // Swap sign of result
 
-  if (!is_time && time->neg)
-    goto null_date;
+  if (!is_time && ltime->neg)
+    return (null_value= 1);
 
   days= (long)(seconds/86400L);
 
-  calc_time_from_sec(time, (long)(seconds%86400L), microseconds);
+  calc_time_from_sec(ltime, (long)(seconds%86400L), microseconds);
+
+  ltime->time_type= is_time ? MYSQL_TIMESTAMP_TIME : MYSQL_TIMESTAMP_DATETIME;
+
+  if (cached_field_type == MYSQL_TYPE_STRING &&
+      (l_time1.second_part || l_time2.second_part))
+    dec= TIME_SECOND_PART_DIGITS;
 
   if (!is_time)
   {
-    get_date_from_daynr(days, &time->year, &time->month, &time->day);
-    *format= l_time1.second_part || l_time2.second_part ?
-             DATE_TIME_MICROSECOND : DATE_TIME;
-    if (time->day)
-      return time;
-    goto null_date;
+    get_date_from_daynr(days,&ltime->year,&ltime->month,&ltime->day);
+    if (!ltime->day)
+      return (null_value= 1);
+    return (null_value= 0);
   }
-  *format= l_time1.second_part || l_time2.second_part ?
-           TIME_MICROSECOND : TIME_ONLY;
-  time->hour+= days*24;
-  return time;
-
-null_date:
-  null_value=1;
-  return 0;
-}
-
-
-String *Item_func_add_time::val_str(String *str)
-{
-  MYSQL_TIME ltime;
-  date_time_format_types format;
-
-  val_datetime(&ltime, &format);
-
-  if (null_value)
-    return 0;
-
-  if (!make_datetime_with_warn(format, &ltime, str))
-    return str;
-
-  null_value= 1;
-  return 0;
-}
-
-
-longlong Item_func_add_time::val_int()
-{
-  MYSQL_TIME ltime;
-  date_time_format_types format;
+  
+  ltime->hour+= days*24;
 
-  val_datetime(&ltime, &format);
+  MYSQL_TIME copy= *ltime;
+  ErrConvTime str(&copy);
 
-  if (null_value)
-    return 0;
+  check_time_range(ltime, decimals, &was_cut);
+  if (was_cut)
+    make_truncated_value_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+                                 &str, MYSQL_TIMESTAMP_TIME, NullS);
 
-  return TIME_to_ulonglong_datetime(&ltime);
+  return (null_value= 0);
 }
 
 
@@ -3040,19 +2664,23 @@ void Item_func_add_time::print(String *str, enum_query_type query_type)
   Result: Time value
 */
 
-String *Item_func_timediff::val_str(String *str)
+bool Item_func_timediff::get_date(MYSQL_TIME *ltime, uint fuzzy_date)
 {
   DBUG_ASSERT(fixed == 1);
   longlong seconds;
   long microseconds;
-  int l_sign= 1;
-  MYSQL_TIME l_time1 ,l_time2, l_time3;
+  int l_sign= 1, was_cut= 0;
+  MYSQL_TIME l_time1,l_time2,l_time3;
+  ErrConvTime str(&l_time3);
+
+  /* the following may be true in, for example, date_add(timediff(...), ...) */
+  if (fuzzy_date & TIME_NO_ZERO_IN_DATE)
+    return (null_value= 1);
 
-  null_value= 0;  
   if (args[0]->get_time(&l_time1) ||
       args[1]->get_time(&l_time2) ||
       l_time1.time_type != l_time2.time_type)
-    goto null_date;
+    return (null_value= 1);
 
   if (l_time1.neg != l_time2.neg)
     l_sign= -l_sign;
@@ -3070,16 +2698,27 @@ String *Item_func_timediff::val_str(String *str)
   if (l_time1.neg && (seconds || microseconds))
     l_time3.neg= 1-l_time3.neg;         // Swap sign of result
 
+  /*
+    seconds is a longlong; when cast to long it may become a small number
+    even if the original seconds value was too large and invalid.
+    As a workaround we cap seconds at a large invalid long value
+    ("invalid" means > TIME_MAX_SECOND)
+  */
+  set_if_smaller(seconds, INT_MAX32);
+
   calc_time_from_sec(&l_time3, (long) seconds, microseconds);
 
-  if (!make_datetime_with_warn(l_time1.second_part || l_time2.second_part ?
-                               TIME_MICROSECOND : TIME_ONLY,
-                               &l_time3, str))
-    return str;
+  if ((fuzzy_date & TIME_NO_ZERO_DATE) && (seconds == 0) &&
+      (microseconds == 0))
+    return (null_value= 1);
 
-null_date:
-  null_value=1;
-  return 0;
+  *ltime= l_time3;
+  check_time_range(ltime, decimals, &was_cut);
+
+  if (was_cut)
+    make_truncated_value_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+                                 &str, MYSQL_TIMESTAMP_TIME, NullS);
+  return (null_value= 0);
 }
 
 /**
@@ -3088,26 +2727,21 @@ null_date:
   Result: Time value
 */
 
-String *Item_func_maketime::val_str(String *str)
+bool Item_func_maketime::get_date(MYSQL_TIME *ltime, uint fuzzy_date)
 {
   DBUG_ASSERT(fixed == 1);
-  MYSQL_TIME ltime;
   bool overflow= 0;
 
   longlong hour=   args[0]->val_int();
   longlong minute= args[1]->val_int();
   longlong second= args[2]->val_int();
 
-  if ((null_value=(args[0]->null_value || 
-                   args[1]->null_value ||
-                   args[2]->null_value ||
-                   minute < 0 || minute > 59 ||
-                   second < 0 || second > 59 ||
-                   str->alloc(MAX_DATE_STRING_REP_LENGTH))))
-    return 0;
+  if (args[0]->null_value || args[1]->null_value || args[2]->null_value ||
+       minute < 0 || minute > 59 || second < 0 || second > 59)
+    return (null_value= 1);
 
-  bzero((char *)&ltime, sizeof(ltime));
-  ltime.neg= 0;
+  bzero(ltime, sizeof(*ltime));
+  ltime->time_type= MYSQL_TIMESTAMP_TIME;
 
   /* Check for integer overflows */
   if (hour < 0)
@@ -3115,22 +2749,22 @@ String *Item_func_maketime::val_str(String *str)
     if (args[0]->unsigned_flag)
       overflow= 1;
     else
-      ltime.neg= 1;
+      ltime->neg= 1;
   }
-  if (-hour > UINT_MAX || hour > UINT_MAX)
+  if (-hour > TIME_MAX_HOUR || hour > TIME_MAX_HOUR)
     overflow= 1;
 
   if (!overflow)
   {
-    ltime.hour=   (uint) ((hour < 0 ? -hour : hour));
-    ltime.minute= (uint) minute;
-    ltime.second= (uint) second;
+    ltime->hour=   (uint) ((hour < 0 ? -hour : hour));
+    ltime->minute= (uint) minute;
+    ltime->second= (uint) second;
   }
   else
   {
-    ltime.hour= TIME_MAX_HOUR;
-    ltime.minute= TIME_MAX_MINUTE;
-    ltime.second= TIME_MAX_SECOND;
+    ltime->hour= TIME_MAX_HOUR;
+    ltime->minute= TIME_MAX_MINUTE;
+    ltime->second= TIME_MAX_SECOND;
     char buf[28];
     char *ptr= longlong10_to_str(hour, buf, args[0]->unsigned_flag ? 10 : -10);
     int len = (int)(ptr - buf) + sprintf(ptr, ":%02u:%02u", (uint)minute, (uint)second);
@@ -3139,12 +2773,7 @@ String *Item_func_maketime::val_str(String *str)
                                  NullS);
   }
 
-  if (make_time_with_warn((DATE_TIME_FORMAT *) 0, &ltime, str))
-  {
-    null_value= 1;
-    return 0;
-  }
-  return str;
+  return (null_value= 0);
 }
 
 
@@ -3160,7 +2789,7 @@ longlong Item_func_microsecond::val_int()
 {
   DBUG_ASSERT(fixed == 1);
   MYSQL_TIME ltime;
-  if (!get_arg0_time(&ltime))
+  if (!get_arg0_date(&ltime, TIME_TIME_ONLY))
     return ltime.second_part;
   return 0;
 }
@@ -3175,8 +2804,8 @@ longlong Item_func_timestamp_diff::val_int()
   int neg= 1;
 
   null_value= 0;  
-  if (args[0]->get_date(&ltime1, TIME_NO_ZERO_DATE) ||
-      args[1]->get_date(&ltime2, TIME_NO_ZERO_DATE))
+  if (args[0]->get_date(&ltime1, TIME_NO_ZERO_DATE | TIME_NO_ZERO_IN_DATE) ||
+      args[1]->get_date(&ltime2, TIME_NO_ZERO_DATE | TIME_NO_ZERO_IN_DATE))
     goto null_date;
 
   if (calc_time_diff(&ltime2,&ltime1, 1,
@@ -3426,9 +3055,9 @@ get_date_time_result_type(const char *format, uint length)
           have all types of date-time components and can end our search.
         */
 	return DATE_TIME_MICROSECOND;
+      }
     }
   }
-  }
 
   /* We don't have all three types of date-time components */
   if (frac_second_used)
@@ -3445,42 +3074,39 @@ get_date_time_result_type(const char *format, uint length)
 
 void Item_func_str_to_date::fix_length_and_dec()
 {
-  maybe_null= 1;
-  decimals=0;
-  cached_format_type= DATE_TIME;
   cached_field_type= MYSQL_TYPE_DATETIME;
-  max_length= MAX_DATETIME_FULL_WIDTH*MY_CHARSET_BIN_MB_MAXLEN;
-  cached_timestamp_type= MYSQL_TIMESTAMP_NONE;
-  sql_mode= (current_thd->variables.sql_mode &
-             (MODE_NO_ZERO_IN_DATE | MODE_NO_ZERO_DATE));
+  decimals= NOT_FIXED_DEC;
   if ((const_item= args[1]->const_item()))
   {
     char format_buff[64];
     String format_str(format_buff, sizeof(format_buff), &my_charset_bin);
     String *format= args[1]->val_str(&format_str);
+    decimals= 0;
     if (!args[1]->null_value)
     {
-      cached_format_type= get_date_time_result_type(format->ptr(),
-                                                    format->length());
+      date_time_format_types cached_format_type=
+        get_date_time_result_type(format->ptr(), format->length());
       switch (cached_format_type) {
       case DATE_ONLY:
-        cached_timestamp_type= MYSQL_TIMESTAMP_DATE;
         cached_field_type= MYSQL_TYPE_DATE; 
-        max_length= MAX_DATE_WIDTH * MY_CHARSET_BIN_MB_MAXLEN;
         break;
-      case TIME_ONLY:
       case TIME_MICROSECOND:
-        cached_timestamp_type= MYSQL_TIMESTAMP_TIME;
+        decimals= 6;
+        /* fall through */
+      case TIME_ONLY:
         cached_field_type= MYSQL_TYPE_TIME; 
-        max_length= MAX_TIME_WIDTH * MY_CHARSET_BIN_MB_MAXLEN;
         break;
-      default:
-        cached_timestamp_type= MYSQL_TIMESTAMP_DATETIME;
+      case DATE_TIME_MICROSECOND:
+        decimals= 6;
+        /* fall through */
+      case DATE_TIME:
         cached_field_type= MYSQL_TYPE_DATETIME; 
         break;
       }
     }
   }
+  cached_timestamp_type= mysql_type_to_time_type(cached_field_type);
+  Item_temporal_func::fix_length_and_dec();
 }
 
 
@@ -3489,22 +3115,20 @@ bool Item_func_str_to_date::get_date(MYSQL_TIME *ltime, uint fuzzy_date)
   DATE_TIME_FORMAT date_time_format;
   char val_buff[64], format_buff[64];
   String val_string(val_buff, sizeof(val_buff), &my_charset_bin), *val;
-  String format_str(format_buff, sizeof(format_buff), &my_charset_bin), *format;
+  String format_str(format_buff, sizeof(format_buff), &my_charset_bin),
+    *format;
 
   val=    args[0]->val_str(&val_string);
   format= args[1]->val_str(&format_str);
   if (args[0]->null_value || args[1]->null_value)
-    goto null_date;
+    return (null_value=1);
 
-  null_value= 0;
-  bzero((char*) ltime, sizeof(*ltime));
   date_time_format.format.str=    (char*) format->ptr();
   date_time_format.format.length= format->length();
   if (extract_date_time(&date_time_format, val->ptr(), val->length(),
-			ltime, cached_timestamp_type, 0, "datetime") ||
-      ((fuzzy_date & TIME_NO_ZERO_DATE) &&
-       (ltime->year == 0 || ltime->month == 0 || ltime->day == 0)))
-    goto null_date;
+			ltime, cached_timestamp_type, 0, "datetime",
+                        fuzzy_date))
+    return (null_value=1);
   if (cached_timestamp_type == MYSQL_TIMESTAMP_TIME && ltime->day)
   {
     /*
@@ -3515,57 +3139,7 @@ bool Item_func_str_to_date::get_date(MYSQL_TIME *ltime, uint fuzzy_date)
     ltime->hour+= ltime->day*24;
     ltime->day= 0;
   }
-  return 0;
-
-null_date:
-  if (val && (fuzzy_date & TIME_NO_ZERO_DATE))
-  {
-    char buff[128];
-    strmake(buff, val->ptr(), min(val->length(), sizeof(buff)-1));
-    push_warning_printf(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-                        ER_WRONG_VALUE_FOR_TYPE, ER(ER_WRONG_VALUE_FOR_TYPE),
-                        "datetime", buff, "str_to_date");
-  }
-  return (null_value=1);
-}
-
-
-String *Item_func_str_to_date::val_str(String *str)
-{
-  DBUG_ASSERT(fixed == 1);
-  MYSQL_TIME ltime;
-
-  if (Item_func_str_to_date::get_date(&ltime, TIME_FUZZY_DATE | sql_mode))
-    return 0;
-
-  if (!make_datetime((const_item ? cached_format_type :
-		     (ltime.second_part ? DATE_TIME_MICROSECOND : DATE_TIME)),
-		     &ltime, str))
-    return str;
-  return 0;
-}
-
-
-longlong Item_func_str_to_date::val_int()
-{
-  DBUG_ASSERT(fixed == 1);
-  MYSQL_TIME ltime;
-
-  if (Item_func_str_to_date::get_date(&ltime, TIME_FUZZY_DATE | sql_mode))
-    return 0;
-
-  if (const_item)
-  {
-    switch (cached_field_type) {
-    case MYSQL_TYPE_DATE:
-      return TIME_to_ulonglong_date(&ltime);
-    case MYSQL_TYPE_TIME:
-      return TIME_to_ulonglong_time(&ltime);
-    default:
-      return TIME_to_ulonglong_datetime(&ltime);
-    }
-  }
-  return TIME_to_ulonglong_datetime(&ltime);
+  return (null_value= 0);
 }
 
 
@@ -3573,11 +3147,7 @@ bool Item_func_last_day::get_date(MYSQL_TIME *ltime, uint fuzzy_date)
 {
   if (get_arg0_date(ltime, fuzzy_date & ~TIME_FUZZY_DATE) ||
       (ltime->month == 0))
-  {
-    null_value= 1;
-    return 1;
-  }
-  null_value= 0;
+    return (null_value=1);
   uint month_idx= ltime->month-1;
   ltime->day= days_in_month[month_idx];
   if ( month_idx == 1 && calc_days_in_year(ltime->year) == 366)
@@ -3585,5 +3155,5 @@ bool Item_func_last_day::get_date(MYSQL_TIME *ltime, uint fuzzy_date)
   ltime->hour= ltime->minute= ltime->second= 0;
   ltime->second_part= 0;
   ltime->time_type= MYSQL_TIMESTAMP_DATE;
-  return 0;
+  return (null_value= 0);
 }
diff --git a/sql/item_timefunc.h b/sql/item_timefunc.h
index c6e081df182..239f7e92bba 100644
--- a/sql/item_timefunc.h
+++ b/sql/item_timefunc.h
@@ -1,7 +1,7 @@
 #ifndef ITEM_TIMEFUNC_INCLUDED
 #define ITEM_TIMEFUNC_INCLUDED
-
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2011, Oracle and/or its affiliates.
+   Copyright (c) 2009-2011, Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -30,8 +30,20 @@ enum date_time_format_types
   TIME_ONLY= 0, TIME_MICROSECOND, DATE_ONLY, DATE_TIME, DATE_TIME_MICROSECOND
 };
 
-bool get_interval_value(Item *args,interval_type int_type,
-			       String *str_value, INTERVAL *interval);
+static inline enum enum_mysql_timestamp_type
+mysql_type_to_time_type(enum enum_field_types mysql_type)
+{
+  switch(mysql_type) {
+  case MYSQL_TYPE_TIME: return MYSQL_TIMESTAMP_TIME;
+  case MYSQL_TYPE_TIMESTAMP:
+  case MYSQL_TYPE_DATETIME: return MYSQL_TIMESTAMP_DATETIME;
+  case MYSQL_TYPE_NEWDATE:
+  case MYSQL_TYPE_DATE: return MYSQL_TIMESTAMP_DATE;
+  default: return MYSQL_TIMESTAMP_ERROR;
+  }
+}
+
+bool get_interval_value(Item *args,interval_type int_type, INTERVAL *interval);
 
 class Item_func_period_add :public Item_int_func
 {
@@ -396,17 +408,39 @@ class Item_func_dayname :public Item_func_weekday
 };
 
 
-class Item_func_unix_timestamp :public Item_int_func
+class Item_func_seconds_hybrid: public Item_func_numhybrid
 {
-  String value;
 public:
-  Item_func_unix_timestamp() :Item_int_func() {}
-  Item_func_unix_timestamp(Item *a) :Item_int_func(a) {}
-  longlong val_int();
+  Item_func_seconds_hybrid() :Item_func_numhybrid() {}
+  Item_func_seconds_hybrid(Item *a) :Item_func_numhybrid(a) {}
+  void fix_num_length_and_dec()
+  {
+    if (arg_count)
+      decimals= args[0]->decimals;
+    set_if_smaller(decimals, TIME_SECOND_PART_DIGITS);
+    max_length=17 + (decimals ? decimals + 1 : 0);
+  }
+  void find_num_type() { hybrid_type= decimals ? DECIMAL_RESULT : INT_RESULT; }
+  double real_op() { DBUG_ASSERT(0); return 0; }
+  String *str_op(String *str) { DBUG_ASSERT(0); return 0; }
+};
+
+
+class Item_func_unix_timestamp :public Item_func_seconds_hybrid
+{
+  bool get_timestamp_value(my_time_t *seconds, ulong *second_part);
+public:
+  Item_func_unix_timestamp() :Item_func_seconds_hybrid() {}
+  Item_func_unix_timestamp(Item *a) :Item_func_seconds_hybrid(a) {}
   const char *func_name() const { return "unix_timestamp"; }
   enum_monotonicity_info get_monotonicity_info() const;
   longlong val_int_endpoint(bool left_endp, bool *incl_endp);
   bool check_partition_func_processor(uchar *int_arg) {return FALSE;}
+  void fix_num_length_and_dec()
+  {
+    maybe_null= false;
+    Item_func_seconds_hybrid::fix_num_length_and_dec();
+  }
   /*
     UNIX_TIMESTAMP() depends on the current timezone
     (and thus may not be used as a partitioning function)
@@ -416,11 +450,6 @@ public:
   {
     return !has_timestamp_args();
   }
-  void fix_length_and_dec()
-  {
-    decimals=0;
-    max_length=10*MY_CHARSET_BIN_MB_MAXLEN;
-  }
   bool check_vcol_func_processor(uchar *int_arg) 
   {
     /*
@@ -429,20 +458,20 @@ public:
     */
     return trace_unsupported_by_check_vcol_func_processor(func_name());
   }
+  longlong int_op();
+  my_decimal *decimal_op(my_decimal* buf);
 };
 
 
-class Item_func_time_to_sec :public Item_int_func
+class Item_func_time_to_sec :public Item_func_seconds_hybrid
 {
 public:
-  Item_func_time_to_sec(Item *item) :Item_int_func(item) {}
-  longlong val_int();
+  Item_func_time_to_sec(Item *item) :Item_func_seconds_hybrid(item) {}
   const char *func_name() const { return "time_to_sec"; }
-  void fix_length_and_dec()
+  void fix_num_length_and_dec()
   {
-    maybe_null= TRUE;
-    decimals=0;
-    max_length=10*MY_CHARSET_BIN_MB_MAXLEN;
+    maybe_null= true;
+    Item_func_seconds_hybrid::fix_num_length_and_dec();
   }
   bool check_partition_func_processor(uchar *int_arg) {return FALSE;}
   bool check_vcol_func_processor(uchar *int_arg) { return FALSE;}
@@ -450,128 +479,77 @@ public:
   {
     return !has_time_args();
   }
+  longlong int_op();
+  my_decimal *decimal_op(my_decimal* buf);
 };
 
 
-/*
-  This can't be a Item_str_func, because the val_real() functions are special
-*/
-
-class Item_date :public Item_func
+class Item_temporal_func: public Item_func
 {
+  ulonglong sql_mode;
 public:
-  Item_date() :Item_func() {}
-  Item_date(Item *a) :Item_func(a) {}
+  Item_temporal_func() :Item_func() {}
+  Item_temporal_func(Item *a) :Item_func(a) {}
+  Item_temporal_func(Item *a, Item *b) :Item_func(a,b) {}
+  Item_temporal_func(Item *a, Item *b, Item *c) :Item_func(a,b,c) {}
   enum Item_result result_type () const { return STRING_RESULT; }
-  enum_field_types field_type() const { return MYSQL_TYPE_DATE; }
   CHARSET_INFO *charset_for_protocol(void) const { return &my_charset_bin; }
+  enum_field_types field_type() const { return MYSQL_TYPE_DATETIME; }
   String *val_str(String *str);
   longlong val_int();
-  double val_real() { return val_real_from_decimal(); }
-  const char *func_name() const { return "date"; }
-  void fix_length_and_dec()
-  { 
-    decimals= 0;
-    fix_length_and_charset_datetime(MAX_DATE_WIDTH);
-  }
-  Field *tmp_table_field(TABLE *table)
-  {
-    return tmp_table_field_from_field_type(table, 0);
-  }
-  bool result_as_longlong() { return TRUE; }
+  double val_real();
+  bool get_date(MYSQL_TIME *res, uint fuzzy_date) { DBUG_ASSERT(0); return 1; }
   my_decimal *val_decimal(my_decimal *decimal_value)
-  {
-    DBUG_ASSERT(fixed == 1);
-    return  val_decimal_from_date(decimal_value);
-  }
+  { return  val_decimal_from_date(decimal_value); }
+  Field *tmp_table_field(TABLE *table)
+  { return tmp_table_field_from_field_type(table, 0); }
   int save_in_field(Field *field, bool no_conversions)
-  {
-    return save_date_in_field(field);
-  }
+  { return save_date_in_field(field); }
+  void fix_length_and_dec();
 };
 
 
-class Item_date_func :public Item_str_func
+class Item_datefunc :public Item_temporal_func
 {
 public:
-  Item_date_func() :Item_str_func() {}
-  Item_date_func(Item *a) :Item_str_func(a) {}
-  Item_date_func(Item *a,Item *b) :Item_str_func(a,b) {}
-  Item_date_func(Item *a,Item *b, Item *c) :Item_str_func(a,b,c) {}
-  enum_field_types field_type() const { return MYSQL_TYPE_DATETIME; }
-  CHARSET_INFO *charset_for_protocol(void) const { return &my_charset_bin; }
-  Field *tmp_table_field(TABLE *table)
-  {
-    return tmp_table_field_from_field_type(table, 0);
-  }
-  bool result_as_longlong() { return TRUE; }
-  double val_real() { return (double) val_int(); }
-  my_decimal *val_decimal(my_decimal *decimal_value)
-  {
-    DBUG_ASSERT(fixed == 1);
-    return  val_decimal_from_date(decimal_value);
-  }
-  int save_in_field(Field *field, bool no_conversions)
-  {
-    return save_date_in_field(field);
-  }
+  Item_datefunc() :Item_temporal_func() { }
+  Item_datefunc(Item *a) :Item_temporal_func(a) { }
+  enum_field_types field_type() const { return MYSQL_TYPE_DATE; }
 };
 
 
-class Item_str_timefunc :public Item_str_func
+class Item_timefunc :public Item_temporal_func
 {
 public:
-  Item_str_timefunc() :Item_str_func() {}
-  Item_str_timefunc(Item *a) :Item_str_func(a) {}
-  Item_str_timefunc(Item *a,Item *b) :Item_str_func(a,b) {}
-  Item_str_timefunc(Item *a, Item *b, Item *c) :Item_str_func(a, b ,c) {}
+  Item_timefunc() :Item_temporal_func() {}
+  Item_timefunc(Item *a) :Item_temporal_func(a) {}
+  Item_timefunc(Item *a,Item *b) :Item_temporal_func(a,b) {}
+  Item_timefunc(Item *a, Item *b, Item *c) :Item_temporal_func(a, b ,c) {}
   enum_field_types field_type() const { return MYSQL_TYPE_TIME; }
-  CHARSET_INFO *charset_for_protocol(void) const { return &my_charset_bin; }
-  void fix_length_and_dec()
-  {
-    decimals= DATETIME_DEC;
-    fix_length_and_charset_datetime(MAX_TIME_WIDTH);
-  }
-  Field *tmp_table_field(TABLE *table)
-  {
-    return tmp_table_field_from_field_type(table, 0);
-  }
-  double val_real() { return val_real_from_decimal(); }
-  my_decimal *val_decimal(my_decimal *decimal_value)
-  {
-    DBUG_ASSERT(fixed == 1);
-    return  val_decimal_from_time(decimal_value);
-  }
-  int save_in_field(Field *field, bool no_conversions)
-  {
-    return save_time_in_field(field);
-  }
-  longlong val_int() { return val_int_from_decimal(); }
-  bool result_as_longlong() { return TRUE; }
 };
 
 
 /* Abstract CURTIME function. Children should define what time zone is used */
 
-class Item_func_curtime :public Item_str_timefunc
+class Item_func_curtime :public Item_timefunc
 {
-  longlong value;
-  char buff[9*2+32];
-  uint buff_length;
+  MYSQL_TIME ltime;
 public:
-  Item_func_curtime() :Item_str_timefunc() {}
-  Item_func_curtime(Item *a) :Item_str_timefunc(a) {}
-  double val_real() { DBUG_ASSERT(fixed == 1); return (double) value; }
-  longlong val_int() { DBUG_ASSERT(fixed == 1); return value; }
-  String *val_str(String *str);
-  void fix_length_and_dec();
+  Item_func_curtime(uint dec) :Item_timefunc() { decimals= dec; }
+  bool fix_fields(THD *, Item **);
+  void fix_length_and_dec()
+  {
+    store_now_in_TIME(&ltime);
+    Item_timefunc::fix_length_and_dec();
+    maybe_null= false;
+  }
+  bool get_date(MYSQL_TIME *res, uint fuzzy_date);
   /* 
     Abstract method that defines which time zone is used for conversion.
     Converts time current time in my_time_t representation to broken-down
     MYSQL_TIME representation using UTC-SYSTEM or per-thread time zone.
   */
   virtual void store_now_in_TIME(MYSQL_TIME *now_time)=0;
-  bool result_as_longlong() { return TRUE; }
   bool check_vcol_func_processor(uchar *int_arg) 
   {
     return trace_unsupported_by_check_vcol_func_processor(func_name());
@@ -582,8 +560,7 @@ public:
 class Item_func_curtime_local :public Item_func_curtime
 {
 public:
-  Item_func_curtime_local() :Item_func_curtime() {}
-  Item_func_curtime_local(Item *a) :Item_func_curtime(a) {}
+  Item_func_curtime_local(uint dec) :Item_func_curtime(dec) {}
   const char *func_name() const { return "curtime"; }
   virtual void store_now_in_TIME(MYSQL_TIME *now_time);
 };
@@ -592,8 +569,7 @@ public:
 class Item_func_curtime_utc :public Item_func_curtime
 {
 public:
-  Item_func_curtime_utc() :Item_func_curtime() {}
-  Item_func_curtime_utc(Item *a) :Item_func_curtime(a) {}
+  Item_func_curtime_utc(uint dec) :Item_func_curtime(dec) {}
   const char *func_name() const { return "utc_time"; }
   virtual void store_now_in_TIME(MYSQL_TIME *now_time);
 };
@@ -601,14 +577,11 @@ public:
 
 /* Abstract CURDATE function. See also Item_func_curtime. */
 
-class Item_func_curdate :public Item_date
+class Item_func_curdate :public Item_datefunc
 {
-  longlong value;
   MYSQL_TIME ltime;
 public:
-  Item_func_curdate() :Item_date() {}
-  longlong val_int() { DBUG_ASSERT(fixed == 1); return (value) ; }
-  String *val_str(String *str);
+  Item_func_curdate() :Item_datefunc() {}
   void fix_length_and_dec();
   bool get_date(MYSQL_TIME *res, uint fuzzy_date);
   virtual void store_now_in_TIME(MYSQL_TIME *now_time)=0;
@@ -639,21 +612,19 @@ public:
 
 /* Abstract CURRENT_TIMESTAMP function. See also Item_func_curtime */
 
-class Item_func_now :public Item_date_func
+
+class Item_func_now :public Item_temporal_func
 {
-protected:
-  longlong value;
-  char buff[20*2+32];	// +32 to make my_snprintf_{8bit|ucs2} happy
-  uint buff_length;
   MYSQL_TIME ltime;
 public:
-  Item_func_now() :Item_date_func() {}
-  Item_func_now(Item *a) :Item_date_func(a) {}
-  enum Item_result result_type () const { return STRING_RESULT; }
-  longlong val_int() { DBUG_ASSERT(fixed == 1); return value; }
-  int save_in_field(Field *to, bool no_conversions);
-  String *val_str(String *str);
-  void fix_length_and_dec();
+  Item_func_now(uint dec) :Item_temporal_func() { decimals= dec; }
+  bool fix_fields(THD *, Item **);
+  void fix_length_and_dec()
+  {
+    store_now_in_TIME(&ltime);
+    Item_temporal_func::fix_length_and_dec();
+    maybe_null= false;
+  }
   bool get_date(MYSQL_TIME *res, uint fuzzy_date);
   virtual void store_now_in_TIME(MYSQL_TIME *now_time)=0;
   bool check_vcol_func_processor(uchar *int_arg) 
@@ -666,8 +637,7 @@ public:
 class Item_func_now_local :public Item_func_now
 {
 public:
-  Item_func_now_local() :Item_func_now() {}
-  Item_func_now_local(Item *a) :Item_func_now(a) {}
+  Item_func_now_local(uint dec) :Item_func_now(dec) {}
   const char *func_name() const { return "now"; }
   virtual void store_now_in_TIME(MYSQL_TIME *now_time);
   virtual enum Functype functype() const { return NOW_FUNC; }
@@ -677,8 +647,7 @@ public:
 class Item_func_now_utc :public Item_func_now
 {
 public:
-  Item_func_now_utc() :Item_func_now() {}
-  Item_func_now_utc(Item *a) :Item_func_now(a) {}
+  Item_func_now_utc(uint dec) :Item_func_now(dec) {}
   const char *func_name() const { return "utc_timestamp"; }
   virtual void store_now_in_TIME(MYSQL_TIME *now_time);
 };
@@ -691,29 +660,24 @@ public:
 class Item_func_sysdate_local :public Item_func_now
 {
 public:
-  Item_func_sysdate_local() :Item_func_now() {}
-  Item_func_sysdate_local(Item *a) :Item_func_now(a) {}
+  Item_func_sysdate_local(uint dec) :Item_func_now(dec) {}
   bool const_item() const { return 0; }
   const char *func_name() const { return "sysdate"; }
   void store_now_in_TIME(MYSQL_TIME *now_time);
-  double val_real();
-  longlong val_int();
-  int save_in_field(Field *to, bool no_conversions);
-  String *val_str(String *str);
-  void fix_length_and_dec();
   bool get_date(MYSQL_TIME *res, uint fuzzy_date);
   void update_used_tables()
   {
     Item_func_now::update_used_tables();
+    maybe_null= false;
     used_tables_cache|= RAND_TABLE_BIT;
   }
 };
 
 
-class Item_func_from_days :public Item_date
+class Item_func_from_days :public Item_datefunc
 {
 public:
-  Item_func_from_days(Item *a) :Item_date(a) {}
+  Item_func_from_days(Item *a) :Item_datefunc(a) {}
   const char *func_name() const { return "from_days"; }
   bool get_date(MYSQL_TIME *res, uint fuzzy_date);
   bool check_partition_func_processor(uchar *int_arg) {return FALSE;}
@@ -742,13 +706,11 @@ public:
 };
 
 
-class Item_func_from_unixtime :public Item_date_func
+class Item_func_from_unixtime :public Item_temporal_func
 {
   THD *thd;
  public:
-  Item_func_from_unixtime(Item *a) :Item_date_func(a) {}
-  longlong val_int();
-  String *val_str(String *str);
+  Item_func_from_unixtime(Item *a) :Item_temporal_func(a) {}
   const char *func_name() const { return "from_unixtime"; }
   void fix_length_and_dec();
   bool get_date(MYSQL_TIME *res, uint fuzzy_date);
@@ -769,7 +731,7 @@ class Time_zone;
   tables can be used during this function calculation for loading time zone
   descriptions.
 */
-class Item_func_convert_tz :public Item_date_func
+class Item_func_convert_tz :public Item_temporal_func
 {
   /*
     If time zone parameters are constants we are caching objects that
@@ -781,9 +743,7 @@ class Item_func_convert_tz :public Item_date_func
   Time_zone *from_tz, *to_tz;
  public:
   Item_func_convert_tz(Item *a, Item *b, Item *c):
-    Item_date_func(a, b, c), from_tz_cached(0), to_tz_cached(0) {}
-  longlong val_int();
-  String *val_str(String *str);
+    Item_temporal_func(a, b, c), from_tz_cached(0), to_tz_cached(0) {}
   const char *func_name() const { return "convert_tz"; }
   void fix_length_and_dec();
   bool get_date(MYSQL_TIME *res, uint fuzzy_date);
@@ -791,61 +751,34 @@ class Item_func_convert_tz :public Item_date_func
 };
 
 
-class Item_func_sec_to_time :public Item_str_timefunc
+class Item_func_sec_to_time :public Item_timefunc
 {
 public:
-  Item_func_sec_to_time(Item *item) :Item_str_timefunc(item) {}
-  double val_real()
-  {
-    DBUG_ASSERT(fixed == 1);
-    return (double) Item_func_sec_to_time::val_int();
-  }
-  longlong val_int();
-  String *val_str(String *);
+  Item_func_sec_to_time(Item *item) :Item_timefunc(item) {}
+  bool get_date(MYSQL_TIME *res, uint fuzzy_date);
   void fix_length_and_dec()
-  { 
-    Item_str_timefunc::fix_length_and_dec();
-    maybe_null=1;
+  {
+    decimals= args[0]->decimals;
+    Item_timefunc::fix_length_and_dec();
   }
   const char *func_name() const { return "sec_to_time"; }
-  bool result_as_longlong() { return TRUE; }
 };
 
 
-class Item_date_add_interval :public Item_date_func
+class Item_date_add_interval :public Item_temporal_func
 {
-  String value;
   enum_field_types cached_field_type;
-  String ascii_buf;
 public:
   const interval_type int_type; // keep it public
   const bool date_sub_interval; // keep it public
   Item_date_add_interval(Item *a,Item *b,interval_type type_arg,bool neg_arg)
-    :Item_date_func(a,b),int_type(type_arg), date_sub_interval(neg_arg) {}
-  String *val_str_ascii(String *str);
-  String *val_str(String *str)
-  {
-    return val_str_from_val_str_ascii(str, &ascii_buf);
-  }
+    :Item_temporal_func(a,b),int_type(type_arg), date_sub_interval(neg_arg) {}
   const char *func_name() const { return "date_add_interval"; }
   void fix_length_and_dec();
   enum_field_types field_type() const { return cached_field_type; }
-  CHARSET_INFO *charset_for_protocol(void) const
-  {
-    /*
-      DATE_ADD() can return DATE, DATETIME or VARCHAR depending on arguments.
-      Send using "binary" when DATE or DATETIME,
-      or using collation.collation when VARCHAR
-      (which was fixed from @collation_connection in fix_length_and_dec).
-    */
-    DBUG_ASSERT(fixed == 1);
-    return cached_field_type == MYSQL_TYPE_STRING ?
-                                collation.collation : &my_charset_bin;
-  }
-  longlong val_int();
   bool get_date(MYSQL_TIME *res, uint fuzzy_date);
   bool eq(const Item *item, bool binary_cmp) const;
-  virtual void print(String *str, enum_query_type query_type);
+  void print(String *str, enum_query_type query_type);
 };
 
 
@@ -861,7 +794,7 @@ class Item_extract :public Item_int_func
   const char *func_name() const { return "extract"; }
   void fix_length_and_dec();
   bool eq(const Item *item, bool binary_cmp) const;
-  virtual void print(String *str, enum_query_type query_type);
+  void print(String *str, enum_query_type query_type);
   bool check_partition_func_processor(uchar *int_arg) {return FALSE;}
   bool check_vcol_func_processor(uchar *int_arg) { return FALSE;}
   bool check_valid_arguments_processor(uchar *int_arg)
@@ -903,169 +836,84 @@ class Item_extract :public Item_int_func
 };
 
 
-class Item_typecast :public Item_str_func
-{
-public:
-  Item_typecast(Item *a) :Item_str_func(a) {}
-  String *val_str(String *a)
-  {
-    DBUG_ASSERT(fixed == 1);
-    String *tmp=args[0]->val_str(a);
-    null_value=args[0]->null_value;
-    if (tmp)
-      tmp->set_charset(collation.collation);
-    return tmp;
-  }
-  void fix_length_and_dec()
-  {
-    collation.set(&my_charset_bin);
-    max_length=args[0]->max_length;
-  }
-  virtual const char* cast_type() const= 0;
-  virtual void print(String *str, enum_query_type query_type);
-};
-
-
-class Item_typecast_maybe_null :public Item_typecast
+class Item_char_typecast :public Item_str_func
 {
-public:
-  Item_typecast_maybe_null(Item *a) :Item_typecast(a) { maybe_null= 1; }
-};
-
-
-class Item_char_typecast :public Item_typecast
-{
-  int cast_length;
+  uint cast_length;
   CHARSET_INFO *cast_cs, *from_cs;
   bool charset_conversion;
   String tmp_value;
 public:
-  Item_char_typecast(Item *a, int length_arg, CHARSET_INFO *cs_arg)
-    :Item_typecast(a), cast_length(length_arg), cast_cs(cs_arg) {}
+  Item_char_typecast(Item *a, uint length_arg, CHARSET_INFO *cs_arg)
+    :Item_str_func(a), cast_length(length_arg), cast_cs(cs_arg) {}
   enum Functype functype() const { return CHAR_TYPECAST_FUNC; }
   bool eq(const Item *item, bool binary_cmp) const;
   const char *func_name() const { return "cast_as_char"; }
-  const char* cast_type() const { return "char"; };
   String *val_str(String *a);
   void fix_length_and_dec();
-  virtual void print(String *str, enum_query_type query_type);
+  void print(String *str, enum_query_type query_type);
 };
 
 
-class Item_date_typecast :public Item_typecast_maybe_null
+class Item_temporal_typecast: public Item_temporal_func
+{
+public:
+  Item_temporal_typecast(Item *a) :Item_temporal_func(a) {}
+  virtual const char *cast_type() const = 0;
+  void print(String *str, enum_query_type query_type);
+  void fix_length_and_dec()
+  {
+    if (decimals == NOT_FIXED_DEC)
+      decimals= args[0]->decimals;
+    Item_temporal_func::fix_length_and_dec();
+  }
+};
+
+class Item_date_typecast :public Item_temporal_typecast
 {
 public:
-  Item_date_typecast(Item *a) :Item_typecast_maybe_null(a) {}
+  Item_date_typecast(Item *a) :Item_temporal_typecast(a) {}
   const char *func_name() const { return "cast_as_date"; }
-  String *val_str(String *str);
   bool get_date(MYSQL_TIME *ltime, uint fuzzy_date);
-  bool get_time(MYSQL_TIME *ltime);
   const char *cast_type() const { return "date"; }
   enum_field_types field_type() const { return MYSQL_TYPE_DATE; }
-  CHARSET_INFO *charset_for_protocol(void) const { return &my_charset_bin; }
-  Field *tmp_table_field(TABLE *table)
-  {
-    return tmp_table_field_from_field_type(table, 0);
-  }
-  void fix_length_and_dec() { fix_length_and_charset_datetime(10); }
-  bool result_as_longlong() { return TRUE; }
-  longlong val_int();
-  double val_real() { return (double) val_int(); }
-  my_decimal *val_decimal(my_decimal *decimal_value)
-  {
-    DBUG_ASSERT(fixed == 1);
-    return  val_decimal_from_date(decimal_value);
-  }
-  int save_in_field(Field *field, bool no_conversions)
-  {
-    return save_date_in_field(field);
-  }
 };
 
 
-class Item_time_typecast :public Item_typecast_maybe_null
+class Item_time_typecast :public Item_temporal_typecast
 {
 public:
-  Item_time_typecast(Item *a) :Item_typecast_maybe_null(a) {}
+  Item_time_typecast(Item *a, uint dec_arg)
+    :Item_temporal_typecast(a) { decimals= dec_arg; }
   const char *func_name() const { return "cast_as_time"; }
-  String *val_str(String *str);
-  bool get_time(MYSQL_TIME *ltime);
+  bool get_date(MYSQL_TIME *ltime, uint fuzzy_date);
   const char *cast_type() const { return "time"; }
   enum_field_types field_type() const { return MYSQL_TYPE_TIME; }
-  CHARSET_INFO *charset_for_protocol(void) const { return &my_charset_bin; }
-  Field *tmp_table_field(TABLE *table)
-  {
-    return tmp_table_field_from_field_type(table, 0);
-  }
-  bool result_as_longlong() { return TRUE; }
-  longlong val_int();
-  double val_real() { return val_real_from_decimal(); }
-  my_decimal *val_decimal(my_decimal *decimal_value)
-  {
-    DBUG_ASSERT(fixed == 1);
-    return  val_decimal_from_time(decimal_value);
-  }
-  int save_in_field(Field *field, bool no_conversions)
-  {
-    return save_time_in_field(field);
-  }
-  void fix_length_and_dec()
-  { fix_length_and_charset_datetime(args[0]->max_char_length()); }
 };
 
 
-class Item_datetime_typecast :public Item_typecast_maybe_null
+class Item_datetime_typecast :public Item_temporal_typecast
 {
 public:
-  Item_datetime_typecast(Item *a) :Item_typecast_maybe_null(a) {}
+  Item_datetime_typecast(Item *a, uint dec_arg)
+    :Item_temporal_typecast(a) { decimals= dec_arg; }
   const char *func_name() const { return "cast_as_datetime"; }
-  String *val_str(String *str);
   const char *cast_type() const { return "datetime"; }
   enum_field_types field_type() const { return MYSQL_TYPE_DATETIME; }
-  CHARSET_INFO *charset_for_protocol(void) const { return &my_charset_bin; }
-  Field *tmp_table_field(TABLE *table)
-  {
-    return tmp_table_field_from_field_type(table, 0);
-  }
-  void fix_length_and_dec()
-  {
-    fix_length_and_charset_datetime(MAX_DATETIME_FULL_WIDTH);
-    decimals= DATETIME_DEC;
-  }
-  bool result_as_longlong() { return TRUE; }
-  longlong val_int();
-  double val_real() { return val_real_from_decimal(); }
-  double val() { return (double) val_int(); }
-  my_decimal *val_decimal(my_decimal *decimal_value)
-  {
-    DBUG_ASSERT(fixed == 1);
-    return  val_decimal_from_date(decimal_value);
-  }
-  int save_in_field(Field *field, bool no_conversions)
-  {
-    return save_date_in_field(field);
-  }
+  bool get_date(MYSQL_TIME *ltime, uint fuzzy_date);
 };
 
-class Item_func_makedate :public Item_date_func
+
+class Item_func_makedate :public Item_temporal_func
 {
 public:
-  Item_func_makedate(Item *a,Item *b) :Item_date_func(a,b) {}
-  String *val_str(String *str);
+  Item_func_makedate(Item *a,Item *b) :Item_temporal_func(a,b) {}
   const char *func_name() const { return "makedate"; }
   enum_field_types field_type() const { return MYSQL_TYPE_DATE; }
-  CHARSET_INFO *charset_for_protocol(void) const { return &my_charset_bin; }
-  void fix_length_and_dec()
-  { 
-    decimals=0;
-    fix_length_and_charset_datetime(MAX_DATE_WIDTH);
-    maybe_null= 1;
-  }
-  longlong val_int();
+  bool get_date(MYSQL_TIME *ltime, uint fuzzy_date);
 };
 
 
-class Item_func_add_time :public Item_str_func
+class Item_func_add_time :public Item_temporal_func
 {
   const bool is_date;
   int sign;
@@ -1073,66 +921,39 @@ class Item_func_add_time :public Item_str_func
 
 public:
   Item_func_add_time(Item *a, Item *b, bool type_arg, bool neg_arg)
-    :Item_str_func(a, b), is_date(type_arg) { sign= neg_arg ? -1 : 1; }
-  String *val_str(String *str);
+    :Item_temporal_func(a, b), is_date(type_arg) { sign= neg_arg ? -1 : 1; }
   enum_field_types field_type() const { return cached_field_type; }
   void fix_length_and_dec();
-  CHARSET_INFO *charset_for_protocol(void) const { return &my_charset_bin; }
-
-  Field *tmp_table_field(TABLE *table)
-  {
-    return tmp_table_field_from_field_type(table, 0);
-  }
-  virtual void print(String *str, enum_query_type query_type);
+  bool get_date(MYSQL_TIME *ltime, uint fuzzy_date);
+  void print(String *str, enum_query_type query_type);
   const char *func_name() const { return "add_time"; }
-  double val_real() { return val_real_from_decimal(); }
-  my_decimal *val_decimal(my_decimal *decimal_value)
-  {
-    DBUG_ASSERT(fixed == 1);
-    if (cached_field_type == MYSQL_TYPE_TIME)
-      return  val_decimal_from_time(decimal_value);
-    if (cached_field_type == MYSQL_TYPE_DATETIME)
-      return  val_decimal_from_date(decimal_value);
-    return Item_str_func::val_decimal(decimal_value);
-  }
-  int save_in_field(Field *field, bool no_conversions)
-  {
-    if (cached_field_type == MYSQL_TYPE_TIME)
-      return save_time_in_field(field);
-    if (cached_field_type == MYSQL_TYPE_DATETIME)
-      return save_date_in_field(field);
-    return Item_str_func::save_in_field(field, no_conversions);
-  }
-  longlong val_int();
-  MYSQL_TIME *val_datetime(MYSQL_TIME *time, date_time_format_types *format);
 };
 
-class Item_func_timediff :public Item_str_timefunc
+class Item_func_timediff :public Item_timefunc
 {
 public:
   Item_func_timediff(Item *a, Item *b)
-    :Item_str_timefunc(a, b) {}
-  String *val_str(String *str);
+    :Item_timefunc(a, b) {}
   const char *func_name() const { return "timediff"; }
   void fix_length_and_dec()
   {
-    Item_str_timefunc::fix_length_and_dec();
-    maybe_null= 1;
+    decimals= max(args[0]->decimals, args[1]->decimals);
+    Item_timefunc::fix_length_and_dec();
   }
+  bool get_date(MYSQL_TIME *ltime, uint fuzzy_date);
 };
 
-class Item_func_maketime :public Item_str_timefunc
+class Item_func_maketime :public Item_timefunc
 {
 public:
   Item_func_maketime(Item *a, Item *b, Item *c)
-    :Item_str_timefunc(a, b, c) 
-  {
-    maybe_null= TRUE;
-  }
-  String *val_str(String *str);
+    :Item_timefunc(a, b, c) 
+  {}
   const char *func_name() const { return "maketime"; }
+  bool get_date(MYSQL_TIME *ltime, uint fuzzy_date);
 };
 
+
 class Item_func_microsecond :public Item_int_func
 {
 public:
@@ -1194,42 +1015,28 @@ public:
 };
 
 
-class Item_func_str_to_date :public Item_str_func
+class Item_func_str_to_date :public Item_temporal_func
 {
   enum_field_types cached_field_type;
-  date_time_format_types cached_format_type;
   timestamp_type cached_timestamp_type;
   bool const_item;
-  ulonglong sql_mode;
 public:
   Item_func_str_to_date(Item *a, Item *b)
-    :Item_str_func(a, b), const_item(false)
+    :Item_temporal_func(a, b), const_item(false)
   {}
-  String *val_str(String *str);
   bool get_date(MYSQL_TIME *ltime, uint fuzzy_date);
   const char *func_name() const { return "str_to_date"; }
   enum_field_types field_type() const { return cached_field_type; }
   void fix_length_and_dec();
-  Field *tmp_table_field(TABLE *table)
-  {
-    return tmp_table_field_from_field_type(table, 1);
-  }
-  longlong val_int();
-  bool result_as_longlong() { return TRUE; }
 };
 
 
-class Item_func_last_day :public Item_date
+class Item_func_last_day :public Item_datefunc
 {
 public:
-  Item_func_last_day(Item *a) :Item_date(a) {}
+  Item_func_last_day(Item *a) :Item_datefunc(a) {}
   const char *func_name() const { return "last_day"; }
   bool get_date(MYSQL_TIME *res, uint fuzzy_date);
-  void fix_length_and_dec()
-  { 
-    Item_date::fix_length_and_dec();
-    maybe_null= 1;
-  }
 };
 
 
diff --git a/sql/key.cc b/sql/key.cc
index 819b79879e8..02ef37ec1e5 100644
--- a/sql/key.cc
+++ b/sql/key.cc
@@ -106,30 +106,46 @@ int find_ref_key(KEY *key, uint key_count, uchar *record, Field *field,
   @param from_record full record to be copied from
   @param key_info    descriptor of the index
   @param key_length  specifies length of all keyparts that will be copied
+  @param with_zerofill  skipped bytes in the key buffer to be filled with 0
 */
 
 void key_copy(uchar *to_key, uchar *from_record, KEY *key_info,
-              uint key_length)
+              uint key_length, bool with_zerofill)
 {
   uint length;
   KEY_PART_INFO *key_part;
 
   if (key_length == 0)
     key_length= key_info->key_length;
-  for (key_part= key_info->key_part; (int) key_length > 0; key_part++)
+  for (key_part= key_info->key_part;
+       (int) key_length > 0;
+       key_part++, to_key+= length, key_length-= length)
   {
     if (key_part->null_bit)
     {
       *to_key++= test(from_record[key_part->null_offset] &
 		   key_part->null_bit);
       key_length--;
+      if (to_key[-1])
+      {
+        /*
+          Don't copy data for null values
+          The -1 below is to subtract the null byte which is already handled
+        */
+        length= min(key_length, (uint) key_part->store_length-1);
+        if (with_zerofill)
+          bzero((char*) to_key, length);
+        continue;
+      }
     }
     if (key_part->key_part_flag & HA_BLOB_PART ||
         key_part->key_part_flag & HA_VAR_LENGTH_PART)
     {
       key_length-= HA_KEY_BLOB_LENGTH;
       length= min(key_length, key_part->length);
-      key_part->field->get_key_image(to_key, length, Field::itRAW);
+      uint bytes= key_part->field->get_key_image(to_key, length, Field::itRAW);
+      if (with_zerofill && bytes < length)
+        bzero((char*) to_key + bytes, length - bytes);
       to_key+= HA_KEY_BLOB_LENGTH;
     }
     else
@@ -141,8 +157,6 @@ void key_copy(uchar *to_key, uchar *from_record, KEY *key_info,
       if (bytes < length)
         cs->cset->fill(cs, (char*) to_key + bytes, length - bytes, ' ');
     }
-    to_key+= length;
-    key_length-= length;
   }
 }
 
@@ -169,16 +183,28 @@ void key_restore(uchar *to_record, uchar *from_key, KEY *key_info,
   {
     key_length= key_info->key_length;
   }
-  for (key_part= key_info->key_part ; (int) key_length > 0 ; key_part++)
+  for (key_part= key_info->key_part ;
+       (int) key_length > 0 ;
+       key_part++, from_key+= length, key_length-= length)
   {
     uchar used_uneven_bits= 0;
     if (key_part->null_bit)
     {
-      if (*from_key++)
+      bool null_value; 
+      if ((null_value= *from_key++))
 	to_record[key_part->null_offset]|= key_part->null_bit;
       else
 	to_record[key_part->null_offset]&= ~key_part->null_bit;
       key_length--;
+      if (null_value)
+      {
+        /*
+          Don't copy data for null values
+          The -1 below is to subtract the null byte which is already handled
+        */
+        length= min(key_length, (uint) key_part->store_length-1);
+        continue;
+      }
     }
     if (key_part->type == HA_KEYTYPE_BIT)
     {
@@ -232,8 +258,6 @@ void key_restore(uchar *to_record, uchar *from_key, KEY *key_info,
       memcpy(to_record + key_part->offset, from_key + used_uneven_bits
              , (size_t) length - used_uneven_bits);
     }
-    from_key+= length;
-    key_length-= length;
   }
 }
 
@@ -578,3 +602,287 @@ next_loop:
   } while (key_info); /* no more keys to test */
   DBUG_RETURN(0);
 }
+
+
+/*
+  Compare two key tuples.
+
+  @brief
+    Compare two key tuples, i.e. two key values in KeyTupleFormat.
+
+  @param part          KEY_PART_INFO with key description
+  @param key1          First key to compare
+  @param key2          Second key to compare 
+  @param tuple_length  Length of key1 (and key2, they are the same) in bytes.
+
+  @return
+    @retval  0  key1 == key2
+    @retval -1  key1 < key2
+    @retval +1  key1 > key2 
+*/
+
+int key_tuple_cmp(KEY_PART_INFO *part, uchar *key1, uchar *key2, 
+                  uint tuple_length)
+{
+  uchar *key1_end= key1 + tuple_length;
+  int len;
+  int res;
+  LINT_INIT(len);
+  for (;key1 < key1_end; key1 += len, key2 += len, part++)
+  {
+    len= part->store_length;
+    if (part->null_bit)
+    {
+      if (*key1) // key1 == NULL
+      {
+        if (!*key2) // key1(NULL) < key2(notNULL)
+          return -1;
+        continue;
+      }
+      else if (*key2) // key1(notNULL) > key2 (NULL)
+        return 1;
+      /* Step over the NULL bytes for key_cmp() call */
+      key1++;
+      key2++;
+      len--;
+    }
+    if ((res= part->field->key_cmp(key1, key2)))
+      return res;
+  }
+  return 0;
+}
+
+
+/**
+  Get hash value for the key from a key buffer 
+
+  @param  key_info       the key descriptor
+  @param  used_key_parts  number of key parts used for the key
+  @param  key            pointer to the buffer with the key value
+
+  @details
+  When hashing we should take special care only of:
+  1. NULLs (and keyparts which can be null so one byte reserved for it);
+  2. Strings for which we have to take into account their collations
+  and the values of their lengths in the prefixes.
+
+  @return  hash value calculated for the key
+*/
+
+ulong key_hashnr(KEY *key_info, uint used_key_parts, const uchar *key)
+{
+  ulong nr=1, nr2=4;
+  KEY_PART_INFO *key_part= key_info->key_part;
+  KEY_PART_INFO *end_key_part= key_part + used_key_parts;
+
+  for (; key_part < end_key_part; key_part++)
+  {
+    uchar *pos= (uchar*)key;
+    CHARSET_INFO *cs;
+    uint length, pack_length;
+    bool is_string= TRUE;
+    LINT_INIT(cs);
+    LINT_INIT(length);
+    LINT_INIT(pack_length);
+
+    key+= key_part->length;
+    if (key_part->null_bit)
+    {
+      key++;                       /* Skip null byte */
+      if (*pos)                    /* Found null */
+      {
+        nr^= (nr << 1) | 1;
+        /* Add key pack length to key for VARCHAR segments */
+        switch (key_part->type) {
+        case HA_KEYTYPE_VARTEXT1:
+        case HA_KEYTYPE_VARBINARY1:
+        case HA_KEYTYPE_VARTEXT2:
+        case HA_KEYTYPE_VARBINARY2:
+          key+= 2;
+          break;
+        default:
+          ;
+        }
+    continue;
+      }
+      pos++;                       /* Skip null byte */
+    }
+    /* If it is string set parameters of the string */
+    switch (key_part->type) {
+    case HA_KEYTYPE_TEXT:
+      cs= key_part->field->charset();
+      length= key_part->length;
+      pack_length= 0;
+      break;
+    case HA_KEYTYPE_BINARY :
+      cs= &my_charset_bin;
+      length= key_part->length;
+      pack_length= 0;
+      break;
+    case HA_KEYTYPE_VARTEXT1:
+    case HA_KEYTYPE_VARTEXT2:
+      cs= key_part->field->charset();
+      length= uint2korr(pos);
+      pack_length= 2;
+      break;
+    case HA_KEYTYPE_VARBINARY1:
+    case HA_KEYTYPE_VARBINARY2:
+      cs= &my_charset_bin;
+      length= uint2korr(pos);
+      pack_length= 2;
+      break;
+    default:
+      is_string= FALSE;
+    }
+
+    if (is_string)
+    {
+      if (cs->mbmaxlen > 1)
+      {
+        uint char_length= my_charpos(cs, pos + pack_length,
+                                     pos + pack_length + length,
+                                     length / cs->mbmaxlen);
+        set_if_smaller(length, char_length);
+      }
+      cs->coll->hash_sort(cs, pos+pack_length, length, &nr, &nr2);
+      key+= pack_length;
+    }
+    else
+    {
+      for (; pos < (uchar*)key ; pos++)
+      {
+        nr^=(ulong) ((((uint) nr & 63)+nr2)*((uint) *pos)) + (nr << 8);
+        nr2+=3;
+      }
+    }
+  }
+  DBUG_PRINT("exit", ("hash: %lx", nr));
+  return(nr);
+}
+
+
+/**
+  Check whether two keys in the key buffers are equal
+
+  @param key_info        the key descriptor
+  @param  used_key_parts  number of key parts used for the keys
+  @param key1            pointer to the buffer with the first key 
+  @param key2            pointer to the buffer with the second key 
+
+  @details See details of key_hashnr().
+
+  @retval TRUE  keys in the buffers are NOT equal
+  @retval FALSE keys in the buffers are equal
+*/
+
+bool key_buf_cmp(KEY *key_info, uint used_key_parts,
+                 const uchar *key1, const uchar *key2)
+{
+  KEY_PART_INFO *key_part= key_info->key_part;
+  KEY_PART_INFO *end_key_part= key_part + used_key_parts;
+
+  for (; key_part < end_key_part; key_part++)
+  {
+    uchar *pos1= (uchar*)key1;
+    uchar *pos2= (uchar*)key2;
+    CHARSET_INFO *cs;
+    uint length1, length2, pack_length;
+    bool is_string= TRUE;
+    LINT_INIT(cs);
+    LINT_INIT(length1);
+    LINT_INIT(length2);
+    LINT_INIT(pack_length);
+
+    key1+= key_part->length;
+    key2+= key_part->length;
+    if (key_part->null_bit)
+    {
+      key1++; key2++;                           /* Skip null byte */
+      if (*pos1 && *pos2)                       /* Both are null */
+      {
+        /* Add key pack length to key for VARCHAR segments */
+        switch (key_part->type) {
+        case HA_KEYTYPE_VARTEXT1:
+        case HA_KEYTYPE_VARBINARY1:
+        case HA_KEYTYPE_VARTEXT2:
+        case HA_KEYTYPE_VARBINARY2:
+          key1+= 2; key2+= 2;
+          break;
+        default:
+          ;
+        }
+        continue;
+      }
+      if (*pos1 != *pos2)
+        return TRUE;
+      pos1++; pos2++;
+    }
+
+    /* If it is string set parameters of the string */
+    switch (key_part->type) {
+    case HA_KEYTYPE_TEXT:
+      cs= key_part->field->charset();
+      length1= length2= key_part->length;
+      pack_length= 0;
+      break;
+    case HA_KEYTYPE_BINARY :
+      cs= &my_charset_bin;
+      length1= length2= key_part->length;
+      pack_length= 0;
+      break;
+    case HA_KEYTYPE_VARTEXT1:
+    case HA_KEYTYPE_VARTEXT2:
+      cs= key_part->field->charset();
+      length1= uint2korr(pos1);
+      length2= uint2korr(pos2);
+      pack_length= 2;
+      break;
+    case HA_KEYTYPE_VARBINARY1:
+    case HA_KEYTYPE_VARBINARY2:
+      cs= &my_charset_bin;
+      length1= uint2korr(pos1);
+      length2= uint2korr(pos2);
+      pack_length= 2;
+      break;
+    default:
+      is_string= FALSE;
+    }
+
+    if (is_string)
+    {
+      /*
+        Compare the strings taking into account length in characters
+        and collation
+      */
+      uint byte_len1= length1, byte_len2= length2;
+      if (cs->mbmaxlen > 1)
+      {
+        uint char_length1= my_charpos(cs, pos1 + pack_length,
+                                      pos1 + pack_length + length1,
+                                      length1 / cs->mbmaxlen);
+        uint char_length2= my_charpos(cs, pos2 + pack_length,
+                                      pos2 + pack_length + length2,
+                                      length2 / cs->mbmaxlen);
+        set_if_smaller(length1, char_length1);
+        set_if_smaller(length2, char_length2);
+      }
+      if (length1 != length2 ||
+          cs->coll->strnncollsp(cs,
+                                pos1 + pack_length, byte_len1,
+                                pos2 + pack_length, byte_len2,
+                                1))
+        return TRUE;
+      key1+= pack_length; key2+= pack_length;
+    }
+    else
+    {
+      /* it is OK to compare non-string byte per byte */
+      for (; pos1 < (uchar*)key1 ; pos1++, pos2++)
+      {
+        if (pos1[0] != pos2[0])
+          return TRUE;
+      }
+    }
+  }
+  return FALSE;
+}
diff --git a/sql/key.h b/sql/key.h
index 8b416da5846..93f2c07e17a 100644
--- a/sql/key.h
+++ b/sql/key.h
@@ -27,13 +27,18 @@ typedef struct st_key_part_info KEY_PART_INFO;
 
 int find_ref_key(KEY *key, uint key_count, uchar *record, Field *field,
                  uint *key_length, uint *keypart);
-void key_copy(uchar *to_key, uchar *from_record, KEY *key_info, uint key_length);
+void key_copy(uchar *to_key, uchar *from_record, KEY *key_info, uint key_length,
+              bool with_zerofill= FALSE);
 void key_restore(uchar *to_record, uchar *from_key, KEY *key_info,
                  uint key_length);
 bool key_cmp_if_same(TABLE *form,const uchar *key,uint index,uint key_length);
 void key_unpack(String *to,TABLE *form,uint index);
 bool is_key_used(TABLE *table, uint idx, const MY_BITMAP *fields);
 int key_cmp(KEY_PART_INFO *key_part, const uchar *key, uint key_length);
+ulong key_hashnr(KEY *key_info, uint used_key_parts, const uchar *key);
+bool key_buf_cmp(KEY *key_info, uint used_key_parts,
+                 const uchar *key1, const uchar *key2);
 extern "C" int key_rec_cmp(void *key_info, uchar *a, uchar *b);
+int key_tuple_cmp(KEY_PART_INFO *part, uchar *key1, uchar *key2, uint tuple_length);
 
 #endif /* KEY_INCLUDED */
diff --git a/sql/lex.h b/sql/lex.h
index 982034c31af..37d0f0cc015 100644
--- a/sql/lex.h
+++ b/sql/lex.h
@@ -108,6 +108,7 @@ static SYMBOL symbols[] = {
   { "CHARACTER",	SYM(CHAR_SYM)},
   { "CHARSET",		SYM(CHARSET)},
   { "CHECK",		SYM(CHECK_SYM)},
+  { "CHECKPOINT",	SYM(CHECKPOINT_SYM)},
   { "CHECKSUM",		SYM(CHECKSUM_SYM)},
   { "CIPHER",		SYM(CIPHER_SYM)},
   { "CLASS_ORIGIN",     SYM(CLASS_ORIGIN_SYM)},
@@ -121,6 +122,12 @@ static SYMBOL symbols[] = {
   { "COLUMN",		SYM(COLUMN_SYM)},
   { "COLUMN_NAME",      SYM(COLUMN_NAME_SYM)},
   { "COLUMNS",		SYM(COLUMNS)},
+  { "COLUMN_ADD",       SYM(COLUMN_ADD_SYM)},
+  { "COLUMN_CREATE",    SYM(COLUMN_CREATE_SYM)},
+  { "COLUMN_DELETE",    SYM(COLUMN_DELETE_SYM)},
+  { "COLUMN_EXISTS",    SYM(COLUMN_EXISTS_SYM)},
+  { "COLUMN_GET",       SYM(COLUMN_GET_SYM)},
+  { "COLUMN_LIST",      SYM(COLUMN_LIST_SYM)},
   { "COMMENT",		SYM(COMMENT_SYM)},
   { "COMMIT",		SYM(COMMIT_SYM)},
   { "COMMITTED",	SYM(COMMITTED_SYM)},
@@ -390,6 +397,7 @@ static SYMBOL symbols[] = {
   { "ON",		SYM(ON)},
   { "ONE",              SYM(ONE_SYM)},
   { "ONE_SHOT",		SYM(ONE_SHOT_SYM)},
+  { "ONLINE",		SYM(ONLINE_SYM)},
   { "OPEN",		SYM(OPEN_SYM)},
   { "OPTIMIZE",		SYM(OPTIMIZE)},
   { "OPTIONS",		SYM(OPTIONS_SYM)},
diff --git a/sql/lock.cc b/sql/lock.cc
index e88fa252517..8b010968f5f 100644
--- a/sql/lock.cc
+++ b/sql/lock.cc
@@ -90,12 +90,6 @@
 
 extern HASH open_cache;
 
-/* flags for get_lock_data */
-#define GET_LOCK_UNLOCK         1
-#define GET_LOCK_STORE_LOCKS    2
-
-static MYSQL_LOCK *get_lock_data(THD *thd, TABLE **table_ptr, uint count,
-                                 uint flags);
 static int lock_external(THD *thd, TABLE **table,uint count);
 static int unlock_external(THD *thd, TABLE **table,uint count);
 static void print_lock_error(int error, const char *);
@@ -112,6 +106,7 @@ static int thr_lock_errno_to_mysql[]=
   @param flags Lock flags
   @return 0 if all the check passed, non zero if a check failed.
 */
+
 static int
 lock_tables_check(THD *thd, TABLE **tables, uint count, uint flags)
 {
@@ -161,7 +156,7 @@ lock_tables_check(THD *thd, TABLE **tables, uint count, uint flags)
 
       if (t->db_stat & HA_READ_ONLY)
       {
-        my_error(ER_OPEN_AS_READONLY, MYF(0), t->alias);
+        my_error(ER_OPEN_AS_READONLY, MYF(0), t->alias.c_ptr_safe());
         DBUG_RETURN(1);
       }
     }
@@ -217,7 +212,10 @@ lock_tables_check(THD *thd, TABLE **tables, uint count, uint flags)
 /**
   Reset lock type in lock data
 
-  @param mysql_lock Lock structures to reset.
+  @param mysql_lock             Lock structures to reset.
+  @param unlock			If set, then set lock type to TL_UNLOCK,
+  				otherwise set to original lock type from
+				get_store_lock().
 
   @note After a locking error we want to quit the locking of the table(s).
         The test case in the bug report for Bug #18544 has the following
@@ -235,7 +233,7 @@ lock_tables_check(THD *thd, TABLE **tables, uint count, uint flags)
 */
 
 
-static void reset_lock_data(MYSQL_LOCK *sql_lock)
+void reset_lock_data(MYSQL_LOCK *sql_lock, bool unlock)
 {
   THR_LOCK_DATA **ldata, **ldata_end;
   DBUG_ENTER("reset_lock_data");
@@ -244,30 +242,12 @@ static void reset_lock_data(MYSQL_LOCK *sql_lock)
   for (ldata= sql_lock->locks, ldata_end= ldata + sql_lock->lock_count;
        ldata < ldata_end;
        ldata++)
-  {
-    /* Reset lock type. */
-    (*ldata)->type= TL_UNLOCK;
-  }
+    (*ldata)->type= unlock ? TL_UNLOCK : (*ldata)->org_type;
   DBUG_VOID_RETURN;
 }
 
 
 /**
-  Reset lock type in lock data and free.
-
-  @param mysql_lock Lock structures to reset.
-
-*/
-
-static void reset_lock_data_and_free(MYSQL_LOCK **mysql_lock)
-{
-  reset_lock_data(*mysql_lock);
-  my_free(*mysql_lock);
-  *mysql_lock= 0;
-}
-
-
-/**
    Lock tables.
 
    @param thd          The current thread.
@@ -283,12 +263,8 @@ static void reset_lock_data_and_free(MYSQL_LOCK **mysql_lock)
 
 MYSQL_LOCK *mysql_lock_tables(THD *thd, TABLE **tables, uint count, uint flags)
 {
-  int rc;
   MYSQL_LOCK *sql_lock;
-  ulong timeout= (flags & MYSQL_LOCK_IGNORE_TIMEOUT) ?
-    LONG_TIMEOUT : thd->variables.lock_wait_timeout;
-
-  DBUG_ENTER("mysql_lock_tables");
+  DBUG_ENTER("mysql_lock_tables(tables)");
 
   if (lock_tables_check(thd, tables, count, flags))
     DBUG_RETURN(NULL);
@@ -296,15 +272,43 @@ MYSQL_LOCK *mysql_lock_tables(THD *thd, TABLE **tables, uint count, uint flags)
   if (! (sql_lock= get_lock_data(thd, tables, count, GET_LOCK_STORE_LOCKS)))
     DBUG_RETURN(NULL);
 
+  if (mysql_lock_tables(thd, sql_lock, flags))
+  {
+    /* Clear the lock type of all lock data to avoid reusage. */
+    reset_lock_data(sql_lock, 1);
+    my_free(sql_lock);
+    sql_lock= 0;
+  }
+  DBUG_RETURN(sql_lock);
+}
+
+/**
+   Lock tables based on a MYSQL_LOCK structure.
+
+   mysql_lock_tables()
+
+   @param thd			The current thread.
+   @param sql_lock		Tables that should be locked
+   @param flags			See mysql_lock_tables() above
+
+   @return 0   ok
+   @return 1  error
+*/
+
+bool mysql_lock_tables(THD *thd, MYSQL_LOCK *sql_lock, uint flags)
+{
+  int rc= 1;
+  ulong timeout= (flags & MYSQL_LOCK_IGNORE_TIMEOUT) ?
+    LONG_TIMEOUT : thd->variables.lock_wait_timeout;
+
+  DBUG_ENTER("mysql_lock_tables(sql_lock)");
+
   thd_proc_info(thd, "System lock");
-  DBUG_PRINT("info", ("thd->proc_info %s", thd->proc_info));
   if (sql_lock->table_count && lock_external(thd, sql_lock->table,
                                              sql_lock->table_count))
-  {
-    /* Clear the lock type of all lock data to avoid reusage. */
-    reset_lock_data_and_free(&sql_lock);
     goto end;
-  }
+
+  thd_proc_info(thd, "Table lock");
 
   /* Copy the lock data array. thr_multi_lock() reorders its contents. */
   memcpy(sql_lock->locks + sql_lock->lock_count, sql_lock->locks,
@@ -314,29 +318,24 @@ MYSQL_LOCK *mysql_lock_tables(THD *thd, TABLE **tables, uint count, uint flags)
                                                    sql_lock->lock_count,
                                                    sql_lock->lock_count,
                                                    &thd->lock_info, timeout)];
-  if (rc)
-  {
-    if (sql_lock->table_count)
-      (void) unlock_external(thd, sql_lock->table, sql_lock->table_count);
-    reset_lock_data_and_free(&sql_lock);
-    if (! thd->killed)
-      my_error(rc, MYF(0));
-  }
+  if (rc && sql_lock->table_count)
+    (void) unlock_external(thd, sql_lock->table, sql_lock->table_count);
+
 end:
   thd_proc_info(thd, 0);
 
   if (thd->killed)
   {
     thd->send_kill_message();
-    if (sql_lock)
-    {
-      mysql_unlock_tables(thd, sql_lock);
-      sql_lock= 0;
-    }
+    if (!rc)
+      mysql_unlock_tables(thd, sql_lock, 0);
+    rc= 1;
   }
+  else if (rc)
+    my_error(rc, MYF(0));
 
   thd->set_time_after_lock();
-  DBUG_RETURN(sql_lock);
+  DBUG_RETURN(rc);
 }
 
 
@@ -377,14 +376,15 @@ static int lock_external(THD *thd, TABLE **tables, uint count)
 }
 
 
-void mysql_unlock_tables(THD *thd, MYSQL_LOCK *sql_lock)
+void mysql_unlock_tables(THD *thd, MYSQL_LOCK *sql_lock, bool free_lock)
 {
   DBUG_ENTER("mysql_unlock_tables");
   if (sql_lock->table_count)
     unlock_external(thd, sql_lock->table, sql_lock->table_count);
   if (sql_lock->lock_count)
     thr_multi_unlock(sql_lock->locks, sql_lock->lock_count, 0);
-  my_free(sql_lock);
+  if (free_lock)
+    my_free(sql_lock);
   DBUG_VOID_RETURN;
 }
 
@@ -691,12 +691,11 @@ static int unlock_external(THD *thd, TABLE **table,uint count)
            - GET_LOCK_STORE_LOCKS : Store lock info in TABLE
 */
 
-static MYSQL_LOCK *get_lock_data(THD *thd, TABLE **table_ptr, uint count,
-                                 uint flags)
+MYSQL_LOCK *get_lock_data(THD *thd, TABLE **table_ptr, uint count, uint flags)
 {
   uint i,tables,lock_count;
   MYSQL_LOCK *sql_lock;
-  THR_LOCK_DATA **locks, **locks_buf, **locks_start;
+  THR_LOCK_DATA **locks, **locks_buf;
   TABLE **to, **table_buf;
   DBUG_ENTER("get_lock_data");
 
@@ -734,7 +733,7 @@ static MYSQL_LOCK *get_lock_data(THD *thd, TABLE **table_ptr, uint count,
   {
     TABLE *table;
     enum thr_lock_type lock_type;
-    THR_LOCK_DATA **org_locks = locks;
+    THR_LOCK_DATA **locks_start;
 
     if ((table=table_ptr[i])->s->tmp_table == NON_TRANSACTIONAL_TMP_TABLE)
       continue;
@@ -752,8 +751,14 @@ static MYSQL_LOCK *get_lock_data(THD *thd, TABLE **table_ptr, uint count,
     }
     *to++= table;
     if (locks)
-      for ( ; org_locks != locks ; org_locks++)
-	(*org_locks)->debug_print_param= (void *) table;
+    {
+      for ( ; locks_start != locks ; locks_start++)
+      {
+	(*locks_start)->debug_print_param= (void *) table;
+	(*locks_start)->lock->name=         table->alias.c_ptr();
+	(*locks_start)->org_type=           (*locks_start)->type;
+      }
+    }
   }
   /*
     We do not use 'tables', because there are cases where store_lock()
diff --git a/sql/lock.h b/sql/lock.h
index 6f779595af8..442881c5b9b 100644
--- a/sql/lock.h
+++ b/sql/lock.h
@@ -12,7 +12,8 @@ typedef struct st_mysql_lock MYSQL_LOCK;
 
 
 MYSQL_LOCK *mysql_lock_tables(THD *thd, TABLE **table, uint count, uint flags);
-void mysql_unlock_tables(THD *thd, MYSQL_LOCK *sql_lock);
+bool mysql_lock_tables(THD *thd, MYSQL_LOCK *sql_lock, uint flags);
+void mysql_unlock_tables(THD *thd, MYSQL_LOCK *sql_lock, bool free_lock= 1);
 void mysql_unlock_read_tables(THD *thd, MYSQL_LOCK *sql_lock);
 void mysql_unlock_some_tables(THD *thd, TABLE **table,uint count);
 void mysql_lock_remove(THD *thd, MYSQL_LOCK *locked,TABLE *table);
@@ -25,4 +26,11 @@ bool lock_schema_name(THD *thd, const char *db);
 bool lock_object_name(THD *thd, MDL_key::enum_mdl_namespace mdl_type,
                       const char *db, const char *name);
 
+/* flags for get_lock_data */
+#define GET_LOCK_UNLOCK         1
+#define GET_LOCK_STORE_LOCKS    2
+
+MYSQL_LOCK *get_lock_data(THD *thd, TABLE **table_ptr, uint count, uint flags);
+void reset_lock_data(MYSQL_LOCK *sql_lock, bool unlock);
+
 #endif /* LOCK_INCLUDED */
diff --git a/sql/log.cc b/sql/log.cc
index e66ca32d560..2b248fa80ba 100644
--- a/sql/log.cc
+++ b/sql/log.cc
@@ -1,4 +1,5 @@
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2011, Oracle and/or its affiliates.
+   Copyright (c) 2010-2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -50,6 +51,7 @@
 
 #include "sql_plugin.h"
 #include "rpl_handler.h"
+#include "debug_sync.h"
 
 /* max size of the log message */
 #define MAX_LOG_BUFFER_SIZE 1024
@@ -71,6 +73,38 @@ static int binlog_savepoint_rollback(handlerton *hton, THD *thd, void *sv);
 static int binlog_commit(handlerton *hton, THD *thd, bool all);
 static int binlog_rollback(handlerton *hton, THD *thd, bool all);
 static int binlog_prepare(handlerton *hton, THD *thd, bool all);
+static int binlog_start_consistent_snapshot(handlerton *hton, THD *thd);
+
+static LEX_STRING const write_error_msg=
+    { C_STRING_WITH_LEN("error writing to the binary log") };
+
+static my_bool opt_optimize_thread_scheduling= TRUE;
+ulong binlog_checksum_options;
+#ifndef DBUG_OFF
+static ulong opt_binlog_dbug_fsync_sleep= 0;
+#endif
+
+mysql_mutex_t LOCK_prepare_ordered;
+mysql_mutex_t LOCK_commit_ordered;
+
+static ulonglong binlog_status_var_num_commits;
+static ulonglong binlog_status_var_num_group_commits;
+static char binlog_snapshot_file[FN_REFLEN];
+static ulonglong binlog_snapshot_position;
+
+static SHOW_VAR binlog_status_vars_detail[]=
+{
+  {"commits",
+    (char *)&binlog_status_var_num_commits, SHOW_LONGLONG},
+  {"group_commits",
+    (char *)&binlog_status_var_num_group_commits, SHOW_LONGLONG},
+  {"snapshot_file",
+    (char *)&binlog_snapshot_file, SHOW_CHAR},
+  {"snapshot_position",
+   (char *)&binlog_snapshot_position, SHOW_LONGLONG},
+  {NullS, NullS, SHOW_LONG}
+};
+
 
 /**
    purge logs, master and slave sides both, related error code
@@ -148,59 +182,27 @@ sql_print_message_func sql_print_message_handlers[3] =
   sql_print_error
 };
 
-/**
-  Create the name of the log specified.
 
-  This method forms a new path + file name for the
-  log specified in @c name.
-
-  @param[IN] buff    Location for building new string.
-  @param[IN] name    Name of the log file.
-  @param[IN] log_ext The extension for the log (e.g. .log).
-
-  @returns Pointer to new string containing the name.
+/**
+  Create the name of the log file
+  
+  @param[OUT] out    a pointer to the newly allocated name is stored here
+  @param[IN] log_ext The extension for the file (e.g. .log)
+  @param[IN] once    whether to use malloc_once or a normal malloc.
 */
-char *make_log_name(char *buff, const char *name, const char* log_ext)
+void make_default_log_name(char **out, const char* log_ext, bool once)
 {
-  strmake(buff, name, FN_REFLEN-5);
-  return fn_format(buff, buff, mysql_real_data_home, log_ext,
-                   MYF(MY_UNPACK_FILENAME|MY_REPLACE_EXT));
-}
-
-/*
-  Helper class to hold a mutex for the duration of the
-  block.
-
-  Eliminates the need for explicit unlocking of mutexes on, e.g.,
-  error returns.  On passing a null pointer, the sentry will not do
-  anything.
- */
-class Mutex_sentry
-{
-public:
-  Mutex_sentry(mysql_mutex_t *mutex)
-    : m_mutex(mutex)
-  {
-    if (m_mutex)
-      mysql_mutex_lock(mutex);
-  }
-
-  ~Mutex_sentry()
+  char buff[FN_REFLEN+10];
+  fn_format(buff, opt_log_basename, "", log_ext, MYF(MY_REPLACE_EXT));
+  if (once)
+    *out= my_once_strdup(buff, MYF(MY_WME));
+  else
   {
-    if (m_mutex)
-      mysql_mutex_unlock(m_mutex);
-#ifndef DBUG_OFF
-    m_mutex= 0;
-#endif
+    my_free(*out);
+    *out= my_strdup(buff, MYF(MY_WME));
   }
+}
 
-private:
-  mysql_mutex_t *m_mutex;
-
-  // It's not allowed to copy this object in any way
-  Mutex_sentry(Mutex_sentry const&);
-  void operator=(Mutex_sentry const&);
-};
 
 /*
   Helper classes to store non-transactional and transactional data
@@ -422,6 +424,7 @@ public:
                     ulong *param_ptr_binlog_stmt_cache_disk_use,
                     ulong *param_ptr_binlog_cache_use,
                     ulong *param_ptr_binlog_cache_disk_use)
+    : last_commit_pos_offset(0), using_xa(FALSE), xa_xid(0)
   {
      stmt_cache.set_binlog_cache_info(param_max_binlog_stmt_cache_size,
                                       param_ptr_binlog_stmt_cache_use,
@@ -429,11 +432,20 @@ public:
      trx_cache.set_binlog_cache_info(param_max_binlog_cache_size,
                                      param_ptr_binlog_cache_use,
                                      param_ptr_binlog_cache_disk_use);
+     last_commit_pos_file[0]= 0;
   }
 
-  void reset_cache(binlog_cache_data* cache_data)
+  void reset(bool do_stmt, bool do_trx)
   {
-    cache_data->reset();
+    if (do_stmt)
+      stmt_cache.reset();
+    if (do_trx)
+    {
+      trx_cache.reset();
+      using_xa= FALSE;
+      last_commit_pos_file[0]= 0;
+      last_commit_pos_offset= 0;
+    }
   }
 
   binlog_cache_data* get_binlog_cache_data(bool is_transactional)
@@ -450,6 +462,23 @@ public:
 
   binlog_cache_data trx_cache;
 
+  /*
+    Binlog position for current transaction.
+    For START TRANSACTION WITH CONSISTENT SNAPSHOT, this is the binlog
+    position corresponding to the snapshot taken. During (and after) commit,
+    this is set to the binlog position corresponding to just after the
+    commit (so storage engines can store it in their transaction log).
+  */
+  char last_commit_pos_file[FN_REFLEN];
+  my_off_t last_commit_pos_offset;
+
+  /*
+    Flag set true if this transaction is committed with log_xid() as part of
+    XA, false if not.
+  */
+  bool using_xa;
+  my_xid xa_xid;
+
 private:
 
   binlog_cache_mngr& operator=(const binlog_cache_mngr& info);
@@ -552,7 +581,7 @@ void Log_to_csv_event_handler::cleanup()
 */
 
 bool Log_to_csv_event_handler::
-  log_general(THD *thd, time_t event_time, const char *user_host,
+  log_general(THD *thd, my_hrtime_t event_time, const char *user_host,
               uint user_host_len, int thread_id,
               const char *command_type, uint command_type_len,
               const char *sql_text, uint sql_text_len,
@@ -629,8 +658,8 @@ bool Log_to_csv_event_handler::
 
   DBUG_ASSERT(table->field[0]->type() == MYSQL_TYPE_TIMESTAMP);
 
-  ((Field_timestamp*) table->field[0])->store_timestamp((my_time_t)
-                                                        event_time);
+  ((Field_timestamp*) table->field[0])->store_TIME(
+                  hrtime_to_my_time(event_time), hrtime_sec_part(event_time));
 
   /* do a write */
   if (table->field[1]->store(user_host, user_host_len, client_cs) ||
@@ -694,7 +723,6 @@ err:
     log_slow()
     thd               THD of the query
     current_time      current timestamp
-    query_start_arg   command start timestamp
     user_host         the pointer to the string with user@host info
     user_host_len     length of the user_host string. this is computed once
                       and passed to all general log event handlers
@@ -717,7 +745,7 @@ err:
 */
 
 bool Log_to_csv_event_handler::
-  log_slow(THD *thd, time_t current_time, time_t query_start_arg,
+  log_slow(THD *thd, my_hrtime_t current_time,
            const char *user_host, uint user_host_len,
            ulonglong query_utime, ulonglong lock_utime, bool is_command,
            const char *sql_text, uint sql_text_len)
@@ -731,6 +759,11 @@ bool Log_to_csv_event_handler::
   Open_tables_backup open_tables_backup;
   CHARSET_INFO *client_cs= thd->variables.character_set_client;
   bool save_time_zone_used;
+  long query_time= (long) min(query_utime/1000000, TIME_MAX_VALUE_SECONDS);
+  long lock_time=  (long) min(lock_utime/1000000, TIME_MAX_VALUE_SECONDS);
+  long query_time_micro= (long) (query_utime % 1000000);
+  long lock_time_micro=  (long) (lock_utime % 1000000);
+
   DBUG_ENTER("Log_to_csv_event_handler::log_slow");
 
   thd->push_internal_handler(& error_handler);
@@ -767,45 +800,34 @@ bool Log_to_csv_event_handler::
 
   /* store the time and user values */
   DBUG_ASSERT(table->field[0]->type() == MYSQL_TYPE_TIMESTAMP);
-  ((Field_timestamp*) table->field[0])->store_timestamp((my_time_t)
-                                                        current_time);
+  ((Field_timestamp*) table->field[0])->store_TIME(
+             hrtime_to_my_time(current_time), hrtime_sec_part(current_time));
   if (table->field[1]->store(user_host, user_host_len, client_cs))
     goto err;
 
-  if (query_start_arg)
-  {
-    longlong query_time= (longlong) (query_utime/1000000);
-    longlong lock_time=  (longlong) (lock_utime/1000000);
-    /*
-      A TIME field can not hold the full longlong range; query_time or
-      lock_time may be truncated without warning here, if greater than
-      839 hours (~35 days)
-    */
-    MYSQL_TIME t;
-    t.neg= 0;
+  /*
+    A TIME field can not hold the full longlong range; query_time or
+    lock_time may be truncated without warning here, if greater than
+    839 hours (~35 days)
+  */
+  MYSQL_TIME t;
+  t.neg= 0;
+
+  /* fill in query_time field */
+  calc_time_from_sec(&t, query_time, query_time_micro);
+  if (table->field[2]->store_time(&t))
+    goto err;
+  /* lock_time */
+  calc_time_from_sec(&t, lock_time, lock_time_micro);
+  if (table->field[3]->store_time(&t))
+    goto err;
+  /* rows_sent */
+  if (table->field[4]->store((longlong) thd->sent_row_count, TRUE))
+    goto err;
+  /* rows_examined */
+  if (table->field[5]->store((longlong) thd->examined_row_count, TRUE))
+    goto err;
 
-    /* fill in query_time field */
-    calc_time_from_sec(&t, (long) min(query_time, (longlong) TIME_MAX_VALUE_SECONDS), 0);
-    if (table->field[2]->store_time(&t, MYSQL_TIMESTAMP_TIME))
-      goto err;
-    /* lock_time */
-    calc_time_from_sec(&t, (long) min(lock_time, (longlong) TIME_MAX_VALUE_SECONDS), 0);
-    if (table->field[3]->store_time(&t, MYSQL_TIMESTAMP_TIME))
-      goto err;
-    /* rows_sent */
-    if (table->field[4]->store((longlong) thd->sent_row_count, TRUE))
-      goto err;
-    /* rows_examined */
-    if (table->field[5]->store((longlong) thd->examined_row_count, TRUE))
-      goto err;
-  }
-  else
-  {
-    table->field[2]->set_null();
-    table->field[3]->set_null();
-    table->field[4]->set_null();
-    table->field[5]->set_null();
-  }
   /* fill database field */
   if (thd->db)
   {
@@ -937,14 +959,14 @@ void Log_to_file_event_handler::init_pthread_objects()
 /** Wrapper around MYSQL_LOG::write() for slow log. */
 
 bool Log_to_file_event_handler::
-  log_slow(THD *thd, time_t current_time, time_t query_start_arg,
+  log_slow(THD *thd, my_hrtime_t current_time,
            const char *user_host, uint user_host_len,
            ulonglong query_utime, ulonglong lock_utime, bool is_command,
            const char *sql_text, uint sql_text_len)
 {
   Silence_log_table_errors error_handler;
   thd->push_internal_handler(&error_handler);
-  bool retval= mysql_slow_log.write(thd, current_time, query_start_arg,
+  bool retval= mysql_slow_log.write(thd, hrtime_to_my_time(current_time),
                                     user_host, user_host_len,
                                     query_utime, lock_utime, is_command,
                                     sql_text, sql_text_len);
@@ -959,7 +981,7 @@ bool Log_to_file_event_handler::
 */
 
 bool Log_to_file_event_handler::
-  log_general(THD *thd, time_t event_time, const char *user_host,
+  log_general(THD *thd, my_hrtime_t event_time, const char *user_host,
               uint user_host_len, int thread_id,
               const char *command_type, uint command_type_len,
               const char *sql_text, uint sql_text_len,
@@ -967,7 +989,8 @@ bool Log_to_file_event_handler::
 {
   Silence_log_table_errors error_handler;
   thd->push_internal_handler(&error_handler);
-  bool retval= mysql_log.write(event_time, user_host, user_host_len,
+  bool retval= mysql_log.write(hrtime_to_time(event_time), user_host,
+                               user_host_len,
                                thread_id, command_type, command_type_len,
                                sql_text, sql_text_len);
   thd->pop_internal_handler();
@@ -1200,8 +1223,6 @@ bool LOGGER::slow_log_print(THD *thd, const char *query, uint query_length,
 
   if (*slow_log_handler_list)
   {
-    time_t current_time;
-
     /* do not log slow queries from replication threads */
     if (thd->slave_thread && !opt_log_slow_slave_statements)
       return 0;
@@ -1221,16 +1242,12 @@ bool LOGGER::slow_log_print(THD *thd, const char *query, uint query_length,
                              sctx->ip ? sctx->ip : "", "]", NullS) -
                     user_host_buff);
 
-    current_time= my_time_possible_from_micro(current_utime);
-    if (thd->start_utime)
-    {
-      query_utime= (current_utime - thd->start_utime);
-      lock_utime=  (thd->utime_after_lock - thd->start_utime);
-    }
-    else
-    {
-      query_utime= lock_utime= 0;
-    }
+    DBUG_ASSERT(thd->start_utime);
+    DBUG_ASSERT(thd->start_time);
+    query_utime= (current_utime - thd->start_utime);
+    lock_utime=  (thd->utime_after_lock - thd->start_utime);
+    my_hrtime_t current_time= { hrtime_from_time(thd->start_time) +
+                                thd->start_time_sec_part + query_utime };
 
     if (!query)
     {
@@ -1251,7 +1268,7 @@ bool LOGGER::slow_log_print(THD *thd, const char *query, uint query_length,
     }
 
     for (current_handler= slow_log_handler_list; *current_handler ;)
-      error= (*current_handler++)->log_slow(thd, current_time, thd->start_time,
+      error= (*current_handler++)->log_slow(thd, current_time,
                                             user_host_buff, user_host_len,
                                             query_utime, lock_utime, is_command,
                                             query, query_length) || error;
@@ -1268,7 +1285,7 @@ bool LOGGER::general_log_write(THD *thd, enum enum_server_command command,
   Log_event_handler **current_handler= general_log_handler_list;
   char user_host_buff[MAX_USER_HOST_SIZE + 1];
   uint user_host_len= 0;
-  time_t current_time;
+  my_hrtime_t current_time;
 
   DBUG_ASSERT(thd);
 
@@ -1280,9 +1297,9 @@ bool LOGGER::general_log_write(THD *thd, enum enum_server_command command,
   }
   user_host_len= make_user_name(thd, user_host_buff);
 
-  current_time= my_time(0);
+  current_time= my_hrtime();
 
-  mysql_audit_general_log(thd, current_time,
+  mysql_audit_general_log(thd, hrtime_to_time(current_time),
                           user_host_buff, user_host_len,
                           command_name[(uint) command].str,
                           command_name[(uint) command].length,
@@ -1587,6 +1604,7 @@ int binlog_init(void *p)
   binlog_hton->commit= binlog_commit;
   binlog_hton->rollback= binlog_rollback;
   binlog_hton->prepare= binlog_prepare;
+  binlog_hton->start_consistent_snapshot= binlog_start_consistent_snapshot;
   binlog_hton->flags= HTON_NOT_USER_SELECTABLE | HTON_HIDDEN;
   return 0;
 }
@@ -1602,48 +1620,69 @@ static int binlog_close_connection(handlerton *hton, THD *thd)
   return 0;
 }
 
-/**
+/*
   This function flushes a cache upon commit/rollback.
 
-  @param thd                The thread whose transaction should be flushed
-  @param cache_data         Pointer to the cache
-  @param end_ev             The end event either commit/rollback
-  @param is_transactional   The type of the cache: transactional or
-                            non-transactional
+  SYNOPSIS
+    binlog_flush_cache()
 
-  @return
-    nonzero if an error pops up when flushing the cache.
-*/
-static inline int
-binlog_flush_cache(THD *thd, binlog_cache_data* cache_data, Log_event *end_evt,
-                   bool is_transactional)
+    thd        The thread whose transaction should be ended
+    cache_mngr Pointer to the binlog_cache_mngr to use
+    all        True if the entire transaction should be ended, false if
+               only the statement transaction should be ended.
+    end_ev     The end event to use (COMMIT, ROLLBACK, or commit XID)
+    using_stmt True if the statement cache should be flushed
+    using_trx  True if the transaction cache should be flushed
+
+  DESCRIPTION
+
+    End the current transaction or statement. The transaction can be either
+    a real transaction or a statement transaction.
+
+    This can be to commit a transaction, with a COMMIT query event or an XA
+    commit XID event. But it can also be to rollback a transaction with a
+    ROLLBACK query event, used for rolling back transactions which also
+    contain updates to non-transactional tables. Or it can be a flush of
+    a statement cache.
+ */
+static int
+binlog_flush_cache(THD *thd, binlog_cache_mngr *cache_mngr,
+                   Log_event *end_ev, bool all, bool using_stmt,
+                   bool using_trx)
 {
-  DBUG_ENTER("binlog_flush_cache");
   int error= 0;
+  DBUG_ENTER("binlog_flush_cache");
 
-  if (!cache_data->empty())
+  if ((using_stmt && !cache_mngr->stmt_cache.empty()) ||
+      (using_trx && !cache_mngr->trx_cache.empty()))
   {
-    if (thd->binlog_flush_pending_rows_event(TRUE, is_transactional))
+    if (using_stmt && thd->binlog_flush_pending_rows_event(TRUE, FALSE))
       DBUG_RETURN(1);
+    if (using_trx && thd->binlog_flush_pending_rows_event(TRUE, TRUE))
+      DBUG_RETURN(1);
+
     /*
       Doing a commit or a rollback including non-transactional tables,
       i.e., ending a transaction where we might write the transaction
       cache to the binary log.
 
       We can always end the statement when ending a transaction since
-      transactions are not allowed inside stored functions. If they
+      transactions are not allowed inside stored functions.  If they
       were, we would have to ensure that we're not ending a statement
       inside a stored function.
     */
-    error= mysql_bin_log.write(thd, &cache_data->cache_log, end_evt,
-                               cache_data->has_incident());
+    error= mysql_bin_log.write_transaction_to_binlog(thd, cache_mngr,
+                                                     end_ev, all,
+                                                     using_stmt, using_trx);
   }
-  cache_data->reset();
+  cache_mngr->reset(using_stmt, using_trx);
 
-  DBUG_ASSERT(cache_data->empty());
+  DBUG_ASSERT((!using_stmt || cache_mngr->stmt_cache.empty()) &&
+              (!using_trx || cache_mngr->trx_cache.empty()));
   DBUG_RETURN(error);
 }
 
+
 /**
   This function flushes the stmt-cache upon commit.
 
@@ -1654,13 +1693,12 @@ binlog_flush_cache(THD *thd, binlog_cache_data* cache_data, Log_event *end_evt,
     nonzero if an error pops up when flushing the cache.
 */
 static inline int
-binlog_commit_flush_stmt_cache(THD *thd,
+binlog_commit_flush_stmt_cache(THD *thd, bool all,
                                binlog_cache_mngr *cache_mngr)
 {
   Query_log_event end_evt(thd, STRING_WITH_LEN("COMMIT"),
-                          FALSE, FALSE, TRUE, 0);
-  return (binlog_flush_cache(thd, &cache_mngr->stmt_cache, &end_evt,
-                             FALSE));
+                          FALSE, TRUE, TRUE, 0);
+  return (binlog_flush_cache(thd, cache_mngr, &end_evt, all, TRUE, FALSE));
 }
 
 /**
@@ -1673,12 +1711,11 @@ binlog_commit_flush_stmt_cache(THD *thd,
     nonzero if an error pops up when flushing the cache.
 */
 static inline int
-binlog_commit_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr)
+binlog_commit_flush_trx_cache(THD *thd, bool all, binlog_cache_mngr *cache_mngr)
 {
   Query_log_event end_evt(thd, STRING_WITH_LEN("COMMIT"),
-                          TRUE, FALSE, TRUE, 0);
-  return (binlog_flush_cache(thd, &cache_mngr->trx_cache, &end_evt,
-                             TRUE));
+                          TRUE, TRUE, TRUE, 0);
+  return (binlog_flush_cache(thd, cache_mngr, &end_evt, all, FALSE, TRUE));
 }
 
 /**
@@ -1691,12 +1728,12 @@ binlog_commit_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr)
     nonzero if an error pops up when flushing the cache.
 */
 static inline int
-binlog_rollback_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr)
+binlog_rollback_flush_trx_cache(THD *thd, bool all,
+                                binlog_cache_mngr *cache_mngr)
 {
   Query_log_event end_evt(thd, STRING_WITH_LEN("ROLLBACK"),
-                          TRUE, FALSE, TRUE, 0);
-  return (binlog_flush_cache(thd, &cache_mngr->trx_cache, &end_evt,
-                             TRUE));
+                          TRUE, TRUE, TRUE, 0);
+  return (binlog_flush_cache(thd, cache_mngr, &end_evt, all, FALSE, TRUE));
 }
 
 /**
@@ -1710,12 +1747,26 @@ binlog_rollback_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr)
     nonzero if an error pops up when flushing the cache.
 */
 static inline int
-binlog_commit_flush_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr,
-                              my_xid xid)
+binlog_commit_flush_xid_caches(THD *thd, binlog_cache_mngr *cache_mngr,
+                               bool all, my_xid xid)
 {
-  Xid_log_event end_evt(thd, xid);
-  return (binlog_flush_cache(thd, &cache_mngr->trx_cache, &end_evt,
-                             TRUE));
+  if (xid)
+  {
+    Xid_log_event end_evt(thd, xid, TRUE);
+    return (binlog_flush_cache(thd, cache_mngr, &end_evt, all, TRUE, TRUE));
+  }
+  else
+  {
+    /*
+      Empty xid occurs in XA COMMIT ... ONE PHASE.
+      In this case, we do not have a MySQL xid for the transaction, and the
+      external XA transaction coordinator will have to handle recovery if
+      needed. So we end the transaction with a plain COMMIT query event.
+    */
+    Query_log_event end_evt(thd, STRING_WITH_LEN("COMMIT"),
+                            TRUE, TRUE, TRUE, 0);
+    return (binlog_flush_cache(thd, cache_mngr, &end_evt, all, TRUE, TRUE));
+  }
 }
 
 /**
@@ -1754,11 +1805,11 @@ binlog_truncate_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr, bool all)
   if (ending_trans(thd, all))
   {
     if (cache_mngr->trx_cache.has_incident())
-      error= mysql_bin_log.write_incident(thd, TRUE);
+      error= mysql_bin_log.write_incident(thd);
 
     thd->clear_binlog_table_maps();
 
-    cache_mngr->reset_cache(&cache_mngr->trx_cache);
+    cache_mngr->reset(false, true);
   }
   /*
     If rolling back a statement in a transaction, we truncate the
@@ -1777,7 +1828,7 @@ static int binlog_prepare(handlerton *hton, THD *thd, bool all)
     do nothing.
     just pretend we can do 2pc, so that MySQL won't
     switch to 1pc.
-    real work will be done in MYSQL_BIN_LOG::log_xid()
+    real work will be done in MYSQL_BIN_LOG::log_and_order()
   */
   return 0;
 }
@@ -1810,7 +1861,7 @@ static int binlog_commit(handlerton *hton, THD *thd, bool all)
 
   if (!cache_mngr->stmt_cache.empty())
   {
-    error= binlog_commit_flush_stmt_cache(thd, cache_mngr);
+    error= binlog_commit_flush_stmt_cache(thd, all, cache_mngr);
   }
 
   if (cache_mngr->trx_cache.empty())
@@ -1818,7 +1869,7 @@ static int binlog_commit(handlerton *hton, THD *thd, bool all)
     /*
       we're here because cache_log was flushed in MYSQL_BIN_LOG::log_xid()
     */
-    cache_mngr->reset_cache(&cache_mngr->trx_cache);
+    cache_mngr->reset(false, true);
     DBUG_RETURN(error);
   }
 
@@ -1829,7 +1880,7 @@ static int binlog_commit(handlerton *hton, THD *thd, bool all)
     Otherwise, we accumulate the changes.
   */
   if (!error && ending_trans(thd, all))
-    error= binlog_commit_flush_trx_cache(thd, cache_mngr);
+    error= binlog_commit_flush_trx_cache(thd, all, cache_mngr);
 
   /*
     This is part of the stmt rollback.
@@ -1868,12 +1919,12 @@ static int binlog_rollback(handlerton *hton, THD *thd, bool all)
   */
   if (cache_mngr->stmt_cache.has_incident())
   {
-    error= mysql_bin_log.write_incident(thd, TRUE);
-    cache_mngr->reset_cache(&cache_mngr->stmt_cache);
+    error= mysql_bin_log.write_incident(thd);
+    cache_mngr->reset(true, false);
   }
   else if (!cache_mngr->stmt_cache.empty())
   {
-    error= binlog_commit_flush_stmt_cache(thd, cache_mngr);
+    error= binlog_commit_flush_stmt_cache(thd, all, cache_mngr);
   }
 
   if (cache_mngr->trx_cache.empty())
@@ -1881,7 +1932,7 @@ static int binlog_rollback(handlerton *hton, THD *thd, bool all)
     /*
       we're here because cache_log was flushed in MYSQL_BIN_LOG::log_xid()
     */
-    cache_mngr->reset_cache(&cache_mngr->trx_cache);
+    cache_mngr->reset(false, true);
     DBUG_RETURN(error);
   }
 
@@ -1921,7 +1972,7 @@ static int binlog_rollback(handlerton *hton, THD *thd, bool all)
          (trans_has_updated_non_trans_table(thd) &&
           ending_single_stmt_trans(thd,all) &&
           thd->variables.binlog_format == BINLOG_FORMAT_MIXED)))
-      error= binlog_rollback_flush_trx_cache(thd, cache_mngr);
+      error= binlog_rollback_flush_trx_cache(thd, all, cache_mngr);
     /*
       Truncate the cache if:
         . aborting a single or multi-statement transaction or;
@@ -2038,7 +2089,7 @@ static int binlog_savepoint_set(handlerton *hton, THD *thd, void *sv)
       log_query.append("`"))
     DBUG_RETURN(1);
   int errcode= query_error_code(thd, thd->killed == THD::NOT_KILLED);
-  Query_log_event qinfo(thd, log_query.c_ptr_safe(), log_query.length(),
+  Query_log_event qinfo(thd, log_query.ptr(), log_query.length(),
                         TRUE, FALSE, TRUE, errcode);
   DBUG_RETURN(mysql_bin_log.write(&qinfo));
 }
@@ -2062,7 +2113,7 @@ static int binlog_savepoint_rollback(handlerton *hton, THD *thd, void *sv)
         log_query.append("`"))
       DBUG_RETURN(1);
     int errcode= query_error_code(thd, thd->killed == THD::NOT_KILLED);
-    Query_log_event qinfo(thd, log_query.c_ptr_safe(), log_query.length(),
+    Query_log_event qinfo(thd, log_query.ptr(), log_query.length(),
                           TRUE, FALSE, TRUE, errcode);
     DBUG_RETURN(mysql_bin_log.write(&qinfo));
   }
@@ -2190,6 +2241,7 @@ static int find_uniq_filename(char *name)
   char			*start, *end;
   int                   error= 0;
   DBUG_ENTER("find_uniq_filename");
+  LINT_INIT(number);
 
   length= dirname_part(buff, name, &buf_length);
   start=  name + length;
@@ -2648,7 +2700,6 @@ err:
 
     thd               THD of the query
     current_time      current timestamp
-    query_start_arg   command start timestamp
     user_host         the pointer to the string with user@host info
     user_host_len     length of the user_host string. this is computed once
                       and passed to all general log event handlers
@@ -2670,7 +2721,7 @@ err:
 */
 
 bool MYSQL_QUERY_LOG::write(THD *thd, time_t current_time,
-                            time_t query_start_arg, const char *user_host,
+                            const char *user_host,
                             uint user_host_len, ulonglong query_utime,
                             ulonglong lock_utime, bool is_command,
                             const char *sql_text, uint sql_text_len)
@@ -2847,8 +2898,12 @@ const char *MYSQL_LOG::generate_name(const char *log_name,
 MYSQL_BIN_LOG::MYSQL_BIN_LOG(uint *sync_period)
   :bytes_written(0), prepared_xids(0), file_id(1), open_count(1),
    need_start_event(TRUE),
+   group_commit_queue(0), group_commit_queue_busy(FALSE),
+   num_commits(0), num_group_commits(0),
    sync_period_ptr(sync_period),
    is_relay_log(0), signal_cnt(0),
+   checksum_alg_reset(BINLOG_CHECKSUM_ALG_UNDEF),
+   relay_log_checksum_alg(BINLOG_CHECKSUM_ALG_UNDEF),
    description_event_for_exec(0), description_event_for_queue(0)
 {
   /*
@@ -2896,7 +2951,9 @@ void MYSQL_BIN_LOG::init_pthread_objects()
 {
   MYSQL_LOG::init_pthread_objects();
   mysql_mutex_init(m_key_LOCK_index, &LOCK_index, MY_MUTEX_INIT_SLOW);
+  mysql_mutex_setflags(&LOCK_index, MYF_NO_DEADLOCK_DETECTION);
   mysql_cond_init(m_key_update_cond, &update_cond, 0);
+  mysql_cond_init(m_key_COND_queue_busy, &COND_queue_busy, 0);
 }
 
 
@@ -3081,7 +3138,19 @@ bool MYSQL_BIN_LOG::open(const char *log_name,
         as we won't be able to reset it later
       */
       if (io_cache_type == WRITE_CACHE)
-        s.flags|= LOG_EVENT_BINLOG_IN_USE_F;
+        s.flags |= LOG_EVENT_BINLOG_IN_USE_F;
+      s.checksum_alg= is_relay_log ?
+        /* relay-log */
+        /* inherit master's A descriptor if one has been received */
+        (relay_log_checksum_alg= 
+         (relay_log_checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF) ?
+         relay_log_checksum_alg :
+         /* otherwise use slave's local preference of RL events verification */
+         (opt_slave_sql_verify_checksum == 0) ?
+         (uint8) BINLOG_CHECKSUM_ALG_OFF : (uint8) binlog_checksum_options):
+        /* binlog */
+        (uint8) binlog_checksum_options;
+      DBUG_ASSERT(s.checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF);
       if (!s.is_valid())
         goto err;
       s.dont_set_created= null_created_arg;
@@ -3123,6 +3192,11 @@ bool MYSQL_BIN_LOG::open(const char *log_name,
     if (flush_io_cache(&log_file) ||
         mysql_file_sync(log_file.file, MYF(MY_WME)))
       goto err;
+    mysql_mutex_lock(&LOCK_commit_ordered);
+    strmake(last_commit_pos_file, log_file_name,
+            sizeof(last_commit_pos_file)-1);
+    last_commit_pos_offset= my_b_tell(&log_file);
+    mysql_mutex_unlock(&LOCK_commit_ordered);
 
     if (write_file_name_to_index_file)
     {
@@ -3403,6 +3477,13 @@ bool MYSQL_BIN_LOG::reset_logs(THD* thd)
   ha_reset_logs(thd);
 
   /*
+    We need to get both locks to be sure that no one is trying to
+    write to the index log file.
+  */
+  mysql_mutex_lock(&LOCK_log);
+  mysql_mutex_lock(&LOCK_index);
+
+  /*
     The following mutex is needed to ensure that no threads call
     'delete thd' as we would then risk missing a 'rollback' from this
     thread. If the transaction involved MyISAM tables, it should go
@@ -3410,13 +3491,6 @@ bool MYSQL_BIN_LOG::reset_logs(THD* thd)
   */
   mysql_mutex_lock(&LOCK_thread_count);
 
-  /*
-    We need to get both locks to be sure that no one is trying to
-    write to the index log file.
-  */
-  mysql_mutex_lock(&LOCK_log);
-  mysql_mutex_lock(&LOCK_index);
-
   /* Save variables so that we can reopen the log */
   save_name=name;
   name=0;					// Protect against free
@@ -4257,8 +4331,15 @@ int MYSQL_BIN_LOG::new_file_impl(bool need_lock)
         We log the whole file name for log file as the user may decide
         to change base names at some point.
       */
-      Rotate_log_event r(new_name+dirname_length(new_name),
-                         0, LOG_EVENT_OFFSET, is_relay_log ? Rotate_log_event::RELAY_LOG : 0);
+      Rotate_log_event r(new_name+dirname_length(new_name), 0, LOG_EVENT_OFFSET,
+                         is_relay_log ? Rotate_log_event::RELAY_LOG : 0);
+      /* 
+         The current relay-log's closing Rotate event must have checksum
+         value computed with an algorithm of the last relay-logged FD event.
+      */
+      if (is_relay_log)
+        r.checksum_alg= relay_log_checksum_alg;
+      DBUG_ASSERT(!is_relay_log || relay_log_checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF);
       if(DBUG_EVALUATE_IF("fault_injection_new_file_rotate_event", (error=close_on_error=TRUE), FALSE) ||
          (error= r.write(&log_file)))
       {
@@ -4279,7 +4360,12 @@ int MYSQL_BIN_LOG::new_file_impl(bool need_lock)
   old_name=name;
   name=0;				// Don't free name
   close(LOG_CLOSE_TO_BE_OPENED | LOG_CLOSE_INDEX);
-
+  if (log_type == LOG_BIN && checksum_alg_reset != BINLOG_CHECKSUM_ALG_UNDEF)
+  {
+    DBUG_ASSERT(!is_relay_log);
+    DBUG_ASSERT(binlog_checksum_options != checksum_alg_reset);
+    binlog_checksum_options= checksum_alg_reset;
+  }
   /*
      Note that at this point, log_state != LOG_CLOSED (important for is_open()).
   */
@@ -4421,6 +4507,10 @@ bool MYSQL_BIN_LOG::flush_and_sync(bool *synced)
     err= mysql_file_sync(fd, MYF(MY_WME));
     if (synced)
       *synced= 1;
+#ifndef DBUG_OFF
+    if (opt_binlog_dbug_fsync_sleep > 0)
+      my_sleep(opt_binlog_dbug_fsync_sleep);
+#endif
   }
   return err;
 }
@@ -4681,12 +4771,34 @@ void THD::binlog_set_stmt_begin() {
   cache_mngr->trx_cache.set_prev_position(pos);
 }
 
+static int
+binlog_start_consistent_snapshot(handlerton *hton, THD *thd)
+{
+  int err= 0;
+  DBUG_ENTER("binlog_start_consistent_snapshot");
+
+  thd->binlog_setup_trx_data();
+  binlog_cache_mngr *const cache_mngr=
+    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
+
+  /* Server layer calls us with LOCK_commit_ordered locked, so this is safe. */
+  strmake(cache_mngr->last_commit_pos_file, mysql_bin_log.last_commit_pos_file,
+          sizeof(cache_mngr->last_commit_pos_file)-1);
+  cache_mngr->last_commit_pos_offset= mysql_bin_log.last_commit_pos_offset;
+
+  trans_register_ha(thd, TRUE, hton);
+
+  DBUG_RETURN(err);
+}
 
 /**
   This function writes a table map to the binary log. 
   Note that in order to keep the signature uniform with related methods,
   we use a redundant parameter to indicate whether a transactional table
   was changed or not.
+
+  If with_annotate != NULL and *with_annotate is TRUE,
+  an Annotate_rows event is also written before the table map.
  
   @param table             a pointer to the table.
   @param is_transactional  @c true indicates a transactional table,
@@ -4694,7 +4806,8 @@ void THD::binlog_set_stmt_begin() {
   @return
     nonzero if an error pops up when writing the table map event.
 */
-int THD::binlog_write_table_map(TABLE *table, bool is_transactional)
+int THD::binlog_write_table_map(TABLE *table, bool is_transactional,
+                                my_bool *with_annotate)
 {
   int error;
   DBUG_ENTER("THD::binlog_write_table_map");
@@ -4717,6 +4830,14 @@ int THD::binlog_write_table_map(TABLE *table, bool is_transactional)
 
   IO_CACHE *file=
     cache_mngr->get_binlog_cache_log(use_trans_cache(this, is_transactional));
+  if (with_annotate && *with_annotate)
+  {
+    Annotate_rows_log_event anno(current_thd, is_transactional);
+    /* Annotate event should be written not more than once */
+    *with_annotate= 0;
+    if ((error= anno.write(file)))
+      DBUG_RETURN(error);
+  }
   if ((error= the_event.write(file)))
     DBUG_RETURN(error);
 
@@ -4871,10 +4992,12 @@ MYSQL_BIN_LOG::flush_and_set_pending_rows_event(THD *thd,
 }
 
 /**
-  Write an event to the binary log.
+  Write an event to the binary log. If with_annotate != NULL and
+  *with_annotate is TRUE, an Annotate_rows event is also written before the
+  event (this should happen only if the event is a Table_map).
 */
 
-bool MYSQL_BIN_LOG::write(Log_event *event_info)
+bool MYSQL_BIN_LOG::write(Log_event *event_info, my_bool *with_annotate)
 {
   THD *thd= event_info->thd;
   bool error= 1;
@@ -4962,9 +5085,22 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info)
        of the SQL command. If row-based binlogging, Insert_id, Rand
        and other kind of "setting context" events are not needed.
     */
+
+    if (with_annotate && *with_annotate)
+    {
+      DBUG_ASSERT(event_info->get_type_code() == TABLE_MAP_EVENT);
+      Annotate_rows_log_event anno(thd, event_info->cache_type);
+      /* Annotate event should be written not more than once */
+      *with_annotate= 0;
+      if (anno.write(file))
+        goto err;
+    }
+
+    if (thd)
     {
       if (!thd->is_current_stmt_binlog_format_row())
       {
+
         if (thd->stmt_depends_on_first_successful_insert_id_in_prev_stmt)
         {
           Intvar_log_event e(thd,(uchar) LAST_INSERT_ID_EVENT,
@@ -5026,6 +5162,8 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info)
 err:
     if (event_info->use_direct_logging())
     {
+      my_off_t offset= my_b_tell(file);
+
       if (!error)
       {
         bool synced;
@@ -5034,7 +5172,7 @@ err:
           goto unlock;
 
         status_var_add(thd->status_var.binlog_bytes_written,
-                       my_b_tell(file) - my_org_b_tell);
+                       offset - my_org_b_tell);
 
         if ((error= RUN_HOOK(binlog_storage, after_flush,
                  (thd, log_file_name, file->pos_in_file, synced))))
@@ -5045,7 +5183,15 @@ err:
         signal_update();
         rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED);
       }
+
 unlock:
+      /*
+        Take mutex to protect against a reader seeing partial writes of 64-bit
+        offset on 32-bit CPUs.
+      */
+      mysql_mutex_lock(&LOCK_commit_ordered);
+      last_commit_pos_offset= offset;
+      mysql_mutex_unlock(&LOCK_commit_ordered);
       mysql_mutex_unlock(&LOCK_log);
     }
 
@@ -5162,12 +5308,14 @@ int MYSQL_BIN_LOG::rotate_and_purge(uint flags)
          We give it a shot and try to write an incident event anyway
          to the current log. 
       */
-      if (!write_incident(current_thd, FALSE))
+      if (!write_incident_already_locked(current_thd))
         flush_and_sync(0);
 
 #ifdef HAVE_REPLICATION
     check_purge= true;
 #endif
+    if (flags & RP_BINLOG_CHECKSUM_ALG_CHANGE)
+      checksum_alg_reset= BINLOG_CHECKSUM_ALG_UNDEF; // done
   }
   if (!(flags & RP_LOCK_LOG_IS_ALREADY_LOCKED))
     mysql_mutex_unlock(&LOCK_log);
@@ -5196,6 +5344,33 @@ uint MYSQL_BIN_LOG::next_file_id()
 }
 
 
+/**
+  Calculate checksum of possibly a part of an event containing at least
+  the whole common header.
+
+  @param    buf       the pointer to trans cache's buffer
+  @param    off       the offset of the beginning of the event in the buffer
+  @param    event_len no-checksum length of the event
+  @param    length    the current size of the buffer
+
+  @param    crc       [in-out] the checksum
+
+  Event size is incremented by @c BINLOG_CHECKSUM_LEN.
+
+  @return 0 or the number of not yet processed bytes of the event, excluding
+            the checksum part.
+*/
+  static ulong fix_log_event_crc(uchar *buf, uint off, uint event_len,
+                                 uint length, ha_checksum *crc)
+{
+  ulong ret;
+  uchar *event_begin= buf + off;
+
+  ret= length >= off + event_len ? 0 : off + event_len - length;
+  *crc= my_checksum(*crc, event_begin, event_len - ret); 
+  return ret;
+}
+
 /*
   Write the contents of a cache to the binary log.
 
@@ -5203,24 +5378,33 @@ uint MYSQL_BIN_LOG::next_file_id()
     write_cache()
     thd      Current_thread
     cache    Cache to write to the binary log
-    lock_log True if the LOCK_log mutex should be aquired, false otherwise
-    sync_log True if the log should be flushed and synced
 
   DESCRIPTION
     Write the contents of the cache to the binary log. The cache will
     be reset as a READ_CACHE to be able to read the contents from it.
- */
 
-int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache, bool lock_log,
-                               bool sync_log)
-{
-  Mutex_sentry sentry(lock_log ? &LOCK_log : NULL);
+    While reading from the trans cache, a checksum value is possibly added
+    (per @c binlog_checksum_options), and the length and the end_log_pos of
+    events are fixed before the binlog cache is filled in.
+*/
 
+int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache)
+{
+  mysql_mutex_assert_owner(&LOCK_log);
   if (reinit_io_cache(cache, READ_CACHE, 0, 0, 0))
     return ER_ERROR_ON_WRITE;
   uint length= my_b_bytes_in_cache(cache), group, carry, hdr_offs;
+  ulong remains= 0; // part of the event's net length not yet processed
   long val;
+  ulong end_log_pos_inc= 0; // each processed event adds BINLOG_CHECKSUM_LEN
   uchar header[LOG_EVENT_HEADER_LEN];
+  ha_checksum crc= 0, crc_0= 0; // assignments to keep compiler happy
+  my_bool do_checksum= (binlog_checksum_options != BINLOG_CHECKSUM_ALG_OFF);
+  uchar buf[BINLOG_CHECKSUM_LEN];
+
+  // while there is just one alg the following must hold:
+  DBUG_ASSERT(!do_checksum ||
+              binlog_checksum_options == BINLOG_CHECKSUM_ALG_CRC32);
 
   /*
     The events in the buffer have incorrect end_log_pos data
@@ -5238,6 +5422,8 @@ int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache, bool lock_log,
 
   group= (uint)my_b_tell(&log_file);
   hdr_offs= carry= 0;
+  if (do_checksum)
+    crc= crc_0= my_checksum(0L, NULL, 0);
 
   do
   {
@@ -5250,12 +5436,21 @@ int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache, bool lock_log,
       DBUG_ASSERT(carry < LOG_EVENT_HEADER_LEN);
 
       /* assemble both halves */
-      memcpy(&header[carry], (char *)cache->read_pos, LOG_EVENT_HEADER_LEN - carry);
+      memcpy(&header[carry], (char *)cache->read_pos,
+             LOG_EVENT_HEADER_LEN - carry);
 
       /* fix end_log_pos */
-      val= uint4korr(&header[LOG_POS_OFFSET]) + group;
+      val= uint4korr(&header[LOG_POS_OFFSET]) + group +
+        (end_log_pos_inc+= (do_checksum ? BINLOG_CHECKSUM_LEN : 0));
       int4store(&header[LOG_POS_OFFSET], val);
 
+      if (do_checksum)
+      {
+        ulong len= uint4korr(&header[EVENT_LEN_OFFSET]);
+        /* fix len */
+        int4store(&header[EVENT_LEN_OFFSET], len + BINLOG_CHECKSUM_LEN);
+      }
+
       /* write the first half of the split header */
       if (my_b_write(&log_file, header, carry))
         return ER_ERROR_ON_WRITE;
@@ -5265,11 +5460,20 @@ int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache, bool lock_log,
         copy fixed second half of header to cache so the correct
         version will be written later.
       */
-      memcpy((char *)cache->read_pos, &header[carry], LOG_EVENT_HEADER_LEN - carry);
+      memcpy((char *)cache->read_pos, &header[carry],
+             LOG_EVENT_HEADER_LEN - carry);
 
       /* next event header at ... */
-      hdr_offs = uint4korr(&header[EVENT_LEN_OFFSET]) - carry;
+      hdr_offs= uint4korr(&header[EVENT_LEN_OFFSET]) - carry -
+        (do_checksum ? BINLOG_CHECKSUM_LEN : 0);
 
+      if (do_checksum)
+      {
+        DBUG_ASSERT(crc == crc_0 && remains == 0);
+        crc= my_checksum(crc, header, carry);
+        remains= uint4korr(header + EVENT_LEN_OFFSET) - carry -
+          BINLOG_CHECKSUM_LEN;
+      }
       carry= 0;
     }
 
@@ -5284,6 +5488,25 @@ int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache, bool lock_log,
         very next iteration, just "eventually").
       */
 
+      /* crc-calc the whole buffer */
+      if (do_checksum && hdr_offs >= length)
+      {
+
+        DBUG_ASSERT(remains != 0 && crc != crc_0);
+
+        crc= my_checksum(crc, cache->read_pos, length); 
+        remains -= length;
+        if (my_b_write(&log_file, cache->read_pos, length))
+          return ER_ERROR_ON_WRITE;
+        if (remains == 0)
+        {
+          int4store(buf, crc);
+          if (my_b_write(&log_file, buf, BINLOG_CHECKSUM_LEN))
+            return ER_ERROR_ON_WRITE;
+          crc= crc_0;
+        }
+      }
+
       while (hdr_offs < length)
       {
         /*
@@ -5291,6 +5514,26 @@ int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache, bool lock_log,
           we get the rest.
         */
 
+        if (do_checksum)
+        {
+          if (remains != 0)
+          {
+            /*
+              finish off with remains of the last event that crawls
+              from previous into the current buffer
+            */
+            DBUG_ASSERT(crc != crc_0);
+            crc= my_checksum(crc, cache->read_pos, hdr_offs);
+            int4store(buf, crc);
+            remains -= hdr_offs;
+            DBUG_ASSERT(remains == 0);
+            if (my_b_write(&log_file, cache->read_pos, hdr_offs) ||
+                my_b_write(&log_file, buf, BINLOG_CHECKSUM_LEN))
+              return ER_ERROR_ON_WRITE;
+            crc= crc_0;
+          }
+        }
+
         if (hdr_offs + LOG_EVENT_HEADER_LEN > length)
         {
           carry= length - hdr_offs;
@@ -5300,17 +5543,38 @@ int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache, bool lock_log,
         else
         {
           /* we've got a full event-header, and it came in one piece */
-
-          uchar *log_pos= (uchar *)cache->read_pos + hdr_offs + LOG_POS_OFFSET;
+          uchar *ev= (uchar *)cache->read_pos + hdr_offs;
+          uint event_len= uint4korr(ev + EVENT_LEN_OFFSET); // netto len
+          uchar *log_pos= ev + LOG_POS_OFFSET;
 
           /* fix end_log_pos */
-          val= uint4korr(log_pos) + group;
+          val= uint4korr(log_pos) + group +
+            (end_log_pos_inc += (do_checksum ? BINLOG_CHECKSUM_LEN : 0));
           int4store(log_pos, val);
 
+	  /* fix CRC */
+	  if (do_checksum)
+          {
+            /* fix length */
+            int4store(ev + EVENT_LEN_OFFSET, event_len + BINLOG_CHECKSUM_LEN);
+            remains= fix_log_event_crc(cache->read_pos, hdr_offs, event_len,
+                                       length, &crc);
+            if (my_b_write(&log_file, ev, 
+                           remains == 0 ? event_len : length - hdr_offs))
+              return ER_ERROR_ON_WRITE;
+            if (remains == 0)
+            {
+              int4store(buf, crc);
+              if (my_b_write(&log_file, buf, BINLOG_CHECKSUM_LEN))
+                return ER_ERROR_ON_WRITE;
+              crc= crc_0; // crc is complete
+            }
+          }
+
           /* next event header at ... */
-          log_pos= (uchar *)cache->read_pos + hdr_offs + EVENT_LEN_OFFSET;
-          hdr_offs += uint4korr(log_pos);
+          hdr_offs += event_len; // incr by the netto len
 
+          DBUG_ASSERT(!do_checksum || remains == 0 || hdr_offs >= length);
         }
       }
 
@@ -5326,17 +5590,19 @@ int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache, bool lock_log,
     }
 
     /* Write data to the binary log file */
-    if (my_b_write(&log_file, cache->read_pos, length))
-      return ER_ERROR_ON_WRITE;
+    DBUG_EXECUTE_IF("fail_binlog_write_1",
+                    errno= 28; return ER_ERROR_ON_WRITE;);
+    if (!do_checksum)
+      if (my_b_write(&log_file, cache->read_pos, length))
+        return ER_ERROR_ON_WRITE;
     status_var_add(thd->status_var.binlog_bytes_written, length);
 
     cache->read_pos=cache->read_end;		// Mark buffer used up
   } while ((length= my_b_fill(cache)));
 
   DBUG_ASSERT(carry == 0);
-
-  if (sync_log)
-    return flush_and_sync(0);
+  DBUG_ASSERT(!do_checksum || remains == 0);
+  DBUG_ASSERT(!do_checksum || crc == crc_0);
 
   return 0;                                     // All OK
 }
@@ -5370,31 +5636,50 @@ int query_error_code(THD *thd, bool not_killed)
   return error;
 }
 
-bool MYSQL_BIN_LOG::write_incident(THD *thd, bool lock)
+
+bool MYSQL_BIN_LOG::write_incident_already_locked(THD *thd)
 {
   uint error= 0;
-  DBUG_ENTER("MYSQL_BIN_LOG::write_incident");
-
-  if (!is_open())
-    DBUG_RETURN(error);
-
-  LEX_STRING const write_error_msg=
-    { C_STRING_WITH_LEN("error writing to the binary log") };
+  DBUG_ENTER("MYSQL_BIN_LOG::write_incident_already_locked");
   Incident incident= INCIDENT_LOST_EVENTS;
   Incident_log_event ev(thd, incident, write_error_msg);
-  if (lock)
-    mysql_mutex_lock(&LOCK_log);
-  error= ev.write(&log_file);
-  status_var_add(thd->status_var.binlog_bytes_written, ev.data_written);
-  if (lock)
+
+  if (likely(is_open()))
+  {
+    error= ev.write(&log_file);
+    status_var_add(thd->status_var.binlog_bytes_written, ev.data_written);
+  }
+
+  DBUG_RETURN(error);
+}
+
+
+bool MYSQL_BIN_LOG::write_incident(THD *thd)
+{
+  uint error= 0;
+  my_off_t offset;
+  DBUG_ENTER("MYSQL_BIN_LOG::write_incident");
+
+  mysql_mutex_lock(&LOCK_log);
+  if (likely(is_open()))
   {
-    if (!error && !(error= flush_and_sync(0)))
+    if (!(error= write_incident_already_locked(thd)) &&
+        !(error= flush_and_sync(0)))
     {
       signal_update();
       error= rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED);
     }
-    mysql_mutex_unlock(&LOCK_log);
+    offset= my_b_tell(&log_file);
+    /*
+      Take mutex to protect against a reader seeing partial writes of 64-bit
+      offset on 32-bit CPUs.
+    */
+    mysql_mutex_lock(&LOCK_commit_ordered);
+    last_commit_pos_offset= offset;
+    mysql_mutex_unlock(&LOCK_commit_ordered);
   }
+  mysql_mutex_unlock(&LOCK_log);
+
   DBUG_RETURN(error);
 }
 
@@ -5422,111 +5707,425 @@ bool MYSQL_BIN_LOG::write_incident(THD *thd, bool lock)
     'cache' needs to be reinitialized after this functions returns.
 */
 
-bool MYSQL_BIN_LOG::write(THD *thd, IO_CACHE *cache, Log_event *commit_event,
-                          bool incident)
+bool
+MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd,
+                                           binlog_cache_mngr *cache_mngr,
+                                           Log_event *end_ev, bool all,
+                                           bool using_stmt_cache,
+                                           bool using_trx_cache)
+{
+  group_commit_entry entry;
+  DBUG_ENTER("MYSQL_BIN_LOG::write_transaction_to_binlog");
+
+  /* Describe this commit in an entry for the group commit queue.  The
+     incident event is filled in last, once we know if one is needed. */
+  entry.error= 0;
+  entry.thd= thd;
+  entry.all= all;
+  entry.cache_mngr= cache_mngr;
+  entry.using_trx_cache= using_trx_cache;
+  entry.using_stmt_cache= using_stmt_cache;
+  entry.end_event= end_ev;
+
+  /*
+    Every transaction (a BEGIN..COMMIT block, or a single statement in
+    autocommit mode) is logged with a leading "BEGIN" event.
+
+    The events are created here, where the correct THD and thread context
+    are available: because of group commit, the thread that actually
+    writes them to the binlog may be a different one.
+  */
+  Query_log_event begin_ev(thd, STRING_WITH_LEN("BEGIN"), using_trx_cache,
+                           TRUE, TRUE, 0);
+  entry.begin_event= &begin_ev;
+
+  if (!cache_mngr->stmt_cache.has_incident() &&
+      !cache_mngr->trx_cache.has_incident())
+  {
+    entry.incident_event= NULL;
+    DBUG_RETURN(write_transaction_to_binlog_events(&entry));
+  }
+
+  /* Some cache has an incident: queue an incident event as well. */
+  Incident_log_event inc_ev(thd, INCIDENT_LOST_EVENTS, write_error_msg);
+  entry.incident_event= &inc_ev;
+  DBUG_RETURN(write_transaction_to_binlog_events(&entry));
+}
+
+bool
+MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry)
 {
-  DBUG_ENTER("MYSQL_BIN_LOG::write(THD *, IO_CACHE *, Log_event *)");
+  /*
+    To facilitate group commit for the binlog, we first queue up ourselves in
+    the group commit queue. Then the first thread to enter the queue waits for
+    the LOCK_log mutex, and commits for everyone in the queue once it gets the
+    lock. Any other threads in the queue just wait for the first one to finish
+    the commit and wake them up.
+  */
+
+  entry->thd->clear_wakeup_ready();
+  mysql_mutex_lock(&LOCK_prepare_ordered);
+  group_commit_entry *orig_queue= group_commit_queue;
+  entry->next= orig_queue;
+  group_commit_queue= entry;
+
+  if (entry->cache_mngr->using_xa)
+  {
+    DEBUG_SYNC(entry->thd, "commit_before_prepare_ordered");
+    run_prepare_ordered(entry->thd, entry->all);
+    DEBUG_SYNC(entry->thd, "commit_after_prepare_ordered");
+  }
+  mysql_mutex_unlock(&LOCK_prepare_ordered);
+  DEBUG_SYNC(entry->thd, "commit_after_release_LOCK_prepare_ordered");
+
+  /*
+    The first in the queue handles group commit for all; the others just wait
+    to be signalled when group commit is done.
+  */
+  if (orig_queue != NULL)
+    entry->thd->wait_for_wakeup_ready();
+  else
+    trx_group_commit_leader(entry);
+
+  if (!opt_optimize_thread_scheduling)
+  {
+    /* For the leader, trx_group_commit_leader() already took the lock. */
+    if (orig_queue != NULL)
+      mysql_mutex_lock(&LOCK_commit_ordered);
+
+    DEBUG_SYNC(entry->thd, "commit_loop_entry_commit_ordered");
+    ++num_commits;
+    if (entry->cache_mngr->using_xa && !entry->error)
+      run_commit_ordered(entry->thd, entry->all);
+
+    group_commit_entry *next= entry->next;
+    if (!next)
+    {
+      group_commit_queue_busy= FALSE;
+      mysql_cond_signal(&COND_queue_busy);
+      DEBUG_SYNC(entry->thd, "commit_after_group_run_commit_ordered");
+    }
+    mysql_mutex_unlock(&LOCK_commit_ordered);
+
+    if (next)
+    {
+      next->thd->signal_wakeup_ready();
+    }
+  }
+
+  if (likely(!entry->error))
+    return 0;
+
+  switch (entry->error)
+  {
+  case ER_ERROR_ON_WRITE:
+    my_error(ER_ERROR_ON_WRITE, MYF(ME_NOREFRESH), name, entry->commit_errno);
+    break;
+  case ER_ERROR_ON_READ:
+    my_error(ER_ERROR_ON_READ, MYF(ME_NOREFRESH),
+             entry->error_cache->file_name, entry->commit_errno);
+    break;
+  default:
+    /*
+      There are not (and should not be) any errors thrown not covered above.
+      But just in case one is added later without updating the above switch
+      statement, include a catch-all.
+    */
+    my_printf_error(entry->error,
+                    "Error writing transaction to binary log: %d",
+                    MYF(ME_NOREFRESH), entry->error);
+  }
+
+  /*
+    Since we return error, this transaction XID will not be committed, so
+    we need to mark it as not needed for recovery (unlog() is not called
+    for a transaction if log_xid() fails).
+  */
+  if (entry->cache_mngr->using_xa && entry->cache_mngr->xa_xid)
+    mark_xid_done();
+
+  return 1;
+}
+
+/*
+  Do binlog group commit as the lead thread.
+
+  This must be called when this statement/transaction is queued at the start of
+  the group_commit_queue. It will wait to obtain the LOCK_log mutex, then group
+  commit all the transactions in the queue (more may have entered while waiting
+  for LOCK_log). After commit is done, all other threads in the queue will be
+  signalled.
+
+ */
+void
+MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
+{
+  uint xid_count= 0;
+  my_off_t commit_offset;
+  group_commit_entry *current;
+  group_commit_entry *last_in_queue;
+  DBUG_ENTER("MYSQL_BIN_LOG::trx_group_commit_leader");
+  LINT_INIT(commit_offset);
+
+  /*
+    Lock LOCK_log, and once we get it, collect any additional writes
+    that queued up while we were waiting.
+  */
   mysql_mutex_lock(&LOCK_log);
+  DEBUG_SYNC(leader->thd, "commit_after_get_LOCK_log");
+
+  mysql_mutex_lock(&LOCK_prepare_ordered);
+  current= group_commit_queue;
+  group_commit_queue= NULL;
+  mysql_mutex_unlock(&LOCK_prepare_ordered);
 
+  /* As the queue is in reverse order of entering, reverse it. */
+  group_commit_entry *queue= NULL;
+  last_in_queue= current;
+  while (current)
+  {
+    group_commit_entry *next= current->next;
+    current->next= queue;
+    queue= current;
+    current= next;
+  }
+  DBUG_ASSERT(leader == queue /* the leader should be first in queue */);
+
+  /* Now we have in queue the list of transactions to be committed in order. */
   DBUG_ASSERT(is_open());
   if (likely(is_open()))                       // Should always be true
   {
     /*
-      We only bother to write to the binary log if there is anything
-      to write.
-     */
-    if (my_b_tell(cache) > 0)
+      Commit every transaction in the queue.
+
+      Note that we are doing this in a different thread than the one running
+      the transaction! So we are limited in the operations we can do. In
+      particular, we cannot call my_error() on behalf of a transaction, as
+      that obtains the THD from thread local storage. Instead, we must set
+      current->error and let the thread do the error reporting itself once
+      we wake it up.
+    */
+    for (current= queue; current != NULL; current= current->next)
     {
+      binlog_cache_mngr *cache_mngr= current->cache_mngr;
+
       /*
-        Log "BEGIN" at the beginning of every transaction.  Here, a
-        transaction is either a BEGIN..COMMIT block or a single
-        statement in autocommit mode.
+        We already checked before that at least one cache is non-empty; if both
+        are empty we would have skipped calling into here.
       */
-      Query_log_event qinfo(thd, STRING_WITH_LEN("BEGIN"), TRUE, FALSE, TRUE, 0);
-      if (qinfo.write(&log_file))
-        goto err;
+      DBUG_ASSERT(!cache_mngr->stmt_cache.empty() || !cache_mngr->trx_cache.empty());
 
-      status_var_add(thd->status_var.binlog_bytes_written, qinfo.data_written);
+      current->error= write_transaction_or_stmt(current);
 
-      DBUG_EXECUTE_IF("crash_before_writing_xid",
-                      {
-                        if ((write_error= write_cache(thd, cache, FALSE,
-                                                      TRUE)))
-                          DBUG_PRINT("info", ("error writing binlog cache: %d",
-                                               write_error));
-                        DBUG_PRINT("info", ("crashing before writing xid"));
-                        DBUG_SUICIDE();
-                      });
-
-      if ((write_error= write_cache(thd, cache, FALSE, FALSE)))
-        goto err;
+      strmake(cache_mngr->last_commit_pos_file, log_file_name,
+              sizeof(cache_mngr->last_commit_pos_file)-1);
+      commit_offset= my_b_write_tell(&log_file);
+      cache_mngr->last_commit_pos_offset= commit_offset;
+      if (cache_mngr->using_xa && cache_mngr->xa_xid)
+        xid_count++;
+    }
 
-      if (commit_event)
+    bool synced= 0;
+    if (flush_and_sync(&synced))
+    {
+      for (current= queue; current != NULL; current= current->next)
       {
-        if (commit_event->write(&log_file))
-          goto err;
-        status_var_add(thd->status_var.binlog_bytes_written,
-                       commit_event->data_written);
+        if (!current->error)
+        {
+          current->error= ER_ERROR_ON_WRITE;
+          current->commit_errno= errno;
+          current->error_cache= NULL;
+        }
       }
-
-      if (incident && write_incident(thd, FALSE))
-        goto err;
-
-      bool synced= 0;
-      if (flush_and_sync(&synced))
-        goto err;
-      DBUG_EXECUTE_IF("half_binlogged_transaction", DBUG_SUICIDE(););
-      if (cache->error)				// Error on read
+    }
+    else
+    {
+      bool any_error= false;
+      bool all_error= true;
+      for (current= queue; current != NULL; current= current->next)
       {
-        sql_print_error(ER(ER_ERROR_ON_READ), cache->file_name, errno);
-        write_error=1;				// Don't give more errors
-        goto err;
+        if (!current->error &&
+            RUN_HOOK(binlog_storage, after_flush,
+                (current->thd, log_file_name, log_file.pos_in_file, synced)))
+        {
+          current->error= ER_ERROR_ON_WRITE;
+          current->commit_errno= -1;
+          current->error_cache= NULL;
+          any_error= true;
+        }
+        else
+          all_error= false;
       }
 
-      if (RUN_HOOK(binlog_storage, after_flush,
-                   (thd, log_file_name, log_file.pos_in_file, synced)))
-      {
+      if (any_error)
         sql_print_error("Failed to run 'after_flush' hooks");
-        write_error=1;
-        goto err;
-      }
-
-      signal_update();
+      if (!all_error)
+        signal_update();
     }
 
     /*
-      if commit_event is Xid_log_event, increase the number of
-      prepared_xids (it's decreasd in ::unlog()). Binlog cannot be rotated
+      if any commit_events are Xid_log_event, increase the number of
+      prepared_xids (it's decreased in ::unlog()). Binlog cannot be rotated
       if there're prepared xids in it - see the comment in new_file() for
       an explanation.
-      If the commit_event is not Xid_log_event (then it's a Query_log_event)
-      rotate binlog, if necessary.
+      If no Xid_log_events (then it's all Query_log_event) rotate binlog,
+      if necessary.
     */
-    if (commit_event && commit_event->get_type_code() == XID_EVENT)
+    if (xid_count > 0)
     {
-      mysql_mutex_lock(&LOCK_prep_xids);
-      prepared_xids++;
-      mysql_mutex_unlock(&LOCK_prep_xids);
+      mark_xids_active(xid_count);
     }
     else
+    {
       if (rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED))
-        goto err;
+      {
+        /*
+          If we fail to rotate, which thread should get the error?
+          We give the error to the *last* transaction thread; that seems to
+          make the most sense, as it was the last to write to the log.
+        */
+        last_in_queue->error= ER_ERROR_ON_WRITE;
+        last_in_queue->commit_errno= errno;
+      }
+    }
   }
+
+  DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_commit_ordered");
+  mysql_mutex_lock(&LOCK_commit_ordered);
+  last_commit_pos_offset= commit_offset;
+  /*
+    We cannot unlock LOCK_log until we have locked LOCK_commit_ordered;
+    otherwise scheduling could allow the next group commit to run ahead of us,
+    messing up the order of commit_ordered() calls. But as soon as
+    LOCK_commit_ordered is obtained, we can let the next group commit start.
+  */
   mysql_mutex_unlock(&LOCK_log);
+  DEBUG_SYNC(leader->thd, "commit_after_release_LOCK_log");
+  ++num_group_commits;
 
-  DBUG_RETURN(0);
+  if (!opt_optimize_thread_scheduling)
+  {
+    /*
+      If we want to run commit_ordered() each in the transaction's own thread
+      context, then we need to mark the queue reserved; we need to finish all
+      threads in one group commit before the next group commit can be allowed
+      to proceed, and we cannot unlock a simple pthreads mutex in a different
+      thread from the one that locked it.
+    */
 
-err:
-  if (!write_error)
+    while (group_commit_queue_busy)
+      mysql_cond_wait(&COND_queue_busy, &LOCK_commit_ordered);
+    group_commit_queue_busy= TRUE;
+
+    /* Note that we return with LOCK_commit_ordered locked! */
+    DBUG_VOID_RETURN;
+  }
+
+  /*
+    Wakeup each participant waiting for our group commit, first calling the
+    commit_ordered() methods for any transactions doing 2-phase commit.
+  */
+  current= queue;
+  while (current != NULL)
   {
-    write_error= 1;
-    sql_print_error(ER(ER_ERROR_ON_WRITE), name, errno);
+    group_commit_entry *next;
+
+    DEBUG_SYNC(leader->thd, "commit_loop_entry_commit_ordered");
+    ++num_commits;
+    if (current->cache_mngr->using_xa && !current->error)
+      run_commit_ordered(current->thd, current->all);
+
+    /*
+      Careful not to access current->next after waking up the other thread! As
+      it may change immediately after wakeup.
+    */
+    next= current->next;
+    if (current != leader)                      // Don't wake up ourself
+      current->thd->signal_wakeup_ready();
+    current= next;
   }
-  mysql_mutex_unlock(&LOCK_log);
-  DBUG_RETURN(1);
+  DEBUG_SYNC(leader->thd, "commit_after_group_run_commit_ordered");
+  mysql_mutex_unlock(&LOCK_commit_ordered);
+
+  DBUG_VOID_RETURN;
 }
 
 
+/*
+  Write one queued transaction/statement to the binlog: BEGIN event, used
+  caches, end event and optional incident event.  Returns 0 on success; on
+  failure returns the error code, with errno saved in entry->commit_errno and
+  the failing cache (or NULL) in entry->error_cache for later reporting.
+*/
+int
+MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry)
+{
+  binlog_cache_mngr *mngr= entry->cache_mngr;
+
+  /* A write error is not tied to any cache until a cache write fails. */
+  entry->error_cache= NULL;
+
+  if (entry->begin_event->write(&log_file))
+    goto write_error;
+  status_var_add(entry->thd->status_var.binlog_bytes_written,
+                 entry->begin_event->data_written);
+
+  if (entry->using_stmt_cache && !mngr->stmt_cache.empty() &&
+      write_cache(entry->thd, mngr->get_binlog_cache_log(FALSE)))
+  {
+    entry->error_cache= &mngr->stmt_cache.cache_log;
+    goto write_error;
+  }
+
+  if (entry->using_trx_cache && !mngr->trx_cache.empty())
+  {
+    DBUG_EXECUTE_IF("crash_before_writing_xid",
+                    {
+                      if ((write_cache(entry->thd,
+                                       mngr->get_binlog_cache_log(TRUE))))
+                        DBUG_PRINT("info", ("error writing binlog cache"));
+                      else
+                        flush_and_sync(0);
+
+                      DBUG_PRINT("info", ("crashing before writing xid"));
+                      DBUG_SUICIDE();
+                    });
+
+    if (write_cache(entry->thd, mngr->get_binlog_cache_log(TRUE)))
+    {
+      entry->error_cache= &mngr->trx_cache.cache_log;
+      goto write_error;
+    }
+  }
+
+  if (entry->end_event->write(&log_file))
+    goto write_error;
+  status_var_add(entry->thd->status_var.binlog_bytes_written,
+                 entry->end_event->data_written);
+
+  if (entry->incident_event && entry->incident_event->write(&log_file))
+    goto write_error;
+
+  if (mngr->get_binlog_cache_log(FALSE)->error) // Error on read
+  {
+    entry->error_cache= &mngr->stmt_cache.cache_log;
+    entry->commit_errno= errno;
+    return ER_ERROR_ON_READ;
+  }
+  if (mngr->get_binlog_cache_log(TRUE)->error)  // Error on read
+  {
+    entry->error_cache= &mngr->trx_cache.cache_log;
+    entry->commit_errno= errno;
+    return ER_ERROR_ON_READ;
+  }
+
+  return 0;
+
+write_error:
+  entry->commit_errno= errno;
+  return ER_ERROR_ON_WRITE;
+}
+
 /**
   Wait until we get a signal that the relay log has been updated.
 
@@ -5608,6 +6207,11 @@ void MYSQL_BIN_LOG::close(uint exiting)
 	(exiting & LOG_CLOSE_STOP_EVENT))
     {
       Stop_log_event s;
+      // the checksumming rule for relay-log case is similar to Rotate
+        s.checksum_alg= is_relay_log ?
+          (uint8) relay_log_checksum_alg : (uint8) binlog_checksum_options;
+      DBUG_ASSERT(!is_relay_log ||
+                  relay_log_checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF);
       s.write(&log_file);
       bytes_written+= s.data_written;
       signal_update();
@@ -5724,9 +6328,23 @@ static bool test_if_number(register const char *str,
 
 void sql_perror(const char *message)
 {
-#ifdef HAVE_STRERROR
+#if defined(_WIN32)
+  char* buf;
+  DWORD dw= GetLastError();
+  if (FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER |  FORMAT_MESSAGE_FROM_SYSTEM |
+        FORMAT_MESSAGE_IGNORE_INSERTS,  NULL, dw,
+        MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL ) > 0)
+  {
+    sql_print_error("%s: %s",message, buf);
+    LocalFree((HLOCAL)buf);
+  }
+  else
+  {
+    sql_print_error("%s", message);
+  }
+#elif defined(HAVE_STRERROR)
   sql_print_error("%s: %s",message, strerror(errno));
-#else
+#else 
   perror(message);
 #endif
 }
@@ -5931,6 +6549,148 @@ void sql_print_information(const char *format, ...)
 }
 
 
+/* Call prepare_ordered() for every handlerton in the ha_list that has it. */
+void
+TC_LOG::run_prepare_ordered(THD *thd, bool all)
+{
+  mysql_mutex_assert_owner(&LOCK_prepare_ordered);
+  Ha_trx_info *trx_info= all ? thd->transaction.all.ha_list
+                             : thd->transaction.stmt.ha_list;
+  while (trx_info)
+  {
+    handlerton *engine= trx_info->ht();
+    if (engine->prepare_ordered)
+      engine->prepare_ordered(engine, thd, all);
+    trx_info= trx_info->next();
+  }
+}
+
+
+void
+TC_LOG::run_commit_ordered(THD *thd, bool all)
+{
+  mysql_mutex_assert_owner(&LOCK_commit_ordered);
+  Ha_trx_info *trx_info= all ? thd->transaction.all.ha_list
+                             : thd->transaction.stmt.ha_list;
+  for (; trx_info; trx_info= trx_info->next())
+  {
+    handlerton *engine= trx_info->ht();
+    if (engine->commit_ordered)           // the hook is optional
+    {
+      engine->commit_ordered(engine, thd, all);
+      DEBUG_SYNC(thd, "commit_after_run_commit_ordered");
+    }
+  }
+}
+
+
+int TC_LOG_MMAP::log_and_order(THD *thd, my_xid xid, bool all,
+                               bool need_prepare_ordered,
+                               bool need_commit_ordered)
+{
+  int cookie;
+  struct commit_entry entry;
+  bool is_group_commit_leader;
+  LINT_INIT(is_group_commit_leader);
+
+  if (need_prepare_ordered)
+  {
+    mysql_mutex_lock(&LOCK_prepare_ordered);
+    run_prepare_ordered(thd, all);
+    if (need_commit_ordered)
+    {
+      /*
+        Must put us in queue so we can run_commit_ordered() in same sequence
+        as we did run_prepare_ordered().
+      */
+      thd->clear_wakeup_ready();
+      entry.thd= thd;
+      commit_entry *previous_queue= commit_ordered_queue;
+      entry.next= previous_queue;
+      commit_ordered_queue= &entry;
+      is_group_commit_leader= (previous_queue == NULL);
+    }
+    mysql_mutex_unlock(&LOCK_prepare_ordered);
+  }
+
+  cookie= 0;
+  if (xid)
+    cookie= log_one_transaction(xid);
+
+  if (need_commit_ordered)
+  {
+    if (need_prepare_ordered)
+    {
+      /*
+        We did run_prepare_ordered() serialised, then ran log_one_transaction()
+        in parallel. Now we have to do run_commit_ordered() serialised in the
+        same sequence as run_prepare_ordered().
+
+        We do this starting from the head of the queue, each thread doing
+        run_commit_ordered() and signalling the next in queue.
+      */
+      if (is_group_commit_leader)
+      {
+        /* The first in queue starts the ball rolling. */
+        mysql_mutex_lock(&LOCK_prepare_ordered);
+        while (commit_ordered_queue_busy)
+          mysql_cond_wait(&COND_queue_busy, &LOCK_prepare_ordered);
+        commit_entry *queue= commit_ordered_queue;
+        commit_ordered_queue= NULL;
+        /*
+          Mark the queue busy while we bounce it from one thread to the
+          next.
+        */
+        commit_ordered_queue_busy= true;
+        mysql_mutex_unlock(&LOCK_prepare_ordered);
+
+        /* Reverse the queue list so we get correct order. */
+        commit_entry *prev= NULL;
+        while (queue)
+        {
+          commit_entry *next= queue->next;
+          queue->next= prev;
+          prev= queue;
+          queue= next;
+        }
+        DBUG_ASSERT(prev == &entry && prev->thd == thd);
+      }
+      else
+      {
+        /* Not first in queue; just wait until previous thread wakes us up. */
+        thd->wait_for_wakeup_ready();
+      }
+    }
+
+    /* Run commit_ordered() only if log_one_transaction() gave a cookie. */
+    if (cookie)
+    {
+      mysql_mutex_lock(&LOCK_commit_ordered);
+      run_commit_ordered(thd, all);
+      mysql_mutex_unlock(&LOCK_commit_ordered);
+    }
+
+    if (need_prepare_ordered)
+    {
+      commit_entry *next= entry.next;
+      if (next)
+      {
+        next->thd->signal_wakeup_ready();
+      }
+      else
+      {
+        mysql_mutex_lock(&LOCK_prepare_ordered);
+        commit_ordered_queue_busy= false;
+        mysql_cond_signal(&COND_queue_busy);
+        mysql_mutex_unlock(&LOCK_prepare_ordered);
+      }
+    }
+  }
+
+  return cookie;
+}
+
+
 /********* transaction coordinator log for 2pc - mmap() based solution *******/
 
 /*
@@ -6068,6 +6828,7 @@ int TC_LOG_MMAP::open(const char *opt_name)
   mysql_mutex_init(key_LOCK_pool, &LOCK_pool, MY_MUTEX_INIT_FAST);
   mysql_cond_init(key_COND_active, &COND_active, 0);
   mysql_cond_init(key_COND_pool, &COND_pool, 0);
+  mysql_cond_init(key_COND_queue_busy, &COND_queue_busy, 0);
 
   inited=6;
 
@@ -6075,6 +6836,8 @@ int TC_LOG_MMAP::open(const char *opt_name)
   active=pages;
   pool=pages+1;
   pool_last=pages+npages-1;
+  commit_ordered_queue= NULL;
+  commit_ordered_queue_busy= false;
 
   return 0;
 
@@ -6180,7 +6943,7 @@ int TC_LOG_MMAP::overflow()
     to the position in memory where xid was logged to.
 */
 
-int TC_LOG_MMAP::log_xid(THD *thd, my_xid xid)
+int TC_LOG_MMAP::log_one_transaction(my_xid xid)
 {
   int err;
   PAGE *p;
@@ -6350,6 +7113,8 @@ void TC_LOG_MMAP::close()
     mysql_mutex_destroy(&LOCK_active);
     mysql_mutex_destroy(&LOCK_pool);
     mysql_cond_destroy(&COND_pool);
+    mysql_cond_destroy(&COND_active);
+    mysql_cond_destroy(&COND_queue_busy);
   case 5:
     data[0]='A'; // garble the first (signature) byte, in case mysql_file_delete fails
   case 4:
@@ -6529,7 +7294,8 @@ int TC_LOG_BINLOG::open(const char *opt_name)
       goto err;
     }
 
-    if ((ev= Log_event::read_log_event(&log, 0, &fdle)) &&
+    if ((ev= Log_event::read_log_event(&log, 0, &fdle,
+                                       opt_master_verify_checksum)) &&
         ev->get_type_code() == FORMAT_DESCRIPTION_EVENT &&
         ev->flags & LOG_EVENT_BINLOG_IN_USE_F)
     {
@@ -6559,42 +7325,86 @@ void TC_LOG_BINLOG::close()
   mysql_cond_destroy(&COND_prep_xids);
 }
 
-/**
-  @todo
-  group commit
-
-  @retval
-    0    error
-  @retval
-    1    success
+/*
+  Do a binlog log_xid() for a group of transactions, linked through
+  thd->next_commit_ordered.
 */
-int TC_LOG_BINLOG::log_xid(THD *thd, my_xid xid)
+int
+TC_LOG_BINLOG::log_and_order(THD *thd, my_xid xid, bool all,
+                             bool need_prepare_ordered __attribute__((unused)),
+                             bool need_commit_ordered __attribute__((unused)))
 {
-  DBUG_ENTER("TC_LOG_BINLOG::log");
+  int err;
+  DBUG_ENTER("TC_LOG_BINLOG::log_and_order");
+
   binlog_cache_mngr *cache_mngr=
     (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
-  /*
-    We always commit the entire transaction when writing an XID. Also
-    note that the return value is inverted.
-   */
-  DBUG_RETURN(!binlog_commit_flush_stmt_cache(thd, cache_mngr) &&
-              !binlog_commit_flush_trx_cache(thd, cache_mngr, xid));
+
+  cache_mngr->using_xa= TRUE;
+  cache_mngr->xa_xid= xid;
+  err= binlog_commit_flush_xid_caches(thd, cache_mngr, all, xid);
+
+  DEBUG_SYNC(thd, "binlog_after_log_and_order");
+
+  DBUG_RETURN(!err);
 }
 
-int TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
+/*
+  After an XID is logged, we need to hold on to the current binlog file until
+  it is fully committed in the storage engine. The reason is that crash
+  recovery only looks at the latest binlog, so we must make sure there are no
+  outstanding prepared (but not committed) transactions before rotating the
+  binlog.
+
+  To handle this, we keep a count of outstanding XIDs. This function is used
+  to increase this count when committing one or more transactions to the
+  binary log.
+*/
+void
+TC_LOG_BINLOG::mark_xids_active(uint xid_count)
 {
-  DBUG_ENTER("TC_LOG_BINLOG::unlog");
+  DBUG_ENTER("TC_LOG_BINLOG::mark_xids_active");
+  DBUG_PRINT("info", ("xid_count=%u", xid_count));
+  mysql_mutex_lock(&LOCK_prep_xids);
+  prepared_xids+= xid_count;
+  mysql_mutex_unlock(&LOCK_prep_xids);
+  DBUG_VOID_RETURN;
+}
+
+/*
+  Once an XID is committed, it is safe to rotate the binary log, as it can no
+  longer be needed during crash recovery.
+
+  This function is called to mark an XID this way. It needs to decrease the
+  count of pending XIDs, and signal the log rotator thread when it reaches zero.
+*/
+void
+TC_LOG_BINLOG::mark_xid_done()
+{
+  my_bool send_signal;
+
+  DBUG_ENTER("TC_LOG_BINLOG::mark_xid_done");
   mysql_mutex_lock(&LOCK_prep_xids);
   // prepared_xids can be 0 if the transaction had ignorable errors.
   DBUG_ASSERT(prepared_xids >= 0);
   if (prepared_xids > 0)
     prepared_xids--;
-  if (prepared_xids == 0) {
+  send_signal= (prepared_xids == 0);
+  mysql_mutex_unlock(&LOCK_prep_xids);
+  if (send_signal) {
     DBUG_PRINT("info", ("prepared_xids=%lu", prepared_xids));
     mysql_cond_signal(&COND_prep_xids);
   }
-  mysql_mutex_unlock(&LOCK_prep_xids);
-  DBUG_RETURN(rotate_and_purge(0));     // as ::write() did not rotate
+  DBUG_VOID_RETURN;
+}
+
+int TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
+{
+  DBUG_ENTER("TC_LOG_BINLOG::unlog");
+  if (xid)
+    mark_xid_done();
+  /* As ::write_transaction_to_binlog() did not rotate, do it here. */
+  DBUG_RETURN(rotate_and_purge(0));
 }
 
 int TC_LOG_BINLOG::recover(IO_CACHE *log, Format_description_log_event *fdle)
@@ -6612,7 +7422,9 @@ int TC_LOG_BINLOG::recover(IO_CACHE *log, Format_description_log_event *fdle)
 
   fdle->flags&= ~LOG_EVENT_BINLOG_IN_USE_F; // abort on the first error
 
-  while ((ev= Log_event::read_log_event(log,0,fdle)) && ev->is_valid())
+  while ((ev= Log_event::read_log_event(log, 0, fdle,
+                                        opt_master_verify_checksum))
+         && ev->is_valid())
   {
     if (ev->get_type_code() == XID_EVENT)
     {
@@ -6663,9 +7475,170 @@ ulonglong mysql_bin_log_file_pos(void)
 {
   return (ulonglong) mysql_bin_log.get_log_file()->pos_in_file;
 }
+/*
+  Get the current position of the MySQL binlog for transaction currently being
+  committed.
+
+  This is valid to call from within storage engine commit_ordered() and
+  commit() methods only.
+
+  Since it stores the position inside THD, it is safe to call without any
+  locking.
+*/
+void
+mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file)
+{
+  binlog_cache_mngr *cache_mngr;
+  if (opt_bin_log &&
+      (cache_mngr= (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton)))
+  {
+    *out_file= cache_mngr->last_commit_pos_file;
+    *out_pos= (ulonglong)(cache_mngr->last_commit_pos_offset);
+  }
+  else
+  {
+    *out_file= NULL;
+    *out_pos= 0;
+  }
+}
 #endif /* INNODB_COMPATIBILITY_HOOKS */
 
 
+static void
+binlog_checksum_update(MYSQL_THD thd, struct st_mysql_sys_var *var,
+                       void *var_ptr, const void *save)
+{
+  ulong value=  *((ulong *)save);
+
+  mysql_mutex_lock(mysql_bin_log.get_log_lock());
+  if(mysql_bin_log.is_open())
+  {
+    uint flags= RP_FORCE_ROTATE | RP_LOCK_LOG_IS_ALREADY_LOCKED |
+      (binlog_checksum_options != (uint) value?
+       RP_BINLOG_CHECKSUM_ALG_CHANGE : 0);
+    if (flags & RP_BINLOG_CHECKSUM_ALG_CHANGE)
+      mysql_bin_log.checksum_alg_reset= (uint8) value;
+    mysql_bin_log.rotate_and_purge(flags);
+  }
+  else
+  {
+    binlog_checksum_options= value;
+  }
+  DBUG_ASSERT((ulong) binlog_checksum_options == value);
+  DBUG_ASSERT(mysql_bin_log.checksum_alg_reset == BINLOG_CHECKSUM_ALG_UNDEF);
+  mysql_mutex_unlock(mysql_bin_log.get_log_lock());
+}
+
+
+static int show_binlog_vars(THD *thd, SHOW_VAR *var, char *buff)
+{
+  mysql_bin_log.set_status_variables(thd);
+  var->type= SHOW_ARRAY;
+  var->value= (char *)&binlog_status_vars_detail;
+  return 0;
+}
+
+static SHOW_VAR binlog_status_vars_top[]= {
+  {"binlog", (char *) &show_binlog_vars, SHOW_FUNC},
+  {NullS, NullS, SHOW_LONG}
+};
+
+static MYSQL_SYSVAR_BOOL(
+  optimize_thread_scheduling,
+  opt_optimize_thread_scheduling,
+  PLUGIN_VAR_READONLY,
+  "Run fast part of group commit in a single thread, to optimize kernel "
+  "thread scheduling. On by default. Disable to run each transaction in group "
+  "commit in its own thread, which can be slower at very high concurrency. "
+  "This option is mostly for testing one algorithm versus the other, and it "
+  "should not normally be necessary to change it.",
+  NULL,
+  NULL,
+  1);
+
+static MYSQL_SYSVAR_ENUM(
+  checksum,
+  binlog_checksum_options,
+  PLUGIN_VAR_RQCMDARG,
+  "Type of BINLOG_CHECKSUM_ALG. Include checksum for "
+  "log events in the binary log. Possible values are NONE and CRC32; "
+  "default is NONE.",
+  NULL,
+  binlog_checksum_update,
+  BINLOG_CHECKSUM_ALG_OFF,
+  &binlog_checksum_typelib);
+
+#ifndef DBUG_OFF
+static MYSQL_SYSVAR_ULONG(
+  dbug_fsync_sleep,
+  opt_binlog_dbug_fsync_sleep,
+  PLUGIN_VAR_RQCMDARG,
+  "Extra sleep (in microseconds) to add to binlog fsync(), for debugging",
+  NULL,
+  NULL,
+  0,
+  0,
+  ULONG_MAX,
+  0);
+#endif
+
+static struct st_mysql_sys_var *binlog_sys_vars[]=
+{
+  MYSQL_SYSVAR(optimize_thread_scheduling),
+  MYSQL_SYSVAR(checksum),
+#ifndef DBUG_OFF
+  MYSQL_SYSVAR(dbug_fsync_sleep),
+#endif
+  NULL
+};
+
+
+/*
+  Copy out the non-directory part of binlog position filename for the
+  `binlog_snapshot_file' status variable, same way as it is done for
+  SHOW MASTER STATUS.
+*/
+static void
+set_binlog_snapshot_file(const char *src)
+{
+  int dir_len = dirname_length(src);
+  strmake(binlog_snapshot_file, src + dir_len, sizeof(binlog_snapshot_file)-1);
+}
+
+/*
+  Copy out current values of status variables, for SHOW STATUS or
+  information_schema.global_status.
+
+  This is called only under LOCK_status, so we can fill in a static array.
+*/
+void
+TC_LOG_BINLOG::set_status_variables(THD *thd)
+{
+  binlog_cache_mngr *cache_mngr;
+
+  if (thd && opt_bin_log)
+    cache_mngr= (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
+  else
+    cache_mngr= 0;
+
+  bool have_snapshot= (cache_mngr && cache_mngr->last_commit_pos_file[0] != 0);
+  mysql_mutex_lock(&LOCK_commit_ordered);
+  binlog_status_var_num_commits= this->num_commits;
+  binlog_status_var_num_group_commits= this->num_group_commits;
+  if (!have_snapshot)
+  {
+    set_binlog_snapshot_file(last_commit_pos_file);
+    binlog_snapshot_position= last_commit_pos_offset;
+  }
+  mysql_mutex_unlock(&LOCK_commit_ordered);
+
+  if (have_snapshot)
+  {
+    set_binlog_snapshot_file(cache_mngr->last_commit_pos_file);
+    binlog_snapshot_position= cache_mngr->last_commit_pos_offset;
+  }
+}
+
 struct st_mysql_storage_engine binlog_storage_engine=
 { MYSQL_HANDLERTON_INTERFACE_VERSION };
 
@@ -6680,8 +7653,8 @@ mysql_declare_plugin(binlog)
   binlog_init, /* Plugin Init */
   NULL, /* Plugin Deinit */
   0x0100 /* 1.0 */,
-  NULL,                       /* status variables                */
-  NULL,                       /* system variables                */
+  binlog_status_vars_top,     /* status variables                */
+  binlog_sys_vars,            /* system variables                */
   NULL                        /* config options                  */
 }
 mysql_declare_plugin_end;
@@ -6696,8 +7669,8 @@ maria_declare_plugin(binlog)
   binlog_init, /* Plugin Init */
   NULL, /* Plugin Deinit */
   0x0100 /* 1.0 */,
-  NULL,                       /* status variables                */
-  NULL,                       /* system variables                */
+  binlog_status_vars_top,     /* status variables                */
+  binlog_sys_vars,            /* system variables                */
   "1.0",                      /* string version */
   MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
 }
diff --git a/sql/log.h b/sql/log.h
index 6da45c5f44c..c85e643d4e9 100644
--- a/sql/log.h
+++ b/sql/log.h
@@ -44,17 +44,58 @@ class TC_LOG
 
   virtual int open(const char *opt_name)=0;
   virtual void close()=0;
-  virtual int log_xid(THD *thd, my_xid xid)=0;
+  virtual int log_and_order(THD *thd, my_xid xid, bool all,
+                            bool need_prepare_ordered,
+                            bool need_commit_ordered) = 0;
   virtual int unlog(ulong cookie, my_xid xid)=0;
+
+protected:
+  /*
+    These methods are meant to be invoked from log_and_order() implementations
+    to run any prepare_ordered() respectively commit_ordered() methods in
+    participating handlers.
+
+    They must be called using suitable thread synchronisation to ensure that
+    they are each called in the correct commit order among all
+    transactions. However, it is only necessary to call them if the
+    corresponding flag passed to log_and_order is set (it is safe, but not
+    required, to call them when the flag is false).
+
+    The caller must be holding LOCK_prepare_ordered respectively
+    LOCK_commit_ordered when calling these methods.
+  */
+  void run_prepare_ordered(THD *thd, bool all);
+  void run_commit_ordered(THD *thd, bool all);
 };
 
+/*
+  Locks used to ensure serialised execution of TC_LOG::run_prepare_ordered()
+  and TC_LOG::run_commit_ordered(), or any other code that calls handler
+  prepare_ordered() or commit_ordered() methods.
+*/
+extern mysql_mutex_t LOCK_prepare_ordered;
+extern mysql_mutex_t LOCK_commit_ordered;
+#ifdef HAVE_PSI_INTERFACE
+extern PSI_mutex_key key_LOCK_prepare_ordered, key_LOCK_commit_ordered;
+#endif
+
 class TC_LOG_DUMMY: public TC_LOG // use it to disable the logging
 {
 public:
   TC_LOG_DUMMY() {}
   int open(const char *opt_name)        { return 0; }
   void close()                          { }
-  int log_xid(THD *thd, my_xid xid)         { return 1; }
+  /*
+    TC_LOG_DUMMY is only used when there are <= 1 XA-capable engines, and we
+    only use internal XA during commit when >= 2 XA-capable engines
+    participate.
+  */
+  int log_and_order(THD *thd, my_xid xid, bool all,
+                    bool need_prepare_ordered, bool need_commit_ordered)
+  {
+    DBUG_ASSERT(0 /* Internal error - TC_LOG_DUMMY::log_and_order() called */);
+    return 1;
+  }
   int unlog(ulong cookie, my_xid xid)  { return 0; }
 };
 
@@ -80,6 +121,13 @@ class TC_LOG_MMAP: public TC_LOG
     mysql_cond_t  cond; // to wait for a sync
   } PAGE;
 
+  /* List of THDs for which to invoke commit_ordered(), in order. */
+  struct commit_entry
+  {
+    struct commit_entry *next;
+    THD *thd;
+  };
+
   char logname[FN_REFLEN];
   File fd;
   my_off_t file_length;
@@ -94,16 +142,38 @@ class TC_LOG_MMAP: public TC_LOG
   */
   mysql_mutex_t LOCK_active, LOCK_pool, LOCK_sync;
   mysql_cond_t COND_pool, COND_active;
+  /*
+    Queue of threads that need to call commit_ordered().
+    Access to this queue must be protected by LOCK_prepare_ordered.
+  */
+  commit_entry *commit_ordered_queue;
+  /*
+    This flag and condition are used to reserve the queue while threads in it
+    each run the commit_ordered() methods one after the other. Only once the
+    last commit_ordered() in the queue is done can we start on a new queue
+    run.
+
+    Since we start this process in the first thread in the queue and finish in
+    the last (and possibly different) thread, we need a condition variable for
+    this (we cannot unlock a mutex in a different thread than the one who
+    locked it).
+
+    The condition is used together with the LOCK_prepare_ordered mutex.
+  */
+  mysql_cond_t COND_queue_busy;
+  my_bool commit_ordered_queue_busy;
 
   public:
   TC_LOG_MMAP(): inited(0) {}
   int open(const char *opt_name);
   void close();
-  int log_xid(THD *thd, my_xid xid);
+  int log_and_order(THD *thd, my_xid xid, bool all,
+                    bool need_prepare_ordered, bool need_commit_ordered);
   int unlog(ulong cookie, my_xid xid);
   int recover();
 
   private:
+  int log_one_transaction(my_xid xid);
   void get_active_from_pool();
   int sync();
   int overflow();
@@ -248,7 +318,7 @@ public:
              uint user_host_len, int thread_id,
              const char *command_type, uint command_type_len,
              const char *sql_text, uint sql_text_len);
-  bool write(THD *thd, time_t current_time, time_t query_start_arg,
+  bool write(THD *thd, time_t current_time,
              const char *user_host, uint user_host_len,
              ulonglong query_utime, ulonglong lock_utime, bool is_command,
              const char *sql_text, uint sql_text_len);
@@ -277,6 +347,7 @@ private:
   time_t last_time;
 };
 
+class binlog_cache_mngr;
 class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
 {
  private:
@@ -289,7 +360,33 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
   PSI_file_key m_key_file_log;
   /** The instrumentation key to use for opening the log index file. */
   PSI_file_key m_key_file_log_index;
+
+  PSI_cond_key m_key_COND_queue_busy; /* cond (not file) key; presumably for COND_queue_busy */
 #endif
+
+  struct group_commit_entry
+  {
+    struct group_commit_entry *next;
+    THD *thd;
+    binlog_cache_mngr *cache_mngr;
+    bool using_stmt_cache;
+    bool using_trx_cache;
+    /*
+      Extra events (BEGIN, COMMIT/ROLLBACK/XID, and possibly INCIDENT) to be
+      written during group commit. The incident_event is only valid if
+      trx_data->has_incident() is true.
+    */
+    Log_event *begin_event;
+    Log_event *end_event;
+    Log_event *incident_event;
+    /* Set during group commit to record any per-thread error. */
+    int error;
+    int commit_errno;
+    IO_CACHE *error_cache;
+    /* This is the `all' parameter for ha_commit_ordered(). */
+    bool all;
+  };
+
   /* LOCK_log and LOCK_index are inited by init_pthread_objects() */
   mysql_mutex_t LOCK_index;
   mysql_mutex_t LOCK_prep_xids;
@@ -331,6 +428,20 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
     In 5.0 it's 0 for relay logs too!
   */
   bool no_auto_events;
+  /* Queue of transactions queued up to participate in group commit. */
+  group_commit_entry *group_commit_queue;
+  /*
+    Condition variable to mark that the group commit queue is busy.
+    Used when each thread does its own commit_ordered() (when
+    binlog_optimize_thread_scheduling=0).
+    Used with the LOCK_commit_ordered mutex.
+  */
+  my_bool group_commit_queue_busy;
+  mysql_cond_t COND_queue_busy;
+  /* Total number of committed transactions. */
+  ulonglong num_commits;
+  /* Number of group commits done. */
+  ulonglong num_group_commits;
 
   /* pointer to the sync period variable, for binlog this will be
      sync_binlog_period, for relay log this will be
@@ -352,6 +463,11 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
   */
   int new_file_without_locking();
   int new_file_impl(bool need_lock);
+  int write_transaction_or_stmt(group_commit_entry *entry);
+  bool write_transaction_to_binlog_events(group_commit_entry *entry);
+  void trx_group_commit_leader(group_commit_entry *leader);
+  void mark_xid_done();
+  void mark_xids_active(uint xid_count);
 
 public:
   MYSQL_LOG::generate_name;
@@ -360,6 +476,41 @@ public:
   /* This is relay log */
   bool is_relay_log;
   ulong signal_cnt;  // update of the counter is checked by heartbeat
+  uint8 checksum_alg_reset; // to contain a new value when binlog is rotated
+  /*
+    Holds the checksum algorithm value of the last FD seen in the relay log.
+    The initial value comes from the slave's local FD that heads
+    the very first Relay-Log file. Afterwards the value may change
+    with each FD_m received from the master.
+    Besides being used to verify events that the IO thread receives
+    (except the 1st fake Rotate, see @c Master_info::checksum_alg_before_fd),
+    the value specifies if/how to compute the checksum for the slave's local
+    events and for the first fake Rotate (R_f^1) coming from the master.
+    R_f^1 must be logged checksum-compatibly with the RL's heading FD_s.
+
+    Legends for the checksum related comments:
+
+    FD     - Format-Description event,
+    R      - Rotate event
+    R_f    - the fake Rotate event
+    E      - an arbitrary event
+
+    The underscore indexes for any event
+    `_s'   indicates the event is generated by Slave
+    `_m'   - by Master
+
+    Two special underscore indexes of FD:
+    FD_q   - Format Description event for queuing   (relay-logging)
+    FD_e   - Format Description event for executing (relay-logging)
+
+    Upper indexes:
+    E^n    - n:th event in a sequence
+
+    RL     - Relay Log
+    (A)    - checksum algorithm descriptor value
+    FD.(A) - the value of (A) in FD
+  */
+  uint8 relay_log_checksum_alg;
   /*
     These describe the log's format. This is used only for relay logs.
     _for_exec is used by the SQL thread, _for_queue by the I/O thread. It's
@@ -370,6 +521,12 @@ public:
   */
   Format_description_log_event *description_event_for_exec,
     *description_event_for_queue;
+  /*
+    Binlog position of last commit (or non-transactional write) to the binlog.
+    Access to this is protected by LOCK_commit_ordered.
+  */
+  char last_commit_pos_file[FN_REFLEN];
+  my_off_t last_commit_pos_offset;
 
   MYSQL_BIN_LOG(uint *sync_period);
   /*
@@ -393,7 +550,8 @@ public:
 
   int open(const char *opt_name);
   void close();
-  int log_xid(THD *thd, my_xid xid);
+  int log_and_order(THD *thd, my_xid xid, bool all,
+                    bool need_prepare_ordered, bool need_commit_ordered);
   int unlog(ulong cookie, my_xid xid);
   int recover(IO_CACHE *log, Format_description_log_event *fdle);
 #if !defined(MYSQL_CLIENT)
@@ -439,11 +597,15 @@ public:
   /* Use this to start writing a new log file */
   int new_file();
 
-  bool write(Log_event* event_info); // binary log write
-  bool write(THD *thd, IO_CACHE *cache, Log_event *commit_event, bool incident);
-  bool write_incident(THD *thd, bool lock);
-  int  write_cache(THD *thd, IO_CACHE *cache, bool lock_log,
-                   bool flush_and_sync);
+  bool write(Log_event* event_info,
+             my_bool *with_annotate= 0); // binary log write
+  bool write_transaction_to_binlog(THD *thd, binlog_cache_mngr *cache_mngr,
+                                   Log_event *end_ev, bool all,
+                                   bool using_stmt_cache, bool using_trx_cache);
+
+  bool write_incident_already_locked(THD *thd);
+  bool write_incident(THD *thd);
+  int  write_cache(THD *thd, IO_CACHE *cache);
   void set_write_error(THD *thd, bool is_transactional);
   bool check_write_error(THD *thd);
 
@@ -512,6 +674,7 @@ public:
   inline void unlock_index() { mysql_mutex_unlock(&LOCK_index);}
   inline IO_CACHE *get_index_file() { return &index_file;}
   inline uint32 get_open_count() { return open_count; }
+  void set_status_variables(THD *thd);
 };
 
 class Log_event_handler
@@ -521,14 +684,14 @@ public:
   virtual bool init()= 0;
   virtual void cleanup()= 0;
 
-  virtual bool log_slow(THD *thd, time_t current_time,
-                        time_t query_start_arg, const char *user_host,
+  virtual bool log_slow(THD *thd, my_hrtime_t current_time,
+                        const char *user_host,
                         uint user_host_len, ulonglong query_utime,
                         ulonglong lock_utime, bool is_command,
                         const char *sql_text, uint sql_text_len)= 0;
   virtual bool log_error(enum loglevel level, const char *format,
                          va_list args)= 0;
-  virtual bool log_general(THD *thd, time_t event_time, const char *user_host,
+  virtual bool log_general(THD *thd, my_hrtime_t event_time, const char *user_host,
                            uint user_host_len, int thread_id,
                            const char *command_type, uint command_type_len,
                            const char *sql_text, uint sql_text_len,
@@ -550,14 +713,14 @@ public:
   virtual bool init();
   virtual void cleanup();
 
-  virtual bool log_slow(THD *thd, time_t current_time,
-                        time_t query_start_arg, const char *user_host,
+  virtual bool log_slow(THD *thd, my_hrtime_t current_time,
+                        const char *user_host,
                         uint user_host_len, ulonglong query_utime,
                         ulonglong lock_utime, bool is_command,
                         const char *sql_text, uint sql_text_len);
   virtual bool log_error(enum loglevel level, const char *format,
                          va_list args);
-  virtual bool log_general(THD *thd, time_t event_time, const char *user_host,
+  virtual bool log_general(THD *thd, my_hrtime_t event_time, const char *user_host,
                            uint user_host_len, int thread_id,
                            const char *command_type, uint command_type_len,
                            const char *sql_text, uint sql_text_len,
@@ -582,14 +745,14 @@ public:
   virtual bool init();
   virtual void cleanup();
 
-  virtual bool log_slow(THD *thd, time_t current_time,
-                        time_t query_start_arg, const char *user_host,
+  virtual bool log_slow(THD *thd, my_hrtime_t current_time,
+                        const char *user_host,
                         uint user_host_len, ulonglong query_utime,
                         ulonglong lock_utime, bool is_command,
                         const char *sql_text, uint sql_text_len);
   virtual bool log_error(enum loglevel level, const char *format,
                          va_list args);
-  virtual bool log_general(THD *thd, time_t event_time, const char *user_host,
+  virtual bool log_general(THD *thd, my_hrtime_t event_time, const char *user_host,
                            uint user_host_len, int thread_id,
                            const char *command_type, uint command_type_len,
                            const char *sql_text, uint sql_text_len,
@@ -714,7 +877,7 @@ bool flush_error_log();
 File open_binlog(IO_CACHE *log, const char *log_file_name,
                  const char **errmsg);
 
-char *make_log_name(char *buff, const char *name, const char* log_ext);
+void make_default_log_name(char **out, const char* log_ext, bool once);
 
 extern MYSQL_PLUGIN_IMPORT MYSQL_BIN_LOG mysql_bin_log;
 extern LOGGER logger;
diff --git a/sql/log_event.cc b/sql/log_event.cc
index 70087ed4da3..49383778b58 100644
--- a/sql/log_event.cc
+++ b/sql/log_event.cc
@@ -51,6 +51,31 @@
 #include <my_bitmap.h>
 #include "rpl_utility.h"
 
+
+/**
+  BINLOG_CHECKSUM variable.
+*/
+const char *binlog_checksum_type_names[]= {
+  "NONE",
+  "CRC32",
+  NullS
+};
+
+unsigned int binlog_checksum_type_length[]= {
+  sizeof("NONE") - 1,
+  sizeof("CRC32") - 1,
+  0
+};
+
+TYPELIB binlog_checksum_typelib=
+{
+  array_elements(binlog_checksum_type_names) - 1, "",
+  binlog_checksum_type_names,
+  binlog_checksum_type_length
+};
+
+
+
 #define log_cs	&my_charset_latin1
 
 #define FLAGSTR(V,F) ((V)&(F)?#F" ":"")
@@ -64,6 +89,24 @@
 */
 #define FMT_G_BUFSIZE(PREC) (3 + (PREC) + 5 + 1)
 
+/* 
+   replication event checksum is introduced in the following "checksum-home" version.
+   The checksum-aware servers extract FD's version to decide whether the FD event
+   carries checksum info.
+
+   TODO: correct the constant when it has been determined 
+   (which main tree to push and when) 
+*/
+const uchar checksum_version_split_mysql[3]= {5, 6, 1};
+const ulong checksum_version_product_mysql=
+  (checksum_version_split_mysql[0] * 256 +
+   checksum_version_split_mysql[1]) * 256 +
+  checksum_version_split_mysql[2];
+const uchar checksum_version_split_mariadb[3]= {5, 3, 0};
+const ulong checksum_version_product_mariadb=
+  (checksum_version_split_mariadb[0] * 256 +
+   checksum_version_split_mariadb[1]) * 256 +
+  checksum_version_split_mariadb[2];
 
 #if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION)
 static int rows_event_stmt_cleanup(Relay_log_info const *rli, THD* thd);
@@ -587,7 +630,7 @@ append_query_string(CHARSET_INFO *csinfo,
   if (to->reserve(orig_len + from->length()*2+3))
     return 1;
 
-  beg= to->c_ptr_quick() + to->length();
+  beg= (char*) to->ptr() + to->length();
   ptr= beg;
   if (csinfo->escape_with_backslash_is_dangerous)
     ptr= str_to_hex(ptr, from->ptr(), from->length());
@@ -624,7 +667,6 @@ static void print_set_option(IO_CACHE* file, uint32 bits_changed,
   }
 }
 #endif
-
 /**************************************************************************
 	Log_event methods (= the parent class of all events)
 **************************************************************************/
@@ -663,6 +705,7 @@ const char* Log_event::get_type_str(Log_event_type type)
   case BEGIN_LOAD_QUERY_EVENT: return "Begin_load_query";
   case EXECUTE_LOAD_QUERY_EVENT: return "Execute_load_query";
   case INCIDENT_EVENT: return "Incident";
+  case ANNOTATE_ROWS_EVENT: return "Annotate_rows";
   default: return "Unknown";				/* impossible */
   }
 }
@@ -680,10 +723,12 @@ const char* Log_event::get_type_str()
 #ifndef MYSQL_CLIENT
 Log_event::Log_event(THD* thd_arg, uint16 flags_arg, bool using_trans)
   :log_pos(0), temp_buf(0), exec_time(0), flags(flags_arg),
-  cache_type(Log_event::EVENT_INVALID_CACHE), thd(thd_arg)
+   crc(0), thd(thd_arg),
+   checksum_alg(BINLOG_CHECKSUM_ALG_UNDEF)
 {
   server_id=	thd->server_id;
-  when=		thd->start_time;
+  when=         thd->start_time;
+  when_sec_part=thd->start_time_sec_part;
 
   if (using_trans)
     cache_type= Log_event::EVENT_TRANSACTIONAL_CACHE;
@@ -700,14 +745,16 @@ Log_event::Log_event(THD* thd_arg, uint16 flags_arg, bool using_trans)
 
 Log_event::Log_event()
   :temp_buf(0), exec_time(0), flags(0),
-  cache_type(Log_event::EVENT_INVALID_CACHE), thd(0)
+   cache_type(Log_event::EVENT_INVALID_CACHE), crc(0),
+   thd(0), checksum_alg(BINLOG_CHECKSUM_ALG_UNDEF)
 {
   server_id=	::server_id;
   /*
     We can't call my_time() here as this would cause a call before
     my_init() is called
   */
-  when=		0;
+  when=         0;
+  when_sec_part=0;
   log_pos=	0;
 }
 #endif /* !MYSQL_CLIENT */
@@ -719,12 +766,14 @@ Log_event::Log_event()
 
 Log_event::Log_event(const char* buf,
                      const Format_description_log_event* description_event)
-  :temp_buf(0), cache_type(Log_event::EVENT_INVALID_CACHE)
+  :temp_buf(0), cache_type(Log_event::EVENT_INVALID_CACHE),
+    crc(0), checksum_alg(BINLOG_CHECKSUM_ALG_UNDEF)
 {
 #ifndef MYSQL_CLIENT
   thd = 0;
 #endif
   when = uint4korr(buf);
+  when_sec_part= 0;
   server_id = uint4korr(buf + SERVER_ID_OFFSET);
   data_written= uint4korr(buf + EVENT_LEN_OFFSET);
   if (description_event->binlog_version==1)
@@ -746,7 +795,7 @@ Log_event::Log_event(const char* buf,
     logs are in 4.0 format, until it finds a Format_desc).
   */
   if (description_event->binlog_version==3 &&
-      buf[EVENT_TYPE_OFFSET]<FORMAT_DESCRIPTION_EVENT && log_pos)
+      (uchar)buf[EVENT_TYPE_OFFSET]<FORMAT_DESCRIPTION_EVENT && log_pos)
   {
       /*
         If log_pos=0, don't change it. log_pos==0 is a marker to mean
@@ -764,8 +813,8 @@ Log_event::Log_event(const char* buf,
   DBUG_PRINT("info", ("log_pos: %lu", (ulong) log_pos));
 
   flags= uint2korr(buf + FLAGS_OFFSET);
-  if ((buf[EVENT_TYPE_OFFSET] == FORMAT_DESCRIPTION_EVENT) ||
-      (buf[EVENT_TYPE_OFFSET] == ROTATE_EVENT))
+  if (((uchar)buf[EVENT_TYPE_OFFSET] == FORMAT_DESCRIPTION_EVENT) ||
+      ((uchar)buf[EVENT_TYPE_OFFSET] == ROTATE_EVENT))
   {
     /*
       These events always have a header which stops here (i.e. their
@@ -813,21 +862,13 @@ int Log_event::do_update_pos(Relay_log_info *rli)
     DBUG_EXECUTE_IF("let_first_flush_log_change_timestamp",
                     if (debug_not_change_ts_if_art_event == 1
                         && is_artificial_event())
-                    {
-                      debug_not_change_ts_if_art_event= 0;
-                    });
-#ifndef DBUG_OFF
-    rli->stmt_done(log_pos, 
-                   is_artificial_event() &&
-                   debug_not_change_ts_if_art_event > 0 ? 0 : when);
-#else
-    rli->stmt_done(log_pos, is_artificial_event()? 0 : when);
-#endif
+                      debug_not_change_ts_if_art_event= 0; );
+    rli->stmt_done(log_pos, is_artificial_event() &&
+                   IF_DBUG(debug_not_change_ts_if_art_event > 0, 1) ?
+                     0 : when);
     DBUG_EXECUTE_IF("let_first_flush_log_change_timestamp",
                     if (debug_not_change_ts_if_art_event == 0)
-                    {
-                      debug_not_change_ts_if_art_event= 2;
-                    });
+                      debug_not_change_ts_if_art_event= 2; );
   }
   return 0;                                   // Cannot fail currently
 }
@@ -904,6 +945,105 @@ void Log_event::init_show_field_list(List<Item>* field_list)
   field_list->push_back(new Item_empty_string("Info", 20));
 }
 
+/**
+   A decider of whether to trigger checksum computation or not.
+   To be invoked in Log_event::write() stack.
+   The decision is positive 
+
+    S,M) if it's been marked for checksumming with @c checksum_alg
+    
+    M) otherwise, if @@global.binlog_checksum is not NONE and the event is 
+       directly written to the binlog file.
+       The to-be-cached event decides at @c write_cache() time.
+
+   Otherwise the decision is negative.
+
+   @note   A side effect of the method is altering Log_event::checksum_alg
+           if the latter was undefined at the time of the call.
+
+   @return true (positive) or false (negative)
+*/
+my_bool Log_event::need_checksum()
+{
+  DBUG_ENTER("Log_event::need_checksum");
+  my_bool ret;
+  /* 
+     A few callers of Log_event::write
+     (incl FD::write, FD constructing code on the slave side, Rotate relay log
+     and Stop event)
+     provide their checksum alg preference through Log_event::checksum_alg.
+  */
+  ret= ((checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF) ?
+        (checksum_alg != BINLOG_CHECKSUM_ALG_OFF) :
+        ((binlog_checksum_options != BINLOG_CHECKSUM_ALG_OFF) &&
+         (cache_type == Log_event::EVENT_NO_CACHE)) ?
+        test(binlog_checksum_options) : FALSE);
+
+  /*
+    FD calls this method before data_written has been calculated.
+    The following invariant claims that if the current call is not the
+    first one (and therefore data_written is not zero) then `ret' must be
+    TRUE. It may not be FALSE because FD is always checksummed.
+  */
+  
+  DBUG_ASSERT(get_type_code() != FORMAT_DESCRIPTION_EVENT || ret ||
+              data_written == 0);
+
+  if (checksum_alg == BINLOG_CHECKSUM_ALG_UNDEF)
+    checksum_alg= ret ? // calculated value stored
+      (uint8) binlog_checksum_options : (uint8) BINLOG_CHECKSUM_ALG_OFF;
+
+  DBUG_ASSERT(!ret || 
+              ((checksum_alg == binlog_checksum_options ||
+               /* 
+                  Stop event closes the relay-log, and its checksum alg
+                  preference, which is set by the caller, can be different
+                  from the server's binlog_checksum_options.
+               */
+               get_type_code() == STOP_EVENT ||
+               /* 
+                  Rotate:s can be checksummed regardless of the server's
+                  binlog_checksum_options. That applies to both
+                  the local RL's Rotate and the master's Rotate
+                  which IO thread instantiates via queue_binlog_ver_3_event.
+               */
+               get_type_code() == ROTATE_EVENT
+               ||  /* FD is always checksummed */
+               get_type_code() == FORMAT_DESCRIPTION_EVENT) && 
+               checksum_alg != BINLOG_CHECKSUM_ALG_OFF));
+
+  DBUG_ASSERT(checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF);
+
+  DBUG_ASSERT(((get_type_code() != ROTATE_EVENT &&
+                get_type_code() != STOP_EVENT) ||
+               get_type_code() != FORMAT_DESCRIPTION_EVENT) ||
+              cache_type == Log_event::EVENT_NO_CACHE);
+
+  DBUG_RETURN(ret);
+}
+
+bool Log_event::wrapper_my_b_safe_write(IO_CACHE* file, const uchar* buf, ulong size)
+{
+  if (need_checksum() && size != 0)
+    crc= my_checksum(crc, buf, size);
+
+  return my_b_safe_write(file, buf, size);
+}
+
+bool Log_event::write_footer(IO_CACHE* file) 
+{
+  /*
+     The footer is the checksum value (BINLOG_CHECKSUM_LEN bytes); no
+     checksum-algorithm descriptor is written by this function.
+  */
+  if (need_checksum())
+  {
+    uchar buf[BINLOG_CHECKSUM_LEN];
+    int4store(buf, crc);
+    return (my_b_safe_write(file, (uchar*) buf, sizeof(buf)));
+  }
+  return 0;
+}
 
 /*
   Log_event::write()
@@ -913,11 +1053,18 @@ bool Log_event::write_header(IO_CACHE* file, ulong event_data_length)
 {
   uchar header[LOG_EVENT_HEADER_LEN];
   ulong now;
+  bool ret;
   DBUG_ENTER("Log_event::write_header");
 
   /* Store number of bytes that will be written by this event */
   data_written= event_data_length + sizeof(header);
 
+  if (need_checksum())
+  {
+    crc= my_checksum(0L, NULL, 0);
+    data_written += BINLOG_CHECKSUM_LEN;
+  }
+
   /*
     log_pos != 0 if this is relay-log event. In this case we should not
     change the position
@@ -962,7 +1109,7 @@ bool Log_event::write_header(IO_CACHE* file, ulong event_data_length)
     log_pos= my_b_safe_tell(file)+data_written;
   }
 
-  now= (ulong) get_time();                              // Query start time
+  now= get_time();                               // Query start time
 
   /*
     Header will be of size LOG_EVENT_HEADER_LEN for all events, except for
@@ -976,9 +1123,36 @@ bool Log_event::write_header(IO_CACHE* file, ulong event_data_length)
   int4store(header+ SERVER_ID_OFFSET, server_id);
   int4store(header+ EVENT_LEN_OFFSET, data_written);
   int4store(header+ LOG_POS_OFFSET, log_pos);
-  int2store(header+ FLAGS_OFFSET, flags);
-
-  DBUG_RETURN(my_b_safe_write(file, header, sizeof(header)) != 0);
+  /*
+    The checksum of an FD event is computed with the possibly active
+    LOG_EVENT_BINLOG_IN_USE_F flag dropped, although the flag itself is
+    still written out. Verification takes a similar step: the active
+    flag is dropped before the checksum is computed.
+  */
+  if (header[EVENT_TYPE_OFFSET] != FORMAT_DESCRIPTION_EVENT ||
+      !need_checksum() || !(flags & LOG_EVENT_BINLOG_IN_USE_F))
+  {
+    int2store(header+ FLAGS_OFFSET, flags);
+    ret= wrapper_my_b_safe_write(file, header, sizeof(header)) != 0;
+  }
+  else
+  {
+    ret= (wrapper_my_b_safe_write(file, header, FLAGS_OFFSET) != 0);
+    if (!ret)
+    {
+      flags &= ~LOG_EVENT_BINLOG_IN_USE_F;
+      int2store(header + FLAGS_OFFSET, flags);
+      crc= my_checksum(crc, header + FLAGS_OFFSET, sizeof(flags));
+      flags |= LOG_EVENT_BINLOG_IN_USE_F;    
+      int2store(header + FLAGS_OFFSET, flags);
+      ret= (my_b_safe_write(file, header + FLAGS_OFFSET, sizeof(flags)) != 0);
+    }
+    if (!ret)
+      ret= (wrapper_my_b_safe_write(file, header + FLAGS_OFFSET + sizeof(flags),
+                                    sizeof(header)
+                                    - (FLAGS_OFFSET + sizeof(flags))) != 0);
+  }
+  DBUG_RETURN( ret);
 }
 
 
@@ -988,11 +1162,13 @@ bool Log_event::write_header(IO_CACHE* file, ulong event_data_length)
 */
 
 int Log_event::read_log_event(IO_CACHE* file, String* packet,
-                              mysql_mutex_t* log_lock)
+                              mysql_mutex_t* log_lock,
+                              uint8 checksum_alg_arg)
 {
   ulong data_len;
   int result=0;
   char buf[LOG_EVENT_MINIMAL_HEADER_LEN];
+  uchar ev_offset= packet->length();
   DBUG_ENTER("Log_event::read_log_event");
 
   if (log_lock)
@@ -1050,6 +1226,31 @@ int Log_event::read_log_event(IO_CACHE* file, String* packet,
                (file->error >= 0 ? LOG_READ_TRUNC: LOG_READ_IO));
       /* Implicit goto end; */
     }
+    else
+    {
+      /* Corrupt the event for Dump thread*/
+      DBUG_EXECUTE_IF("corrupt_read_log_event2",
+	uchar *debug_event_buf_c = (uchar*) packet->ptr() + ev_offset;
+        if (debug_event_buf_c[EVENT_TYPE_OFFSET] != FORMAT_DESCRIPTION_EVENT)
+        {
+          int debug_cor_pos = rand() % (data_len + sizeof(buf) - BINLOG_CHECKSUM_LEN);
+          debug_event_buf_c[debug_cor_pos] =~ debug_event_buf_c[debug_cor_pos];
+          DBUG_PRINT("info", ("Corrupt the event at Log_event::read_log_event: byte on position %d", debug_cor_pos));
+          DBUG_SET("-d,corrupt_read_log_event2");
+	}
+      );                                                                                           
+      /*
+        CRC verification of the Dump thread
+      */
+      if (opt_master_verify_checksum &&
+          event_checksum_test((uchar*) packet->ptr() + ev_offset,
+                              data_len + sizeof(buf),
+                              checksum_alg_arg))
+      {
+        result= LOG_READ_CHECKSUM_FAILURE;
+        goto end;
+      }
+    }
   }
 
 end:
@@ -1075,11 +1276,13 @@ end:
 Log_event* Log_event::read_log_event(IO_CACHE* file,
                                      mysql_mutex_t* log_lock,
                                      const Format_description_log_event
-                                     *description_event)
+                                     *description_event,
+                                     my_bool crc_check)
 #else
 Log_event* Log_event::read_log_event(IO_CACHE* file,
                                      const Format_description_log_event
-                                     *description_event)
+                                     *description_event,
+                                     my_bool crc_check)
 #endif
 {
   DBUG_ENTER("Log_event::read_log_event");
@@ -1143,7 +1346,7 @@ failed my_b_read"));
     error = "read error";
     goto err;
   }
-  if ((res= read_log_event(buf, data_len, &error, description_event)))
+  if ((res= read_log_event(buf, data_len, &error, description_event, crc_check)))
     res->register_temp_buf(buf, TRUE);
 
 err:
@@ -1176,9 +1379,11 @@ err:
 
 Log_event* Log_event::read_log_event(const char* buf, uint event_len,
 				     const char **error,
-                                     const Format_description_log_event *description_event)
+                                     const Format_description_log_event *description_event,
+                                     my_bool crc_check)
 {
   Log_event* ev;
+  uint8 alg;
   DBUG_ENTER("Log_event::read_log_event(char*,...)");
   DBUG_ASSERT(description_event != 0);
   DBUG_PRINT("info", ("binlog_version: %d", description_event->binlog_version));
@@ -1186,14 +1391,68 @@ Log_event* Log_event::read_log_event(const char* buf, uint event_len,
 
   /* Check the integrity */
   if (event_len < EVENT_LEN_OFFSET ||
-      buf[EVENT_TYPE_OFFSET] >= ENUM_END_EVENT ||
+      (uchar)buf[EVENT_TYPE_OFFSET] >= ENUM_END_EVENT ||
       (uint) event_len != uint4korr(buf+EVENT_LEN_OFFSET))
   {
     *error="Sanity check failed";		// Needed to free buffer
     DBUG_RETURN(NULL); // general sanity check - will fail on a partial read
   }
 
-  uint event_type= buf[EVENT_TYPE_OFFSET];
+  uint event_type= (uchar)buf[EVENT_TYPE_OFFSET];
+  // all following START events in the current file are without checksum
+  if (event_type == START_EVENT_V3)
+    (const_cast< Format_description_log_event *>(description_event))->checksum_alg= BINLOG_CHECKSUM_ALG_OFF;
+  /*
+    CRC verification by SQL and Show-Binlog-Events master side.
+    The caller has to provide @description_event->checksum_alg to
+    be the last seen FD's (A) descriptor.
+    If event is FD the descriptor is in it.
+    Notice, FD of the binlog can be only in one instance and therefore
+    Show-Binlog-Events executing master side thread needs just to know
+    the only FD's (A) value -  whereas RL can contain more.
+    In the RL case, the alg is kept in FD_e (@description_event) which is reset 
+    to the newer read-out event after its execution with possibly new alg descriptor.
+    Therefore in a typical sequence of RL:
+    {FD_s^0, FD_m, E_m^1} E_m^1 
+    will be verified with (A) of FD_m.
+
+    See legends definition on MYSQL_BIN_LOG::relay_log_checksum_alg docs
+    lines (log.h).
+
+    Notice, a pre-checksum FD version forces alg := BINLOG_CHECKSUM_ALG_UNDEF.
+  */
+  alg= (event_type != FORMAT_DESCRIPTION_EVENT) ?
+    description_event->checksum_alg : get_checksum_alg(buf, event_len);
+  // Emulate the corruption during reading an event
+  DBUG_EXECUTE_IF("corrupt_read_log_event_char",
+    if (event_type != FORMAT_DESCRIPTION_EVENT)
+    {
+      char *debug_event_buf_c = (char *)buf;
+      int debug_cor_pos = rand() % (event_len - BINLOG_CHECKSUM_LEN);
+      debug_event_buf_c[debug_cor_pos] =~ debug_event_buf_c[debug_cor_pos];
+      DBUG_PRINT("info", ("Corrupt the event at Log_event::read_log_event(char*,...): byte on position %d", debug_cor_pos));
+      DBUG_SET("-d,corrupt_read_log_event_char");
+    }
+  );                                                 
+  if (crc_check &&
+      event_checksum_test((uchar *) buf, event_len, alg))
+  {
+#ifdef MYSQL_CLIENT
+    *error= "Event crc check failed! Most likely there is event corruption.";
+    if (force_opt)
+    {
+      ev= new Unknown_log_event(buf, description_event);
+      DBUG_RETURN(ev);
+    }
+    else
+      DBUG_RETURN(NULL);
+#else
+    *error= ER(ER_BINLOG_READ_EVENT_CHECKSUM_FAILURE);
+    sql_print_error("%s", ER(ER_BINLOG_READ_EVENT_CHECKSUM_FAILURE));
+    DBUG_RETURN(NULL);
+#endif
+  }
+
   if (event_type > description_event->number_of_event_types &&
       event_type != FORMAT_DESCRIPTION_EVENT)
   {
@@ -1228,6 +1487,11 @@ Log_event* Log_event::read_log_event(const char* buf, uint event_len,
       event_type= new_event_type;
     }
 
+    if (alg != BINLOG_CHECKSUM_ALG_UNDEF &&
+        (event_type == FORMAT_DESCRIPTION_EVENT ||
+         alg != BINLOG_CHECKSUM_ALG_OFF))
+      event_len= event_len - BINLOG_CHECKSUM_LEN;
+    
     switch(event_type) {
     case QUERY_EVENT:
       ev  = new Query_log_event(buf, event_len, description_event, QUERY_EVENT);
@@ -1311,6 +1575,9 @@ Log_event* Log_event::read_log_event(const char* buf, uint event_len,
     case INCIDENT_EVENT:
       ev = new Incident_log_event(buf, event_len, description_event);
       break;
+    case ANNOTATE_ROWS_EVENT:
+      ev = new Annotate_rows_log_event(buf, event_len, description_event);
+      break;
     default:
       DBUG_PRINT("error",("Unknown event code: %d",
                           (int) buf[EVENT_TYPE_OFFSET]));
@@ -1319,6 +1586,14 @@ Log_event* Log_event::read_log_event(const char* buf, uint event_len,
     }
   }
 
+  if (ev)
+  {
+    ev->checksum_alg= alg;
+    if (ev->checksum_alg != BINLOG_CHECKSUM_ALG_OFF &&
+        ev->checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF)
+      ev->crc= uint4korr(buf + (event_len));
+  }
+
   DBUG_PRINT("read_event", ("%s(type_code: %d; event_len: %d)",
                             ev ? ev->get_type_str() : "<unknown>",
                             buf[EVENT_TYPE_OFFSET],
@@ -1373,6 +1648,18 @@ void Log_event::print_header(IO_CACHE* file,
   my_b_printf(file, " server id %lu  end_log_pos %s ", (ulong) server_id,
               llstr(log_pos,llbuff));
 
+  /* print the checksum */
+
+  if (checksum_alg != BINLOG_CHECKSUM_ALG_OFF &&
+      checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF)
+  {
+    char checksum_buf[BINLOG_CHECKSUM_LEN * 2 + 4]; // to fit to "0x%lx "
+    size_t const bytes_written=
+      my_snprintf(checksum_buf, sizeof(checksum_buf), "0x%08lx ", (ulong) crc);
+    my_b_printf(file, "%s ", get_type(&binlog_checksum_typelib, checksum_alg));
+    my_b_printf(file, checksum_buf, bytes_written);
+  }
+
   /* mysqlbinlog --hexdump */
   if (print_event_info->hexdump_from)
   {
@@ -1718,6 +2005,7 @@ log_event_print_value(IO_CACHE *file, const uchar *ptr,
       uint64 i64= uint8korr(ptr); /* YYYYMMDDhhmmss */
       d= (ulong) (i64 / 1000000);
       t= (ulong) (i64 % 1000000);
+
       my_b_printf(file, "%04d-%02d-%02d %02d:%02d:%02d",
                   (int) (d / 10000), (int) (d % 10000) / 100, (int) (d % 100),
                   (int) (t / 10000), (int) (t % 10000) / 100, (int) t % 100);
@@ -2000,12 +2288,10 @@ end:
   delete td;
 }
 
-#ifdef MYSQL_CLIENT
 void free_table_map_log_event(Table_map_log_event *event)
 {
   delete event;
 }
-#endif
 
 void Log_event::print_base64(IO_CACHE* file,
                              PRINT_EVENT_INFO* print_event_info,
@@ -2042,6 +2328,9 @@ void Log_event::print_base64(IO_CACHE* file,
   if (print_event_info->verbose)
   {
     Rows_log_event *ev= NULL;
+    if (checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF &&
+        checksum_alg != BINLOG_CHECKSUM_ALG_OFF)
+      size-= BINLOG_CHECKSUM_LEN; // checksum is displayed through the header
     
     if (ptr[4] == TABLE_MAP_EVENT)
     {
@@ -2085,15 +2374,11 @@ void Log_event::print_base64(IO_CACHE* file,
 void Log_event::print_timestamp(IO_CACHE* file, time_t* ts)
 {
   struct tm *res;
+  time_t my_when= when;
   DBUG_ENTER("Log_event::print_timestamp");
   if (!ts)
-    ts = &when;
-#ifdef MYSQL_SERVER				// This is always false
-  struct tm tm_tmp;
-  localtime_r(ts,(res= &tm_tmp));
-#else
+    ts = &my_when;
   res=localtime(ts);
-#endif
 
   my_b_printf(file,"%02d%02d%02d %2d:%02d:%02d",
               res->tm_year % 100,
@@ -2378,6 +2663,14 @@ bool Query_log_event::write(IO_CACHE* file)
       start+= host.length;
     }
   }
+
+  if (thd && thd->query_start_sec_part_used)
+  {
+    *start++= Q_HRNOW;
+    get_time();
+    int3store(start, when_sec_part);
+    start+= 3;
+  }
   /*
     NOTE: When adding new status vars, please don't forget to update
     the MAX_SIZE_LOG_EVENT_STATUS in log_event.h and update the function
@@ -2404,12 +2697,13 @@ bool Query_log_event::write(IO_CACHE* file)
   event_length= (uint) (start-buf) + get_post_header_size_for_derived() + db_len + 1 + q_len;
 
   return (write_header(file, event_length) ||
-          my_b_safe_write(file, (uchar*) buf, QUERY_HEADER_LEN) ||
+          wrapper_my_b_safe_write(file, (uchar*) buf, QUERY_HEADER_LEN) ||
           write_post_header_for_derived(file) ||
-          my_b_safe_write(file, (uchar*) start_of_status,
+          wrapper_my_b_safe_write(file, (uchar*) start_of_status,
                           (uint) (start-start_of_status)) ||
-          my_b_safe_write(file, (db) ? (uchar*) db : (uchar*)"", db_len + 1) ||
-          my_b_safe_write(file, (uchar*) query, q_len)) ? 1 : 0;
+          wrapper_my_b_safe_write(file, (db) ? (uchar*) db : (uchar*)"", db_len + 1) ||
+          wrapper_my_b_safe_write(file, (uchar*) query, q_len) ||
+	  write_footer(file)) ? 1 : 0;
 }
 
 /**
@@ -2469,7 +2763,7 @@ Query_log_event::Query_log_event(THD* thd_arg, const char* query_arg,
 
   error_code= errcode;
 
-  time(&end_time);
+  end_time= my_time(0);
   exec_time = (ulong) (end_time  - thd_arg->start_time);
   /**
     @todo this means that if we have no catalog, then it is replicated
@@ -2661,6 +2955,7 @@ code_name(int code)
   case Q_CHARSET_DATABASE_CODE: return "Q_CHARSET_DATABASE_CODE";
   case Q_TABLE_MAP_FOR_UPDATE_CODE: return "Q_TABLE_MAP_FOR_UPDATE_CODE";
   case Q_MASTER_DATA_WRITTEN_CODE: return "Q_MASTER_DATA_WRITTEN_CODE";
+  case Q_HRNOW: return "Q_HRNOW";
   }
   sprintf(buf, "CODE#%d", code);
   return buf;
@@ -2877,6 +3172,14 @@ Query_log_event::Query_log_event(const char* buf, uint event_len,
       CHECK_SPACE(pos, end, host.length);
       host.str= (char *)pos;
       pos+= host.length;
+      break;
+    }
+    case Q_HRNOW:
+    {
+      CHECK_SPACE(pos, end, 3);
+      when_sec_part= uint3korr(pos);
+      pos+= 3;
+      break;
     }
     default:
       /* That's why you must write status vars in growing order of code */
@@ -2956,7 +3259,7 @@ void Query_log_event::print_query_header(IO_CACHE* file,
 					 PRINT_EVENT_INFO* print_event_info)
 {
   // TODO: print the catalog ??
-  char buff[40],*end;				// Enough for SET TIMESTAMP
+  char buff[64], *end;				// Enough for SET TIMESTAMP
   bool different_db= 1;
   uint32 tmp;
 
@@ -2983,6 +3286,11 @@ void Query_log_event::print_query_header(IO_CACHE* file,
   }
 
   end=int10_to_str((long) when, strmov(buff,"SET TIMESTAMP="),10);
+  if (when_sec_part)
+  {
+    *end++= '.';
+    end=int10_to_str(when_sec_part, end, 10);
+  }
   end= strmov(end, print_event_info->delimiter);
   *end++='\n';
   my_b_write(file, (uchar*) buff, (uint) (end-buff));
@@ -3244,7 +3552,7 @@ int Query_log_event::do_apply_event(Relay_log_info const *rli,
   */
   if (is_trans_keyword() || rpl_filter->db_ok(thd->db))
   {
-    thd->set_time((time_t)when);
+    thd->set_time(when, when_sec_part);
     thd->set_query_and_id((char*)query_arg, q_len_arg,
                           thd->charset(), next_query_id());
     thd->variables.pseudo_thread_id= thread_id;		// for temp tables
@@ -3417,6 +3725,19 @@ START SLAVE; . Query: '%s'", expected_error, thd->query());
     /* If the query was not ignored, it is printed to the general log */
     if (!thd->is_error() || thd->stmt_da->sql_errno() != ER_SLAVE_IGNORED_TABLE)
       general_log_write(thd, COM_QUERY, thd->query(), thd->query_length());
+    else
+    {
+      /*
+        Bug#54201: If we skip an INSERT query that uses auto_increment, then we
+        should reset any @@INSERT_ID set by an Intvar_log_event associated with
+        the query; otherwise the @@INSERT_ID will linger until the next INSERT
+        that uses auto_increment and may affect extra triggers on the slave etc.
+
+        We reset INSERT_ID unconditionally; it is probably cheaper than
+        checking if it is necessary.
+      */
+      thd->auto_inc_intervals_forced.empty();
+    }
 
 compare_errors:
     /*
@@ -3706,10 +4027,11 @@ bool Start_log_event_v3::write(IO_CACHE* file)
   int2store(buff + ST_BINLOG_VER_OFFSET,binlog_version);
   memcpy(buff + ST_SERVER_VER_OFFSET,server_version,ST_SERVER_VER_LEN);
   if (!dont_set_created)
-    created= when= get_time();
+    created= get_time(); // this sets when and when_sec_part as a side effect
   int4store(buff + ST_CREATED_OFFSET,created);
   return (write_header(file, sizeof(buff)) ||
-          my_b_safe_write(file, (uchar*) buff, sizeof(buff)));
+          wrapper_my_b_safe_write(file, (uchar*) buff, sizeof(buff)) ||
+	  write_footer(file));
 }
 #endif
 
@@ -3812,6 +4134,7 @@ int Start_log_event_v3::do_apply_event(Relay_log_info const *rli)
                                 old 4.0 (binlog version 2) is not supported;
                                 it should not be used for replication with
                                 5.0.
+  @param server_ver             a string containing the server version.
 */
 
 Format_description_log_event::
@@ -3827,9 +4150,9 @@ Format_description_log_event(uint8 binlog_ver, const char* server_ver)
     common_header_len= LOG_EVENT_HEADER_LEN;
     number_of_event_types= LOG_EVENT_TYPES;
     /* we'll catch my_malloc() error in is_valid() */
-    post_header_len=(uint8*) my_malloc(number_of_event_types*sizeof(uint8),
+    post_header_len=(uint8*) my_malloc(number_of_event_types*sizeof(uint8)
+                                       + BINLOG_CHECKSUM_ALG_DESC_LEN,
                                        MYF(0));
-
     /*
       This long list of assignments is not beautiful, but I see no way to
       make it nicer, as the right members are #defines, not array members, so
@@ -3894,6 +4217,13 @@ Format_description_log_event(uint8 binlog_ver, const char* server_ver)
       post_header_len[INCIDENT_EVENT-1]= INCIDENT_HEADER_LEN;
       post_header_len[HEARTBEAT_LOG_EVENT-1]= 0;
 
+      // Set header length of the reserved events to 0
+      memset(post_header_len + MYSQL_EVENTS_END - 1, 0,
+             (MARIA_EVENTS_BEGIN - MYSQL_EVENTS_END)*sizeof(uint8));
+
+      // Set header lengths of Maria events
+      post_header_len[ANNOTATE_ROWS_EVENT-1]= ANNOTATE_ROWS_HEADER_LEN;
+
       // Sanity-check that all post header lengths are initialized.
       int i;
       for (i=0; i<number_of_event_types; i++)
@@ -3946,6 +4276,7 @@ Format_description_log_event(uint8 binlog_ver, const char* server_ver)
     break;
   }
   calc_server_version_split();
+  checksum_alg= (uint8) BINLOG_CHECKSUM_ALG_UNDEF;
 }
 
 
@@ -3980,14 +4311,26 @@ Format_description_log_event(const char* buf,
   if ((common_header_len=buf[ST_COMMON_HEADER_LEN_OFFSET]) < OLD_HEADER_LEN)
     DBUG_VOID_RETURN; /* sanity check */
   number_of_event_types=
-    event_len-(LOG_EVENT_MINIMAL_HEADER_LEN+ST_COMMON_HEADER_LEN_OFFSET+1);
+    event_len - (LOG_EVENT_MINIMAL_HEADER_LEN + ST_COMMON_HEADER_LEN_OFFSET + 1);
   DBUG_PRINT("info", ("common_header_len=%d number_of_event_types=%d",
                       common_header_len, number_of_event_types));
   /* If alloc fails, we'll detect it in is_valid() */
+
   post_header_len= (uint8*) my_memdup((uchar*)buf+ST_COMMON_HEADER_LEN_OFFSET+1,
                                       number_of_event_types*
-                                      sizeof(*post_header_len), MYF(0));
+                                      sizeof(*post_header_len),
+                                      MYF(0));
   calc_server_version_split();
+  if (!is_version_before_checksum(&server_version_split))
+  {
+    /* the last bytes are the checksum alg desc and value (or value's room) */
+    number_of_event_types -= BINLOG_CHECKSUM_ALG_DESC_LEN;
+    checksum_alg= post_header_len[number_of_event_types];
+  }
+  else
+  {
+    checksum_alg= (uint8) BINLOG_CHECKSUM_ALG_UNDEF;
+  }
 
   /*
     In some previous versions, the events were given other event type
@@ -4098,21 +4441,59 @@ Format_description_log_event(const char* buf,
 #ifndef MYSQL_CLIENT
 bool Format_description_log_event::write(IO_CACHE* file)
 {
+  bool ret;
+  bool no_checksum;
   /*
     We don't call Start_log_event_v3::write() because this would make 2
     my_b_safe_write().
   */
-  uchar buff[FORMAT_DESCRIPTION_HEADER_LEN];
+  uchar buff[FORMAT_DESCRIPTION_HEADER_LEN + BINLOG_CHECKSUM_ALG_DESC_LEN];
+  size_t rec_size= sizeof(buff);
   int2store(buff + ST_BINLOG_VER_OFFSET,binlog_version);
   memcpy((char*) buff + ST_SERVER_VER_OFFSET,server_version,ST_SERVER_VER_LEN);
   if (!dont_set_created)
-    created= when= get_time();
+    created= get_time(); // this sets when and when_sec_part as a side effect
   int4store(buff + ST_CREATED_OFFSET,created);
   buff[ST_COMMON_HEADER_LEN_OFFSET]= LOG_EVENT_HEADER_LEN;
-  memcpy((char*) buff+ST_COMMON_HEADER_LEN_OFFSET+1, (uchar*) post_header_len,
+  memcpy((char*) buff+ST_COMMON_HEADER_LEN_OFFSET + 1, (uchar*) post_header_len,
          LOG_EVENT_TYPES);
-  return (write_header(file, sizeof(buff)) ||
-          my_b_safe_write(file, buff, sizeof(buff)));
+  /*
+    If a checksum is requested, record the checksum-algorithm descriptor
+    right after the post_header_len vector; the checksum value follows it.
+    Master is supposed to trigger checksum computing by
+    binlog_checksum_options, slave does it via marking the event
+    according to FD_queue checksum_alg value.
+    The single byte reserved in buff[] for it is what the assert below pins.
+  */
+  compile_time_assert(BINLOG_CHECKSUM_ALG_DESC_LEN == 1);
+#ifndef DBUG_OFF
+  data_written= 0; // to prepare for need_checksum assert
+#endif
+  buff[FORMAT_DESCRIPTION_HEADER_LEN]= need_checksum() ?
+    checksum_alg : (uint8) BINLOG_CHECKSUM_ALG_OFF;
+  /*
+     (A) is the checksum-algorithm descriptor, (V) the checksum value.
+     FD of checksum-aware server is always checksum-equipped, (V) is in,
+     regardless of @@global.binlog_checksum policy.
+     Thereby a combination of (A) == 0, (V) != 0 means it's the
+     checksum-aware server's FD event that heads checksum-free binlog file.
+     Here 0 stands for checksumming OFF to evaluate (V) as 0 in that case.
+     A combination of (A) != 0, (V) != 0 denotes FD of the checksum-aware server
+     heading the checksummed binlog.
+     (A), (V) presence in FD of the checksum-aware server makes the event
+     1 + 4 bytes bigger compared to the former FD.
+  */
+
+  if ((no_checksum= (checksum_alg == BINLOG_CHECKSUM_ALG_OFF)))
+  {
+    checksum_alg= BINLOG_CHECKSUM_ALG_CRC32;  // Forcing (V) room to fill anyway
+  }
+  ret= (write_header(file, rec_size) ||
+        wrapper_my_b_safe_write(file, buff, rec_size) ||
+        write_footer(file));
+  if (no_checksum)
+    checksum_alg= BINLOG_CHECKSUM_ALG_OFF;    // undo the temporary forcing above
+  return ret;
 }
 #endif
 
@@ -4207,6 +4588,30 @@ Format_description_log_event::do_shall_skip(Relay_log_info *rli)
 
 #endif
 
+static inline void
+do_server_version_split(char* version,
+                        Format_description_log_event::master_version_split *split_versions)
+{
+  char *p= version, *r;                 // p: parse cursor, r: strtoul() end pointer
+  ulong number;
+  for (uint i= 0; i<=2; i++)            // fill ver[0], ver[1], ver[2] in turn
+  {
+    number= strtoul(p, &r, 10);
+    split_versions->ver[i]= (uchar) number;
+    DBUG_ASSERT(number < 256); // fit in uchar
+    p= r;
+    DBUG_ASSERT(!((i == 0) && (*r != '.'))); // should be true in practice
+    if (*r == '.')
+      p++; // skip the dot
+  }
+  if (strstr(p, "MariaDB") != 0 || strstr(p, "-maria-") != 0) // searched from p onwards only
+    split_versions->kind=
+      Format_description_log_event::master_version_split::KIND_MARIADB;
+  else
+    split_versions->kind=
+      Format_description_log_event::master_version_split::KIND_MYSQL;
+}
+
 
 /**
    Splits the event's 'server_version' string into three numeric pieces stored
@@ -4219,24 +4624,67 @@ Format_description_log_event::do_shall_skip(Relay_log_info *rli)
 */
 void Format_description_log_event::calc_server_version_split()
 {
-  char *p= server_version, *r;
-  ulong number;
-  for (uint i= 0; i<=2; i++)
-  {
-    number= strtoul(p, &r, 10);
-    server_version_split[i]= (uchar)number;
-    DBUG_ASSERT(number < 256); // fit in uchar
-    p= r;
-    DBUG_ASSERT(!((i == 0) && (*r != '.'))); // should be true in practice
-    if (*r == '.')
-      p++; // skip the dot
-  }
+  do_server_version_split(server_version, &server_version_split);
+  /* server_version_split now holds ver[0..2] and kind; trace the numeric part */
   DBUG_PRINT("info",("Format_description_log_event::server_version_split:"
                      " '%s' %d %d %d", server_version,
-                     server_version_split[0],
-                     server_version_split[1], server_version_split[2]));
+                     server_version_split.ver[0],
+                     server_version_split.ver[1], server_version_split.ver[2]));
+}
+
+static inline ulong
+version_product(const Format_description_log_event::master_version_split* version_split)
+{
+  return ((version_split->ver[0] * 256 + version_split->ver[1]) * 256  // base-256 packing,
+          + version_split->ver[2]);                                    // ver[0] most significant
+}
+
+/**
+   @return TRUE if the given version is earlier than the one that introduced
+   the replication event checksum for its server kind, FALSE otherwise.
+*/
+bool
+Format_description_log_event::is_version_before_checksum(const master_version_split
+                                                         *version_split)
+{
+  return version_product(version_split) <
+    (version_split->kind == master_version_split::KIND_MARIADB ?   // threshold depends on kind
+     checksum_version_product_mariadb : checksum_version_product_mysql);
 }
 
+/**
+   @param buf buffer holding serialized FD event
+   @param len length of the event in buf; the descriptor byte is read at
+              buf + len - BINLOG_CHECKSUM_LEN - BINLOG_CHECKSUM_ALG_DESC_LEN
+   @return  the version-safe checksum alg descriptor where zero
+            designates no checksum, 255 - the originator is
+            checksum-unaware (effectively no checksum) and the actual
+            [1-254] range alg descriptor.
+*/
+uint8 get_checksum_alg(const char* buf, ulong len)
+{
+  uint8 ret;
+  char version[ST_SERVER_VER_LEN];
+  Format_description_log_event::master_version_split version_split;
+
+  DBUG_ENTER("get_checksum_alg");
+  DBUG_ASSERT(buf[EVENT_TYPE_OFFSET] == FORMAT_DESCRIPTION_EVENT);
+  /* Copy the server version; its offset uses a length byte read from the event. */
+  memcpy(version, buf +
+         buf[LOG_EVENT_MINIMAL_HEADER_LEN + ST_COMMON_HEADER_LEN_OFFSET]
+         + ST_SERVER_VER_OFFSET, ST_SERVER_VER_LEN);
+  version[ST_SERVER_VER_LEN - 1]= 0;
+  /* A pre-checksum version yields UNDEF; otherwise the descriptor byte is used. */
+  do_server_version_split(version, &version_split);
+  ret= Format_description_log_event::is_version_before_checksum(&version_split) ?
+    (uint8) BINLOG_CHECKSUM_ALG_UNDEF :
+    * (uint8*) (buf + len - BINLOG_CHECKSUM_LEN - BINLOG_CHECKSUM_ALG_DESC_LEN);
+  DBUG_ASSERT(ret == BINLOG_CHECKSUM_ALG_OFF ||
+              ret == BINLOG_CHECKSUM_ALG_UNDEF ||
+              ret == BINLOG_CHECKSUM_ALG_CRC32);
+  DBUG_RETURN(ret);
+}
+  
 
   /**************************************************************************
         Load_log_event methods
@@ -4543,8 +4991,8 @@ Load_log_event::Load_log_event(const char *buf, uint event_len,
   */
   if (event_len)
     copy_log_event(buf, event_len,
-                   ((buf[EVENT_TYPE_OFFSET] == LOAD_EVENT) ?
-                    LOAD_HEADER_LEN + 
+                   (((uchar)buf[EVENT_TYPE_OFFSET] == LOAD_EVENT) ?
+                   LOAD_HEADER_LEN + 
                     description_event->common_header_len :
                     LOAD_HEADER_LEN + LOG_EVENT_HEADER_LEN),
                    description_event);
@@ -4581,7 +5029,7 @@ int Load_log_event::copy_log_event(const char *buf, ulong event_len,
   */
   if (!(field_lens= (uchar*)sql_ex.init((char*)buf + body_offset,
                                         buf_end,
-                                        buf[EVENT_TYPE_OFFSET] != LOAD_EVENT)))
+                                        (uchar)buf[EVENT_TYPE_OFFSET] != LOAD_EVENT)))
     DBUG_RETURN(1);
   
   data_len = event_len - body_offset;
@@ -4819,7 +5267,7 @@ int Load_log_event::do_apply_event(NET* net, Relay_log_info const *rli,
   */
   if (rpl_filter->db_ok(thd->db))
   {
-    thd->set_time((time_t)when);
+    thd->set_time(when, when_sec_part);
     thd->set_query_id(next_query_id());
     thd->warning_info->opt_clear_warning_info(thd->query_id);
 
@@ -5093,6 +5541,7 @@ Rotate_log_event::Rotate_log_event(const char* new_log_ident_arg,
   DBUG_PRINT("enter",("new_log_ident: %s  pos: %s  flags: %lu", new_log_ident_arg,
                       llstr(pos_arg, buff), (ulong) flags));
 #endif
+  cache_type= EVENT_NO_CACHE;
   if (flags & DUP_NAME)
     new_log_ident= my_strndup(new_log_ident_arg, ident_len, MYF(MY_WME));
   if (flags & RELAY_LOG)
@@ -5134,9 +5583,11 @@ bool Rotate_log_event::write(IO_CACHE* file)
 {
   char buf[ROTATE_HEADER_LEN];
   int8store(buf + R_POS_OFFSET, pos);
-  return (write_header(file, ROTATE_HEADER_LEN + ident_len) ||
-          my_b_safe_write(file, (uchar*)buf, ROTATE_HEADER_LEN) ||
-          my_b_safe_write(file, (uchar*)new_log_ident, (uint) ident_len));
+  return (write_header(file, ROTATE_HEADER_LEN + ident_len) || 
+          wrapper_my_b_safe_write(file, (uchar*) buf, ROTATE_HEADER_LEN) ||
+          wrapper_my_b_safe_write(file, (uchar*) new_log_ident,
+                                     (uint) ident_len) ||
+          write_footer(file));
 }
 #endif
 
@@ -5305,7 +5756,8 @@ bool Intvar_log_event::write(IO_CACHE* file)
   buf[I_TYPE_OFFSET]= (uchar) type;
   int8store(buf + I_VAL_OFFSET, val);
   return (write_header(file, sizeof(buf)) ||
-          my_b_safe_write(file, buf, sizeof(buf)));
+          wrapper_my_b_safe_write(file, buf, sizeof(buf)) ||
+	  write_footer(file));
 }
 #endif
 
@@ -5433,7 +5885,8 @@ bool Rand_log_event::write(IO_CACHE* file)
   int8store(buf + RAND_SEED1_OFFSET, seed1);
   int8store(buf + RAND_SEED2_OFFSET, seed2);
   return (write_header(file, sizeof(buf)) ||
-          my_b_safe_write(file, buf, sizeof(buf)));
+          wrapper_my_b_safe_write(file, buf, sizeof(buf)) ||
+	  write_footer(file));
 }
 #endif
 
@@ -5535,8 +5988,9 @@ Xid_log_event(const char* buf,
 bool Xid_log_event::write(IO_CACHE* file)
 {
   DBUG_EXECUTE_IF("do_not_write_xid", return 0;);
-  return write_header(file, sizeof(xid)) ||
-         my_b_safe_write(file, (uchar*) &xid, sizeof(xid));
+  return (write_header(file, sizeof(xid)) ||
+	  wrapper_my_b_safe_write(file, (uchar*) &xid, sizeof(xid)) || // xid bytes also feed the crc
+	  write_footer(file));                                         // crc footer, if checksummed
 }
 #endif
 
@@ -5715,8 +6169,21 @@ User_var_log_event(const char* buf,
       we keep the flags set to UNDEF_F.
     */
     uint bytes_read= ((val + val_len) - start);
-    DBUG_ASSERT(bytes_read==data_written || 
-                bytes_read==(data_written-1));
+#ifndef DBUG_OFF
+    bool old_pre_checksum_fd= description_event->is_version_before_checksum(
+        &description_event->server_version_split);
+#endif
+    DBUG_ASSERT((bytes_read == data_written -
+                 (old_pre_checksum_fd ||
+                  (description_event->checksum_alg ==
+                   BINLOG_CHECKSUM_ALG_OFF)) ?
+                 0 : BINLOG_CHECKSUM_LEN)
+                ||
+                (bytes_read == data_written -1 -
+                 (old_pre_checksum_fd ||
+                  (description_event->checksum_alg ==
+                   BINLOG_CHECKSUM_ALG_OFF)) ?
+                 0 : BINLOG_CHECKSUM_LEN));
     if ((data_written - bytes_read) > 0)
     {
       flags= (uint) *(buf + UV_VAL_IS_NULL + UV_VAL_TYPE_SIZE +
@@ -5784,11 +6251,12 @@ bool User_var_log_event::write(IO_CACHE* file)
   event_length= sizeof(buf)+ name_len + buf1_length + val_len + unsigned_len;
 
   return (write_header(file, event_length) ||
-          my_b_safe_write(file, (uchar*) buf, sizeof(buf))   ||
-          my_b_safe_write(file, (uchar*) name, name_len)     ||
-          my_b_safe_write(file, (uchar*) buf1, buf1_length) ||
-          my_b_safe_write(file, pos, val_len) ||
-          my_b_safe_write(file, &flags, unsigned_len));
+          wrapper_my_b_safe_write(file, (uchar*) buf, sizeof(buf))   ||
+	  wrapper_my_b_safe_write(file, (uchar*) name, name_len)     ||
+	  wrapper_my_b_safe_write(file, (uchar*) buf1, buf1_length) ||
+	  wrapper_my_b_safe_write(file, pos, val_len) ||
+          wrapper_my_b_safe_write(file, &flags, unsigned_len) ||
+          write_footer(file));
 }
 #endif
 
@@ -6315,7 +6783,7 @@ Create_file_log_event::Create_file_log_event(const char* buf, uint len,
   uint8 create_file_header_len= description_event->post_header_len[CREATE_FILE_EVENT-1];
   if (!(event_buf= (char*) my_memdup(buf, len, MYF(MY_WME))) ||
       copy_log_event(event_buf,len,
-                     ((buf[EVENT_TYPE_OFFSET] == LOAD_EVENT) ?
+                     (((uchar)buf[EVENT_TYPE_OFFSET] == LOAD_EVENT) ?
                       load_header_len + header_len :
                       (fake_base ? (header_len+load_header_len) :
                        (header_len+load_header_len) +
@@ -6554,8 +7022,9 @@ bool Append_block_log_event::write(IO_CACHE* file)
   uchar buf[APPEND_BLOCK_HEADER_LEN];
   int4store(buf + AB_FILE_ID_OFFSET, file_id);
   return (write_header(file, APPEND_BLOCK_HEADER_LEN + block_len) ||
-          my_b_safe_write(file, buf, APPEND_BLOCK_HEADER_LEN) ||
-	  my_b_safe_write(file, (uchar*) block, block_len));
+          wrapper_my_b_safe_write(file, buf, APPEND_BLOCK_HEADER_LEN) ||
+	  wrapper_my_b_safe_write(file, (uchar*) block, block_len) ||
+	  write_footer(file));
 }
 #endif
 
@@ -6713,7 +7182,8 @@ bool Delete_file_log_event::write(IO_CACHE* file)
  uchar buf[DELETE_FILE_HEADER_LEN];
  int4store(buf + DF_FILE_ID_OFFSET, file_id);
  return (write_header(file, sizeof(buf)) ||
-         my_b_safe_write(file, buf, sizeof(buf)));
+         wrapper_my_b_safe_write(file, buf, sizeof(buf)) ||
+	 write_footer(file));
 }
 #endif
 
@@ -6810,7 +7280,8 @@ bool Execute_load_log_event::write(IO_CACHE* file)
   uchar buf[EXEC_LOAD_HEADER_LEN];
   int4store(buf + EL_FILE_ID_OFFSET, file_id);
   return (write_header(file, sizeof(buf)) || 
-          my_b_safe_write(file, buf, sizeof(buf)));
+          wrapper_my_b_safe_write(file, buf, sizeof(buf)) ||
+	  write_footer(file));
 }
 #endif
 
@@ -6872,16 +7343,17 @@ int Execute_load_log_event::do_apply_event(Relay_log_info const *rli)
                 fname);
     goto err;
   }
-  if (!(lev = (Load_log_event*)Log_event::read_log_event(&file,
-                                                         (mysql_mutex_t*)0,
-                                                         rli->relay_log.description_event_for_exec)) ||
+  if (!(lev= (Load_log_event*)
+        Log_event::read_log_event(&file,
+                                  (mysql_mutex_t*)0,
+                                  rli->relay_log.description_event_for_exec,
+                                  opt_slave_sql_verify_checksum)) ||
       lev->get_type_code() != NEW_LOAD_EVENT)
   {
     rli->report(ERROR_LEVEL, 0, "Error in Exec_load event: "
                     "file '%s' appears corrupted", fname);
     goto err;
   }
-
   lev->thd = thd;
   /*
     lev->do_apply_event should use rli only for errors i.e. should
@@ -7044,7 +7516,7 @@ Execute_load_query_log_event::write_post_header_for_derived(IO_CACHE* file)
   int4store(buf + 4, fn_pos_start);
   int4store(buf + 4 + 4, fn_pos_end);
   *(buf + 4 + 4 + 4)= (uchar) dup_handling;
-  return my_b_safe_write(file, buf, EXECUTE_LOAD_QUERY_EXTRA_HEADER_LEN);
+  return wrapper_my_b_safe_write(file, buf, EXECUTE_LOAD_QUERY_EXTRA_HEADER_LEN);
 }
 #endif
 
@@ -7280,7 +7752,8 @@ Rows_log_event::Rows_log_event(THD *thd_arg, TABLE *tbl_arg, ulong tid,
     m_width(tbl_arg ? tbl_arg->s->fields : 1),
     m_rows_buf(0), m_rows_cur(0), m_rows_end(0), m_flags(0) 
 #ifdef HAVE_REPLICATION
-    , m_curr_row(NULL), m_curr_row_end(NULL), m_key(NULL)
+    , m_curr_row(NULL), m_curr_row_end(NULL),
+    m_key(NULL), m_key_info(NULL), m_key_nr(0)
 #endif
 {
   /*
@@ -7328,7 +7801,8 @@ Rows_log_event::Rows_log_event(const char *buf, uint event_len,
 #endif
     m_table_id(0), m_rows_buf(0), m_rows_cur(0), m_rows_end(0)
 #if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION)
-    , m_curr_row(NULL), m_curr_row_end(NULL), m_key(NULL)
+    , m_curr_row(NULL), m_curr_row_end(NULL),
+    m_key(NULL), m_key_info(NULL), m_key_nr(0)
 #endif
 {
   DBUG_ENTER("Rows_log_event::Rows_log_event(const char*,...)");
@@ -7678,7 +8152,7 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli)
       const_cast<Relay_log_info*>(rli)->m_table_map.set_table(ptr->table_id, ptr->table);
     }
 #ifdef HAVE_QUERY_CACHE
-    query_cache.invalidate_locked_for_write(rli->tables_to_lock);
+    query_cache.invalidate_locked_for_write(thd, rli->tables_to_lock);
 #endif
   }
 
@@ -7705,7 +8179,7 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli)
       TIMESTAMP column to a table with one.
       So we call set_time(), like in SBR. Presently it changes nothing.
     */
-    thd->set_time((time_t)when);
+    thd->set_time(when, when_sec_part);
 
     /*
       Now we are in a statement and will stay in a statement until we
@@ -8007,11 +8481,11 @@ bool Rows_log_event::write_data_header(IO_CACHE *file)
                   {
                     int4store(buf + 0, m_table_id);
                     int2store(buf + 4, m_flags);
-                    return (my_b_safe_write(file, buf, 6));
+                    return (wrapper_my_b_safe_write(file, buf, 6));
                   });
   int6store(buf + RW_MAPID_OFFSET, (ulonglong)m_table_id);
   int2store(buf + RW_FLAGS_OFFSET, m_flags);
-  return (my_b_safe_write(file, buf, ROWS_HEADER_LEN));
+  return (wrapper_my_b_safe_write(file, buf, ROWS_HEADER_LEN));
 }
 
 bool Rows_log_event::write_data_body(IO_CACHE*file)
@@ -8027,10 +8501,10 @@ bool Rows_log_event::write_data_body(IO_CACHE*file)
   DBUG_ASSERT(static_cast<size_t>(sbuf_end - sbuf) <= sizeof(sbuf));
 
   DBUG_DUMP("m_width", sbuf, (size_t) (sbuf_end - sbuf));
-  res= res || my_b_safe_write(file, sbuf, (size_t) (sbuf_end - sbuf));
+  res= res || wrapper_my_b_safe_write(file, sbuf, (size_t) (sbuf_end - sbuf));
 
   DBUG_DUMP("m_cols", (uchar*) m_cols.bitmap, no_bytes_in_map(&m_cols));
-  res= res || my_b_safe_write(file, (uchar*) m_cols.bitmap,
+  res= res || wrapper_my_b_safe_write(file, (uchar*) m_cols.bitmap,
                               no_bytes_in_map(&m_cols));
   /*
     TODO[refactor write]: Remove the "down cast" here (and elsewhere).
@@ -8039,11 +8513,11 @@ bool Rows_log_event::write_data_body(IO_CACHE*file)
   {
     DBUG_DUMP("m_cols_ai", (uchar*) m_cols_ai.bitmap,
               no_bytes_in_map(&m_cols_ai));
-    res= res || my_b_safe_write(file, (uchar*) m_cols_ai.bitmap,
+    res= res || wrapper_my_b_safe_write(file, (uchar*) m_cols_ai.bitmap,
                                 no_bytes_in_map(&m_cols_ai));
   }
   DBUG_DUMP("rows", m_rows_buf, data_size);
-  res= res || my_b_safe_write(file, m_rows_buf, (size_t) data_size);
+  res= res || wrapper_my_b_safe_write(file, m_rows_buf, (size_t) data_size);
 
   return res;
 
@@ -8088,6 +8562,144 @@ void Rows_log_event::print_helper(FILE *file,
 #endif
 
 /**************************************************************************
+	Annotate_rows_log_event member functions
+**************************************************************************/
+
+#ifndef MYSQL_CLIENT
+Annotate_rows_log_event::Annotate_rows_log_event(THD *thd,
+                                                 uint16 cache_type_arg)
+  : Log_event(thd, 0, true),
+    m_query_txt(thd->query()),          // THD's current query text, not copied
+    m_query_len(thd->query_length()),
+    m_save_thd_query_txt(0),            // nothing saved until do_apply_event()
+    m_save_thd_query_len(0)
+{
+  cache_type= cache_type_arg;
+}
+#endif
+
+Annotate_rows_log_event::Annotate_rows_log_event(const char *buf,
+                                                 uint event_len,
+                                      const Format_description_log_event *desc)
+  : Log_event(buf, desc),
+    // the query is everything after the common header; it points into buf
+    m_query_txt((char*) buf + desc->common_header_len),
+    m_query_len(event_len - desc->common_header_len),
+    m_save_thd_query_txt(0), m_save_thd_query_len(0)
+{
+}
+
+Annotate_rows_log_event::~Annotate_rows_log_event()
+{
+#ifndef MYSQL_CLIENT
+  if (m_save_thd_query_txt)  // non-zero only once do_apply_event() saved a query
+    thd->set_query(m_save_thd_query_txt, m_save_thd_query_len);  // put it back
+#endif
+}
+
+int Annotate_rows_log_event::get_data_size()
+{
+  return m_query_len;  // the only data this event writes is the query text
+}
+
+Log_event_type Annotate_rows_log_event::get_type_code()
+{
+  return ANNOTATE_ROWS_EVENT;  // first MariaDB-specific event type (160)
+}
+
+bool Annotate_rows_log_event::is_valid() const
+{
+  return (m_query_txt != NULL && m_query_len != 0);  // needs a non-empty query
+}
+
+#ifndef MYSQL_CLIENT
+bool Annotate_rows_log_event::write_data_header(IO_CACHE *file)
+{ 
+  return 0;  // nothing is written here: ANNOTATE_ROWS_HEADER_LEN is 0
+}
+#endif
+
+#ifndef MYSQL_CLIENT
+bool Annotate_rows_log_event::write_data_body(IO_CACHE *file)
+{
+  return wrapper_my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);  // exactly m_query_len bytes of query text
+}
+#endif
+
+#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION)
+void Annotate_rows_log_event::pack_info(Protocol* protocol)
+{
+  if (m_query_txt && m_query_len)  // nothing is stored for an empty query
+    protocol->store(m_query_txt, m_query_len, &my_charset_bin);  // query text, passed with my_charset_bin
+}
+#endif
+
+#ifdef MYSQL_CLIENT
+void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
+{
+  /*
+    Print the annotated query to pinfo->head_cache, one "#Q> " prefixed
+    line per non-empty line of the query text; does nothing in short form.
+    The query is not zero-terminated, so every read is bounded by txt_end.
+  */
+  if (pinfo->short_form)
+    return;
+
+  print_header(&pinfo->head_cache, pinfo, TRUE);
+  my_b_printf(&pinfo->head_cache, "\tAnnotate_rows:\n");
+
+  const char *pos= m_query_txt;
+  const char *const txt_end= m_query_txt + m_query_len;
+
+  while (pos < txt_end)
+  {
+    // skip the \r's and \n's that separate lines (empty lines are dropped)
+    if (*pos == '\r' || *pos == '\n')
+    {
+      pos++;
+      continue;
+    }
+
+    // find the end of this line without running past the query text
+    const char *line_beg= pos;
+    while (pos < txt_end && *pos != '\r' && *pos != '\n')
+      pos++;
+
+    // print the line
+    my_b_write(&pinfo->head_cache, (const uchar*) "#Q> ", 4);
+    my_b_write(&pinfo->head_cache, (const uchar*) line_beg, pos - line_beg);
+    my_b_write(&pinfo->head_cache, (const uchar*) "\n", 1);
+  }
+}
+#endif
+
+#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION)
+int Annotate_rows_log_event::do_apply_event(Relay_log_info const *rli)
+{
+  m_save_thd_query_txt= thd->query();          // kept so the destructor can restore it
+  m_save_thd_query_len= thd->query_length();
+  thd->set_query(m_query_txt, m_query_len);    // THD now refers to this event's query text
+  return 0;  // NOTE(review): destructor restores only a non-NULL saved query - confirm thd->query() cannot be NULL here
+}
+#endif
+
+#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION)
+int Annotate_rows_log_event::do_update_pos(Relay_log_info *rli)
+{
+  rli->inc_event_relay_log_pos();  // presumably moves only the event position, not the group position - verify
+  return 0;
+}
+#endif
+
+#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION)
+Log_event::enum_skip_reason
+Annotate_rows_log_event::do_shall_skip(Relay_log_info *rli)
+{
+  return continue_group(rli);  // the skip decision is delegated to continue_group()
+}
+#endif
+
+/**************************************************************************
 	Table_map_log_event member functions and support functions
 **************************************************************************/
 
@@ -8587,11 +9199,11 @@ bool Table_map_log_event::write_data_header(IO_CACHE *file)
                   {
                     int4store(buf + 0, m_table_id);
                     int2store(buf + 4, m_flags);
-                    return (my_b_safe_write(file, buf, 6));
+                    return (wrapper_my_b_safe_write(file, buf, 6));
                   });
   int6store(buf + TM_MAPID_OFFSET, (ulonglong)m_table_id);
   int2store(buf + TM_FLAGS_OFFSET, m_flags);
-  return (my_b_safe_write(file, buf, TABLE_MAP_HEADER_LEN));
+  return (wrapper_my_b_safe_write(file, buf, TABLE_MAP_HEADER_LEN));
 }
 
 bool Table_map_log_event::write_data_body(IO_CACHE *file)
@@ -8615,15 +9227,15 @@ bool Table_map_log_event::write_data_body(IO_CACHE *file)
   uchar mbuf[sizeof(m_field_metadata_size)];
   uchar *const mbuf_end= net_store_length(mbuf, m_field_metadata_size);
 
-  return (my_b_safe_write(file, dbuf,      sizeof(dbuf)) ||
-          my_b_safe_write(file, (const uchar*)m_dbnam,   m_dblen+1) ||
-          my_b_safe_write(file, tbuf,      sizeof(tbuf)) ||
-          my_b_safe_write(file, (const uchar*)m_tblnam,  m_tbllen+1) ||
-          my_b_safe_write(file, cbuf, (size_t) (cbuf_end - cbuf)) ||
-          my_b_safe_write(file, m_coltype, m_colcnt) ||
-          my_b_safe_write(file, mbuf, (size_t) (mbuf_end - mbuf)) ||
-          my_b_safe_write(file, m_field_metadata, m_field_metadata_size),
-          my_b_safe_write(file, m_null_bits, (m_colcnt + 7) / 8));
+  return (wrapper_my_b_safe_write(file, dbuf,      sizeof(dbuf)) ||
+          wrapper_my_b_safe_write(file, (const uchar*)m_dbnam,   m_dblen+1) ||
+          wrapper_my_b_safe_write(file, tbuf,      sizeof(tbuf)) ||
+          wrapper_my_b_safe_write(file, (const uchar*)m_tblnam,  m_tbllen+1) ||
+          wrapper_my_b_safe_write(file, cbuf, (size_t) (cbuf_end - cbuf)) ||
+          wrapper_my_b_safe_write(file, m_coltype, m_colcnt) ||
+          wrapper_my_b_safe_write(file, mbuf, (size_t) (mbuf_end - mbuf)) ||
+          wrapper_my_b_safe_write(file, m_field_metadata, m_field_metadata_size) ||
+          wrapper_my_b_safe_write(file, m_null_bits, (m_colcnt + 7) / 8)); // || throughout: a comma would drop earlier errors
  }
 #endif
 
@@ -9224,6 +9836,86 @@ record_compare_exit:
   return result;
 }
 
+
+/**
+  Find the best key to use when locating the row in @c find_row().
+
+  A primary key is preferred if it exists; otherwise a unique index without
+  NULL-able parts. Else we pick the index with the smallest rec_per_key value.
+
+  If a suitable key is found, set @c m_key, @c m_key_nr and @c m_key_info
+  member fields appropriately; if none is usable, @c m_key_info is set to NULL.
+
+  @returns 0 on success (key or no key), HA_ERR_OUT_OF_MEM on failure.
+*/
+int Rows_log_event::find_key()
+{
+  uint i, best_key_nr, last_part;
+  KEY *key, *best_key;
+  ulong best_rec_per_key, tmp;
+  DBUG_ENTER("Rows_log_event::find_key");
+  DBUG_ASSERT(m_table);
+
+  best_key_nr= MAX_KEY;
+  LINT_INIT(best_key);
+  LINT_INIT(best_rec_per_key);
+
+  /*
+    Keys are sorted so that any primary key is first, followed by unique keys,
+    followed by any other. So we will automatically pick the primary key if
+    it exists.
+  */
+  for (i= 0, key= m_table->key_info; i < m_table->s->keys; i++, key++)
+  {
+    if (!m_table->s->keys_in_use.is_set(i))
+      continue;
+    /*
+      We cannot use a unique key with NULL-able columns to uniquely identify
+      a row (but we can still select it for range scan below if nothing better
+      is available).
+    */
+    if ((key->flags & (HA_NOSAME | HA_NULL_PART_KEY)) == HA_NOSAME)
+    {
+      best_key_nr= i;
+      best_key= key;
+      break;
+    }
+    /*
+      We can only use a non-unique key if it allows range scans (ie. skip
+      FULLTEXT indexes and such).
+    */
+    last_part= key->key_parts - 1;
+    DBUG_PRINT("info", ("Index %s rec_per_key[%u]= %lu",
+                        key->name, last_part, key->rec_per_key[last_part]));
+    if (!(m_table->file->index_flags(i, last_part, 1) & HA_READ_NEXT))
+      continue;
+
+    tmp= key->rec_per_key[last_part];
+    if (best_key_nr == MAX_KEY || (tmp > 0 && tmp < best_rec_per_key))
+    {
+      best_key_nr= i;
+      best_key= key;
+      best_rec_per_key= tmp;
+    }
+  }
+
+  if (best_key_nr == MAX_KEY)
+  {
+    m_key_info= NULL;
+    DBUG_RETURN(0);
+  }
+
+  // Allocate buffer for key searches
+  m_key= (uchar *) my_malloc(best_key->key_length, MYF(MY_WME));
+  if (m_key == NULL)
+    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+  m_key_info= best_key;
+  m_key_nr= best_key_nr;
+
+  DBUG_RETURN(0);
+}
+
+
 /**
   Locate the current row in event's table.
 
@@ -9323,12 +10015,17 @@ int Rows_log_event::find_row(const Relay_log_info *rli)
    */ 
   store_record(table,record[1]);    
 
-  if (table->s->keys > 0 && table->s->keys_in_use.is_set(0))
+  if (m_key_info)
   {
-    DBUG_PRINT("info",("locating record using primary key (index_read)"));
+    DBUG_PRINT("info",("locating record using key #%u [%s] (index_read)",
+                       m_key_nr, m_key_info->name));
+    /* We use this to test that the correct key is used in test cases. */
+    DBUG_EXECUTE_IF("slave_crash_if_wrong_index",
+                    if(0 != strcmp(m_key_info->name,"expected_key")) abort(););
 
-    /* The 0th key is active: search the table using the index */
-    if (!table->file->inited && (error= table->file->ha_index_init(0, FALSE)))
+    /* The key is active: search the table using the index */
+    if (!table->file->inited &&
+        (error= table->file->ha_index_init(m_key_nr, FALSE)))
     {
       DBUG_PRINT("info",("ha_index_init returns error %d",error));
       table->file->print_error(error, MYF(0));
@@ -9338,14 +10035,14 @@ int Rows_log_event::find_row(const Relay_log_info *rli)
     /* Fill key data for the row */
 
     DBUG_ASSERT(m_key);
-    key_copy(m_key, table->record[0], table->key_info, 0);
+    key_copy(m_key, table->record[0], m_key_info, 0);
 
     /*
       Don't print debug messages when running valgrind since they can
       trigger false warnings.
      */
 #ifndef HAVE_valgrind
-    DBUG_DUMP("key data", m_key, table->key_info->key_length);
+    DBUG_DUMP("key data", m_key, m_key_info->key_length);
 #endif
 
     /*
@@ -9431,6 +10128,8 @@ int Rows_log_event::find_row(const Relay_log_info *rli)
       record we are looking for is stored in record[1].
      */ 
     DBUG_PRINT("info",("non-unique index, scanning it to find matching record")); 
+    /* We use this to test that the correct key is used in test cases. */
+    DBUG_EXECUTE_IF("slave_crash_if_index_scan", abort(););
 
     while (record_compare(table))
     {
@@ -9469,6 +10168,8 @@ int Rows_log_event::find_row(const Relay_log_info *rli)
   else
   {
     DBUG_PRINT("info",("locating record using table scan (rnd_next)"));
+    /* We use this to test that the correct key is used in test cases. */
+    DBUG_EXECUTE_IF("slave_crash_if_table_scan", abort(););
 
     int restart_count= 0; // Number of times scanning has restarted from top
 
@@ -9588,14 +10289,7 @@ Delete_rows_log_event::do_before_row_operations(const Slave_reporting_capability
     return 0;
   }
 
-  if (m_table->s->keys > 0)
-  {
-    // Allocate buffer for key searches
-    m_key= (uchar*)my_malloc(m_table->key_info->key_length, MYF(MY_WME));
-    if (!m_key)
-      return HA_ERR_OUT_OF_MEM;
-  }
-  return 0;
+  return find_key();
 }
 
 int 
@@ -9606,6 +10300,7 @@ Delete_rows_log_event::do_after_row_operations(const Slave_reporting_capability
   m_table->file->ha_index_or_rnd_end();
   my_free(m_key);
   m_key= NULL;
+  m_key_info= NULL;
 
   return error;
 }
@@ -9708,13 +10403,9 @@ Update_rows_log_event::Update_rows_log_event(const char *buf, uint event_len,
 int 
 Update_rows_log_event::do_before_row_operations(const Slave_reporting_capability *const)
 {
-  if (m_table->s->keys > 0)
-  {
-    // Allocate buffer for key searches
-    m_key= (uchar*)my_malloc(m_table->key_info->key_length, MYF(MY_WME));
-    if (!m_key)
-      return HA_ERR_OUT_OF_MEM;
-  }
+  int err;
+  if ((err= find_key()))
+    return err;
 
   m_table->timestamp_field_type= TIMESTAMP_NO_AUTO_SET;
 
@@ -9729,6 +10420,7 @@ Update_rows_log_event::do_after_row_operations(const Slave_reporting_capability
   m_table->file->ha_index_or_rnd_end();
   my_free(m_key); // Free for multi_malloc
   m_key= NULL;
+  m_key_info= NULL;
 
   return error;
 }
@@ -9904,13 +10596,25 @@ Incident_log_event::write_data_header(IO_CACHE *file)
   DBUG_PRINT("enter", ("m_incident: %d", m_incident));
   uchar buf[sizeof(int16)];
   int2store(buf, (int16) m_incident);
-  DBUG_RETURN(my_b_safe_write(file, buf, sizeof(buf)));
+#ifndef MYSQL_CLIENT
+  DBUG_RETURN(wrapper_my_b_safe_write(file, buf, sizeof(buf)));
+#else
+   DBUG_RETURN(my_b_safe_write(file, buf, sizeof(buf)));
+#endif
 }
 
 bool
 Incident_log_event::write_data_body(IO_CACHE *file)
 {
+  uchar tmp[1];
   DBUG_ENTER("Incident_log_event::write_data_body");
+  tmp[0]= (uchar) m_message.length;
+  crc= my_checksum(crc, (uchar*) tmp, 1);
+  if (m_message.length > 0)
+  {
+    crc= my_checksum(crc, (uchar*) m_message.str, m_message.length);
+    // todo: report a bug on write_str accepts uint but treats it as uchar
+  }
   DBUG_RETURN(write_str(file, m_message.str, (uint) m_message.length));
 }
 
diff --git a/sql/log_event.h b/sql/log_event.h
index 3a54702c2d1..48c781a04fb 100644
--- a/sql/log_event.h
+++ b/sql/log_event.h
@@ -77,6 +77,7 @@ class String;
 #define LOG_READ_MEM    -5
 #define LOG_READ_TRUNC  -6
 #define LOG_READ_TOO_LARGE -7
+#define LOG_READ_CHECKSUM_FAILURE -8
 
 #define LOG_EVENT_OFFSET 4
 
@@ -256,6 +257,8 @@ struct sql_ex_info
 #define EXECUTE_LOAD_QUERY_HEADER_LEN  (QUERY_HEADER_LEN + EXECUTE_LOAD_QUERY_EXTRA_HEADER_LEN)
 #define INCIDENT_HEADER_LEN    2
 #define HEARTBEAT_HEADER_LEN   0
+#define ANNOTATE_ROWS_HEADER_LEN  0
+
 /* 
   Max number of possible extra bytes in a replication event compared to a
   packet (i.e. a query) sent from client to master;
@@ -271,6 +274,7 @@ struct sql_ex_info
                                    1 + 2          /* type, charset_database_number */ + \
                                    1 + 8          /* type, table_map_for_update */ + \
                                    1 + 4          /* type, master_data_written */ + \
+                                   1 + 3          /* type, sec_part of NOW() */ + \
                                    1 + 16 + 1 + 60/* type, user_len, user, host_len, host */)
 #define MAX_LOG_EVENT_HEADER   ( /* in order of Query_log_event::write */ \
   LOG_EVENT_HEADER_LEN + /* write_header */ \
@@ -342,6 +346,8 @@ struct sql_ex_info
 
 #define Q_INVOKER 11
 
+#define Q_HRNOW 128
+
 /* Intvar event post-header */
 
 /* Intvar event data */
@@ -530,6 +536,22 @@ struct sql_ex_info
 #endif
 #undef EXPECTED_OPTIONS         /* You shouldn't use this one */
 
+enum enum_binlog_checksum_alg {
+  BINLOG_CHECKSUM_ALG_OFF= 0,    // Events carry no checksum even though their
+                                 // generator is a checksum-capable New Master (NM).
+  BINLOG_CHECKSUM_ALG_CRC32= 1,  // CRC32 of zlib algorithm.
+  BINLOG_CHECKSUM_ALG_ENUM_END,  // the cut line: valid alg range is [1, 0x7f].
+  BINLOG_CHECKSUM_ALG_UNDEF= 255 // special value tagging a checksum that is not yet
+                                 // determined, or events from checksum-unaware servers
+};
+
+#define CHECKSUM_CRC32_SIGNATURE_LEN 4
+/**
+   defined statically while there is just one alg implemented
+*/
+#define BINLOG_CHECKSUM_LEN CHECKSUM_CRC32_SIGNATURE_LEN
+#define BINLOG_CHECKSUM_ALG_DESC_LEN 1  /* 1 byte checksum alg descriptor */
+
 /**
   @enum Log_event_type
 
@@ -599,6 +621,15 @@ enum Log_event_type
     Existing events (except ENUM_END_EVENT) should never change their numbers
   */
 
+  /* New MySQL/Sun events are to be added right above this comment */
+  MYSQL_EVENTS_END,
+
+  MARIA_EVENTS_BEGIN= 160,
+  /* New Maria event numbers start from here */
+  ANNOTATE_ROWS_EVENT= 160,
+
+  /* Add new MariaDB events here - right above this comment!  */
+
   ENUM_END_EVENT /* end marker */
 };
 
@@ -945,7 +976,8 @@ public:
     execution time, which guarantees good replication (otherwise, we
     could have a query and its event with different timestamps).
   */
-  time_t when;
+  my_time_t when;
+  ulong     when_sec_part;
   /* The number of seconds the query took to run on the master. */
   ulong exec_time;
   /* Number of bytes written by write() function */
@@ -963,11 +995,7 @@ public:
     LOG_EVENT_SUPPRESS_USE_F for notes.
   */
   uint16 flags;
-  
-  /*
-    Defines the type of the cache, if any, where the event will be
-    stored before being flushed to disk.
-  */
+
   uint16 cache_type;
 
   /**
@@ -976,6 +1004,11 @@ public:
   */
   ulong slave_exec_mode;
 
+  /**
+    Placeholder for event checksum while writing to binlog.
+   */
+  ha_checksum crc;
+
 #ifdef MYSQL_SERVER
   THD* thd;
 
@@ -995,9 +1028,10 @@ public:
   static Log_event* read_log_event(IO_CACHE* file,
                                    mysql_mutex_t* log_lock,
                                    const Format_description_log_event
-                                   *description_event);
+                                   *description_event,
+                                   my_bool crc_check);
   static int read_log_event(IO_CACHE* file, String* packet,
-                            mysql_mutex_t* log_lock);
+                            mysql_mutex_t* log_lock, uint8 checksum_alg_arg);
   /*
     init_show_field_list() prepares the column names and types for the
     output of SHOW BINLOG EVENTS; it is used only by SHOW BINLOG
@@ -1024,7 +1058,7 @@ public:
     /* avoid having to link mysqlbinlog against libpthread */
   static Log_event* read_log_event(IO_CACHE* file,
                                    const Format_description_log_event
-                                   *description_event);
+                                   *description_event, my_bool crc_check);
   /* print*() functions are used by mysqlbinlog */
   virtual void print(FILE* file, PRINT_EVENT_INFO* print_event_info) = 0;
   void print_timestamp(IO_CACHE* file, time_t *ts = 0);
@@ -1033,6 +1067,15 @@ public:
   void print_base64(IO_CACHE* file, PRINT_EVENT_INFO* print_event_info,
                     bool is_more);
 #endif
+  /* 
+     The value is set by caller of FD constructor and
+     Log_event::write_header() for the rest.
+     In the FD case it's propagated into the last byte 
+     of post_header_len[] at FD::write().
+     On the slave side the value is assigned from post_header_len[last] 
+     of the last seen FD event.
+  */
+  uint8 checksum_alg;
 
   static void *operator new(size_t size)
   {
@@ -1047,29 +1090,46 @@ public:
   /* Placement version of the above operators */
   static void *operator new(size_t, void* ptr) { return ptr; }
   static void operator delete(void*, void*) { }
+  bool wrapper_my_b_safe_write(IO_CACHE* file, const uchar* buf, ulong data_length);
 
 #ifdef MYSQL_SERVER
   bool write_header(IO_CACHE* file, ulong data_length);
+  bool write_footer(IO_CACHE* file);
+  my_bool need_checksum();
+
   virtual bool write(IO_CACHE* file)
   {
-    return (write_header(file, get_data_size()) ||
-            write_data_header(file) ||
-            write_data_body(file));
+    return(write_header(file, get_data_size()) ||
+	   write_data_header(file) ||
+	   write_data_body(file) ||
+	   write_footer(file));
   }
   virtual bool write_data_header(IO_CACHE* file)
   { return 0; }
   virtual bool write_data_body(IO_CACHE* file __attribute__((unused)))
   { return 0; }
-  inline time_t get_time()
+  inline my_time_t get_time()
   {
     THD *tmp_thd;
     if (when)
       return when;
     if (thd)
-      return thd->start_time;
+    {
+      when= thd->start_time;
+      when_sec_part= thd->start_time_sec_part;
+      return when;
+    }
+    /* thd will only be 0 here at time of log creation */
     if ((tmp_thd= current_thd))
-      return tmp_thd->start_time;
-    return my_time(0);
+    {
+      when= tmp_thd->start_time;
+      when_sec_part= tmp_thd->start_time_sec_part;
+      return when;
+    }
+    my_hrtime_t hrtime= my_hrtime();
+    when= hrtime_to_my_time(hrtime);
+    when_sec_part= hrtime_sec_part(hrtime);
+    return when;
   }
 #endif
   virtual Log_event_type get_type_code() = 0;
@@ -1115,7 +1175,7 @@ public:
   static Log_event* read_log_event(const char* buf, uint event_len,
 				   const char **error,
                                    const Format_description_log_event
-                                   *description_event);
+                                   *description_event, my_bool crc_check);
   /**
     Returns the human readable name of the given event type.
   */
@@ -2305,9 +2365,17 @@ public:
   */
   uint8 common_header_len;
   uint8 number_of_event_types;
-  /* The list of post-headers' lengthes */
+  /* 
+     The list of post-headers' lengths followed 
+     by the checksum alg description byte
+  */
   uint8 *post_header_len;
-  uchar server_version_split[3];
+  struct master_version_split {
+    enum {KIND_MYSQL, KIND_MARIADB};
+    int kind;
+    uchar ver[3];
+  };
+  master_version_split server_version_split;
   const uint8 *event_type_permutation;
 
   Format_description_log_event(uint8 binlog_ver, const char* server_ver=0);
@@ -2339,7 +2407,7 @@ public:
   }
 
   void calc_server_version_split();
-
+  static bool is_version_before_checksum(const master_version_split *version_split);
 protected:
 #if defined(MYSQL_SERVER) && defined(HAVE_REPLICATION)
   virtual int do_apply_event(Relay_log_info const *rli);
@@ -2516,7 +2584,12 @@ class Xid_log_event: public Log_event
    my_xid xid;
 
 #ifdef MYSQL_SERVER
-  Xid_log_event(THD* thd_arg, my_xid x): Log_event(thd_arg, 0, TRUE), xid(x) {}
+  Xid_log_event(THD* thd_arg, my_xid x, bool direct):
+   Log_event(thd_arg, 0, TRUE), xid(x)
+   {
+     if (direct)
+       cache_type= Log_event::EVENT_NO_CACHE;
+   }
 #ifdef HAVE_REPLICATION
   void pack_info(Protocol* protocol);
 #endif /* HAVE_REPLICATION */
@@ -3070,6 +3143,59 @@ public:
 char *str_to_hex(char *to, const char *from, uint len);
 
 /**
+  @class Annotate_rows_log_event
+
+  In row-based mode, if binlog_annotate_rows_events = ON, each group of
+  Table_map_log_events is preceded by an Annotate_rows_log_event which
+  contains the query which caused the subsequent rows operations.
+
+  The Annotate_rows_log_event has no post-header and its body contains
+  the corresponding query (without trailing zero). Note. The query length
+  is to be calculated as a difference between the whole event length and
+  the common header length.
+*/
+class Annotate_rows_log_event: public Log_event
+{
+public:
+#ifndef MYSQL_CLIENT
+  Annotate_rows_log_event(THD*, uint16 cache_type_arg);  // takes the query from THD
+#endif
+  Annotate_rows_log_event(const char *buf, uint event_len,
+                          const Format_description_log_event*);  // query points into buf
+  ~Annotate_rows_log_event();
+
+  virtual int get_data_size();
+  virtual Log_event_type get_type_code();
+  virtual bool is_valid() const;
+
+#ifndef MYSQL_CLIENT
+  virtual bool write_data_header(IO_CACHE*);
+  virtual bool write_data_body(IO_CACHE*);
+#endif
+
+#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION)
+  virtual void pack_info(Protocol*);
+#endif
+
+#ifdef MYSQL_CLIENT
+  virtual void print(FILE*, PRINT_EVENT_INFO*);
+#endif
+
+#if !defined(MYSQL_CLIENT) && defined(HAVE_REPLICATION)
+private:
+  virtual int do_apply_event(Relay_log_info const*);
+  virtual int do_update_pos(Relay_log_info*);
+  virtual enum_skip_reason do_shall_skip(Relay_log_info*);
+#endif
+
+private:
+  char *m_query_txt;           // query text; has no trailing zero when read from an event
+  uint  m_query_len;           // length of m_query_txt
+  char *m_save_thd_query_txt;  // THD's query as saved by do_apply_event(), else 0
+  uint  m_save_thd_query_len;  // length of the saved query
+};
+
+/**
   @class Table_map_log_event
 
   In row-based mode, every row operation event is preceded by a
@@ -3674,7 +3800,10 @@ protected:
   const uchar *m_curr_row;     /* Start of the row being processed */
   const uchar *m_curr_row_end; /* One-after the end of the current row */
   uchar    *m_key;      /* Buffer to keep key value during searches */
+  KEY      *m_key_info; /* Pointer to KEY info for m_key_nr */
+  uint      m_key_nr;   /* Key number */
 
+  int find_key(); // Find a best key to use in find_row()
   int find_row(const Relay_log_info *const);
   int write_row(const Relay_log_info *const, const bool);
 
@@ -4100,6 +4229,10 @@ bool rpl_get_position_info(const char **log_file_name, ulonglong *log_pos,
                            const char **group_relay_log_name,
                            ulonglong *relay_log_pos);
 
+bool event_checksum_test(uchar *buf, ulong event_len, uint8 alg);
+uint8 get_checksum_alg(const char* buf, ulong len);
+extern TYPELIB binlog_checksum_typelib;
+
 /**
   @} (end of group Replication)
 */
diff --git a/sql/log_event_old.cc b/sql/log_event_old.cc
index c8a5bf01a74..c6efa91f375 100644
--- a/sql/log_event_old.cc
+++ b/sql/log_event_old.cc
@@ -149,7 +149,7 @@ Old_rows_log_event::do_apply_event(Old_rows_log_event *ev, const Relay_log_info
       const_cast<Relay_log_info*>(rli)->m_table_map.set_table(ptr->table_id, ptr->table);
     }
 #ifdef HAVE_QUERY_CACHE
-    query_cache.invalidate_locked_for_write(rli->tables_to_lock);
+    query_cache.invalidate_locked_for_write(thd, rli->tables_to_lock);
 #endif
   }
 
@@ -171,7 +171,7 @@ Old_rows_log_event::do_apply_event(Old_rows_log_event *ev, const Relay_log_info
       TIMESTAMP column to a table with one.
       So we call set_time(), like in SBR. Presently it changes nothing.
     */
-    ev_thd->set_time((time_t)ev->when);
+    ev_thd->set_time(ev->when, ev->when_sec_part);
     /*
       There are a few flags that are replicated with each row event.
       Make sure to set/clear them before executing the main body of
@@ -1564,7 +1564,7 @@ int Old_rows_log_event::do_apply_event(Relay_log_info const *rli)
       const_cast<Relay_log_info*>(rli)->m_table_map.set_table(ptr->table_id, ptr->table);
     }
 #ifdef HAVE_QUERY_CACHE
-    query_cache.invalidate_locked_for_write(rli->tables_to_lock);
+    query_cache.invalidate_locked_for_write(thd, rli->tables_to_lock);
 #endif
   }
 
@@ -1588,7 +1588,7 @@ int Old_rows_log_event::do_apply_event(Relay_log_info const *rli)
       TIMESTAMP column to a table with one.
       So we call set_time(), like in SBR. Presently it changes nothing.
     */
-    thd->set_time((time_t)when);
+    thd->set_time(when, when_sec_part);
     /*
       There are a few flags that are replicated with each row event.
       Make sure to set/clear them before executing the main body of
diff --git a/sql/log_event_old.h b/sql/log_event_old.h
index 719802a80fb..da5cf403fdb 100644
--- a/sql/log_event_old.h
+++ b/sql/log_event_old.h
@@ -1,4 +1,4 @@
-/* Copyright 2007 MySQL AB. All rights reserved.
+/* Copyright 2007 MySQL AB.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index b338bc147bc..130ab676e7b 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -1,5 +1,22 @@
+/* Copyright (C) 2010, 2011 Monty Program Ab
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
 #include "sql_parse.h"
+#include <my_bit.h>
 #include "sql_select.h"
+#include "key.h"
 
 /****************************************************************************
  * Default MRR implementation (MRR to non-MRR converter)
@@ -136,10 +153,16 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
 */
 
 ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows,
-                                       uint *bufsz, uint *flags, COST_VECT *cost)
+                                       uint key_parts, uint *bufsz, 
+                                       uint *flags, COST_VECT *cost)
 {
-  *bufsz= 0; /* Default implementation doesn't need a buffer */
+  /* 
+    Currently we expect this function to be called only in preparation of scan
+    with HA_MRR_SINGLE_POINT property ('&' tests the bit; '|' is always true).
+  */
+  DBUG_ASSERT(*flags & HA_MRR_SINGLE_POINT);
 
+  *bufsz= 0; /* Default implementation doesn't need a buffer */
   *flags |= HA_MRR_USE_DEFAULT_IMPL;
 
   cost->zero();
@@ -207,7 +230,6 @@ handler::multi_range_read_init(RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
   DBUG_RETURN(0);
 }
 
-
 /**
   Get next record in MRR scan
 
@@ -221,10 +243,10 @@ handler::multi_range_read_init(RANGE_SEQ_IF *seq_funcs, void *seq_init_param,
   @retval other  Error code
 */
 
-int handler::multi_range_read_next(char **range_info)
+int handler::multi_range_read_next(range_id_t *range_info)
 {
-  int UNINIT_VAR(result);
-  int range_res;
+  int result= HA_ERR_END_OF_FILE;
+  bool range_res;
   DBUG_ENTER("handler::multi_range_read_next");
 
   if (!mrr_have_range)
@@ -246,7 +268,14 @@ int handler::multi_range_read_next(char **range_info)
     else
     {
       if (was_semi_consistent_read())
+      {
+        /*
+          The following assignment is redundant, but for extra safety and to
+          remove the compiler warning:
+        */
+        range_res= FALSE;
         goto scan_it_again;
+      }
       /*
         We need to set this for the last range only, but checking this
         condition is more expensive than just setting the result code.
@@ -277,7 +306,459 @@ scan_it_again:
 }
 
 /****************************************************************************
- * DS-MRR implementation 
+ * Mrr_*_reader classes (building blocks for DS-MRR)
+ ***************************************************************************/
+
+int Mrr_simple_index_reader::init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 
+                                  void *seq_init_param, uint n_ranges,
+                                  uint mode,  Key_parameters *key_par_arg,
+                                  Lifo_buffer *key_buffer_arg,
+                                  Buffer_manager *buf_manager_arg) /* last 3 args are not used here */
+{
+  HANDLER_BUFFER no_buffer = {NULL, NULL, NULL}; /* all-NULL buffer descriptor */
+  file= h_arg; /* handler that get_next() will read from */
+  return file->handler::multi_range_read_init(seq_funcs, seq_init_param,
+                                              n_ranges, mode, &no_buffer); /* explicitly the handler:: version */
+}
+
+
+int Mrr_simple_index_reader::get_next(range_id_t *range_info)
+{
+  int res; /* 0 = got a record; otherwise the code that ended the loop */
+  while (!(res= file->handler::multi_range_read_next(range_info)))
+  {
+    KEY_MULTI_RANGE *curr_range= &file->handler::mrr_cur_range;
+    if (!file->mrr_funcs.skip_index_tuple ||
+        !file->mrr_funcs.skip_index_tuple(file->mrr_iter, curr_range->ptr))
+      break; /* no skip callback installed, or it did not ask to skip */
+  }
+  if (res && res != HA_ERR_END_OF_FILE && res != HA_ERR_KEY_NOT_FOUND)
+    file->print_error(res, MYF(0));             // Fatal error: neither EOF nor key-not-found
+  return res; /* returned unchanged, whether or not it was printed */
+}
+
+
+/**
+  @brief Get next index record
+
+  @param range_info  OUT identifier of range that the returned record belongs to
+  
+  @note
+    We actually iterate over nested sequences:
+    - an ordered sequence of groups of identical keys
+      - each key group has key value, which has multiple matching records 
+        - thus, each record matches all members of the key group
+
+  @retval 0                   OK, next record was successfully read
+  @retval HA_ERR_END_OF_FILE  End of records
+  @retval Other               Some other error; Error is printed
+*/
+
+int Mrr_ordered_index_reader::get_next(range_id_t *range_info)
+{
+  int res;
+  DBUG_ENTER("Mrr_ordered_index_reader::get_next");
+  
+  for(;;)
+  {
+    if (!scanning_key_val_iter) /* need to (re)start the key/value iterator */
+    {
+      while ((res= kv_it.init(this)))
+      {
+        if ((res != HA_ERR_KEY_NOT_FOUND && res != HA_ERR_END_OF_FILE))
+          DBUG_RETURN(res); /* Some fatal error */
+
+        if (key_buffer->is_empty())
+        {
+          DBUG_RETURN(HA_ERR_END_OF_FILE); /* soft failure and no keys left */
+        }
+      } /* soft failure with keys still buffered: retry kv_it.init() */
+      scanning_key_val_iter= TRUE;
+    }
+
+    if ((res= kv_it.get_next(range_info)))
+    {
+      scanning_key_val_iter= FALSE; /* next pass goes through kv_it.init() */
+      if ((res != HA_ERR_KEY_NOT_FOUND && res != HA_ERR_END_OF_FILE))
+        DBUG_RETURN(res); /* Some fatal error */
+      kv_it.move_to_next_key_value();
+      continue;
+    }
+    if (!skip_index_tuple(*range_info) &&
+        !skip_record(*range_info, NULL))
+    {
+      break; /* neither filter asked to skip: return this record */
+    }
+    /* Go get another (record, range_id) combination */
+  } /* for(;;) */
+
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Supply index reader with the O(1) space it needs for scan interrupt/restore
+  operation. Returns TRUE (nothing taken) if there is too little space.
+*/
+
+bool Mrr_ordered_index_reader::set_interruption_temp_buffer(uint rowid_length,
+                                                            uint key_len, 
+                                                            uint saved_pk_len,
+                                                            uchar **space_start,
+                                                            uchar *space_end)
+{
+  if (space_end - *space_start <= (ptrdiff_t)(rowid_length + key_len + saved_pk_len))
+    return TRUE; /* note '<=': an exact fit is rejected as well */
+  support_scan_interruptions= TRUE; 
+  
+  saved_rowid= *space_start; /* layout: rowid, [primary key], key tuple */
+  *space_start += rowid_length;
+  
+  if (saved_pk_len)
+  {
+    saved_primary_key= *space_start;
+    *space_start += saved_pk_len;
+  }
+  else
+    saved_primary_key= NULL; /* interrupt_read()/resume_read() test this */
+
+  saved_key_tuple= *space_start;
+  *space_start += key_len; /* caller's pointer now points past our area */
+
+  have_saved_rowid= FALSE; /* nothing has been saved yet */
+  return FALSE;
+}
+
+void Mrr_ordered_index_reader::set_no_interruption_temp_buffer()
+{
+  support_scan_interruptions= FALSE; /* interrupt_read() asserts this flag */
+  saved_key_tuple= saved_rowid= saved_primary_key= NULL; /* safety */
+  have_saved_rowid= FALSE; /* position() will not use saved_rowid */
+}
+
+void Mrr_ordered_index_reader::interrupt_read()
+{
+  DBUG_ASSERT(support_scan_interruptions); /* save areas must have been set up */
+  TABLE *table= file->get_table();
+  /* Save the current key value (active index) taken from record[0] */
+  key_copy(saved_key_tuple, table->record[0],
+           &table->key_info[file->active_index],
+           keypar.key_tuple_length);
+  
+  if (saved_primary_key) /* non-NULL only if a PK save area was reserved */
+  {
+    key_copy(saved_primary_key, table->record[0], 
+             &table->key_info[table->s->primary_key],
+             table->key_info[table->s->primary_key].key_length);
+  }
+
+  /* Save the last rowid (file->ref, ref_length bytes) */
+  memcpy(saved_rowid, file->ref, file->ref_length);
+  have_saved_rowid= TRUE; /* position() will now copy saved_rowid back */
+}
+
+void Mrr_ordered_index_reader::position()
+{
+  if (have_saved_rowid) /* set by interrupt_read() */
+    memcpy(file->ref, saved_rowid, file->ref_length); /* put saved rowid back into file->ref */
+  else
+    Mrr_index_reader::position(); /* base class: file->position(record[0]) */
+}
+
+void Mrr_ordered_index_reader::resume_read()
+{
+  TABLE *table= file->get_table();
+  key_restore(table->record[0], saved_key_tuple, 
+              &table->key_info[file->active_index],
+              keypar.key_tuple_length); /* mirrors the key_copy() in interrupt_read() */
+  if (saved_primary_key) /* NULL when no PK save area was reserved */
+  {
+    key_restore(table->record[0], saved_primary_key, 
+                &table->key_info[table->s->primary_key],
+                table->key_info[table->s->primary_key].key_length);
+  }
+}
+
+
+/**
+  Fill the key buffer with lookup keys (or {key, range_id} pairs) and sort it
+*/
+
+int Mrr_ordered_index_reader::refill_buffer(bool initial) /* 'initial' is not used here */
+{
+  KEY_MULTI_RANGE cur_range;
+  DBUG_ENTER("Mrr_ordered_index_reader::refill_buffer");
+
+  DBUG_ASSERT(key_buffer->is_empty());
+
+  if (source_exhausted)
+    DBUG_RETURN(HA_ERR_END_OF_FILE); /* a previous call drained the range sequence */
+
+  buf_manager->reset_buffer_sizes(buf_manager->arg);
+  key_buffer->reset();
+  key_buffer->setup_writing(keypar.key_size_in_keybuf,
+                            is_mrr_assoc? sizeof(range_id_t) : 0); /* 2nd element only if ranges carry ids */
+
+  while (key_buffer->can_write() && 
+         !(source_exhausted= mrr_funcs.next(mrr_iter, &cur_range)))
+  {
+    DBUG_ASSERT(cur_range.range_flag & EQ_RANGE);
+
+    /* Put key, or {key, range_id} pair into the buffer */
+    key_buffer->write_ptr1= keypar.use_key_pointers ?
+                              (uchar*)&cur_range.start_key.key : 
+                              (uchar*)cur_range.start_key.key; /* pointer-to-key vs. key bytes */
+    key_buffer->write_ptr2= (uchar*)&cur_range.ptr;
+    key_buffer->write();
+  }
+  
+  /* Force get_next() to start with kv_it.init() call: */
+  scanning_key_val_iter= FALSE;
+
+  if (source_exhausted && key_buffer->is_empty())
+    DBUG_RETURN(HA_ERR_END_OF_FILE); /* nothing was read at all */
+
+  key_buffer->sort((key_buffer->type() == Lifo_buffer::FORWARD)? 
+                     (qsort2_cmp)Mrr_ordered_index_reader::compare_keys_reverse : 
+                     (qsort2_cmp)Mrr_ordered_index_reader::compare_keys, 
+                   this); /* 'this' is the comparator's first argument */
+  DBUG_RETURN(0);
+}
+
+
+int Mrr_ordered_index_reader::init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
+                                   void *seq_init_param, uint n_ranges,
+                                   uint mode, Key_parameters *key_par_arg,
+                                   Lifo_buffer *key_buffer_arg,
+                                   Buffer_manager *buf_manager_arg)
+{
+  file= h_arg;
+  key_buffer= key_buffer_arg;
+  buf_manager= buf_manager_arg;
+  keypar= *key_par_arg; /* copied by value; key_par_arg is dereferenced unchecked */
+
+  KEY *key_info= &file->get_table()->key_info[file->active_index];
+  keypar.index_ranges_unique= test(key_info->flags & HA_NOSAME && 
+                                   key_info->key_parts == 
+                                   my_count_bits(keypar.key_tuple_map)); /* HA_NOSAME and one map bit per key part */
+
+  mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode);
+  is_mrr_assoc=    !test(mode & HA_MRR_NO_ASSOCIATION);
+  mrr_funcs= *seq_funcs; /* keep our own copy of the callbacks */
+  source_exhausted= FALSE;
+  if (support_scan_interruptions)
+    bzero(saved_key_tuple, keypar.key_tuple_length); /* clear the key save area */
+  have_saved_rowid= FALSE;
+  return 0; /* this function has no failure path */
+}
+
+
+static int rowid_cmp_reverse(void *file, uchar *a, uchar *b) /* 'file' is really a handler* */
+{
+  return - ((handler*)file)->cmp_ref(a, b); /* negated cmp_ref(): reversed ordering */
+}
+
+
+int Mrr_ordered_rndpos_reader::init(handler *h_arg, 
+                                    Mrr_index_reader *index_reader_arg,
+                                    uint mode,
+                                    Lifo_buffer *buf)
+{
+  file= h_arg; /* handler used for the ha_rnd_pos() calls in get_next() */
+  index_reader= index_reader_arg; /* source of rowids */
+  rowid_buffer= buf;
+  is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
+  index_reader_exhausted= FALSE;
+  index_reader_needs_refill= TRUE; /* first refill_buffer() must refill the index reader */
+  return 0; /* this function has no failure path */
+}
+
+
+/**
+  DS-MRR: Fill and sort the rowid buffer
+
+  Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into 
+  buffer. When the buffer is full or scan is completed, sort the buffer by 
+  rowid and return.
+
+  When this function returns, either rowid buffer is not empty, or the source
+  of lookup keys (i.e. ranges) is exhausted.
+  
+  @retval 0      OK, the next portion of rowids is in the buffer,
+                 properly ordered
+  @retval other  Error (HA_ERR_END_OF_FILE once the index reader is exhausted)
+*/
+
+int Mrr_ordered_rndpos_reader::refill_buffer(bool initial)
+{
+  int res; /* always assigned before DBUG_RETURN: by the loop condition or body */
+  DBUG_ENTER("Mrr_ordered_rndpos_reader::refill_buffer");
+
+  if (index_reader_exhausted)
+    DBUG_RETURN(HA_ERR_END_OF_FILE);
+
+  while (initial || index_reader_needs_refill || 
+         (res= refill_from_index_reader()) == HA_ERR_END_OF_FILE)
+  {
+    if ((res= index_reader->refill_buffer(initial)))
+    {
+      if (res == HA_ERR_END_OF_FILE)
+        index_reader_exhausted= TRUE; /* later calls return EOF immediately */
+      break;
+    }
+    initial= FALSE; /* from now on the loop condition calls refill_from_index_reader() */
+    index_reader_needs_refill= FALSE;
+  }
+  DBUG_RETURN(res);
+}
+
+
+void Mrr_index_reader::position() /* default: ask the handler directly */
+{
+  file->position(file->get_table()->record[0]); /* handler's position() on the current record[0] */
+}
+
+
+/* 
+  @brief Try to refill the rowid buffer without calling
+  index_reader->refill_buffer(). Returns 0, HA_ERR_END_OF_FILE or an error.
+*/
+
+int Mrr_ordered_rndpos_reader::refill_from_index_reader()
+{
+  range_id_t range_info;
+  int res;
+  DBUG_ENTER("Mrr_ordered_rndpos_reader::refill_from_index_reader");
+
+  DBUG_ASSERT(rowid_buffer->is_empty());
+  index_rowid= index_reader->get_rowid_ptr();
+  rowid_buffer->reset();
+  rowid_buffer->setup_writing(file->ref_length,
+                              is_mrr_assoc? sizeof(range_id_t) : 0);
+
+  last_identical_rowid= NULL; /* get_next() starts with no pending duplicates */
+
+  index_reader->resume_read();
+  while (rowid_buffer->can_write())
+  {
+    res= index_reader->get_next(&range_info);
+
+    if (res)
+    {
+      if (res != HA_ERR_END_OF_FILE)
+        DBUG_RETURN(res); /* error: returns without interrupt_read() or sorting */
+      index_reader_needs_refill=TRUE; /* tells refill_buffer() to refill the index reader */
+      break;
+    }
+
+    index_reader->position();
+
+    /* Put rowid, or {rowid, range_id} pair into the buffer */
+    rowid_buffer->write_ptr1= index_rowid;
+    rowid_buffer->write_ptr2= (uchar*)&range_info;
+    rowid_buffer->write();
+  }
+   
+  index_reader->interrupt_read();
+  /* Sort the buffer contents by rowid */
+  rowid_buffer->sort((qsort2_cmp)rowid_cmp_reverse, (void*)file);
+
+  rowid_buffer->setup_reading(file->ref_length,
+                              is_mrr_assoc ? sizeof(range_id_t) : 0);
+  DBUG_RETURN(rowid_buffer->is_empty()? HA_ERR_END_OF_FILE : 0); /* EOF = nothing was collected */
+}
+
+
+/*
+  Get the next {record, range_id} using ordered array of rowid+range_id pairs
+
+  @note
+    Since we have sorted rowids, we try not to make multiple rnd_pos() calls
+    with the same rowid value.
+*/
+
+int Mrr_ordered_rndpos_reader::get_next(range_id_t *range_info)
+{
+  int res;
+  
+  /* 
+    First, check if rowid buffer has elements with the same rowid value as
+    the previous.
+  */
+  while (last_identical_rowid)
+  {
+    /*
+      Current record (the one we've returned in previous call) was obtained
+      from a rowid that matched multiple range_ids. Return this record again,
+      with next matching range_id.
+    */
+    (void)rowid_buffer->read(); /* result deliberately ignored */
+
+    if (rowid_buffer->read_ptr1 == last_identical_rowid)
+      last_identical_rowid= NULL; /* reached the last of identical rowids */
+
+    if (!is_mrr_assoc)
+      return 0; /* no range ids in the buffer: *range_info is left untouched */
+
+    memcpy(range_info, rowid_buffer->read_ptr2, sizeof(range_id_t));
+    if (!index_reader->skip_record(*range_info, rowid_buffer->read_ptr1))
+      return 0;
+  }
+  
+  /* 
+     Ok, last_identical_rowid==NULL, it's time to read next different rowid
+     value and get record for it.
+  */
+  for(;;)
+  {
+    /* Return eof if there are no rowids in the buffer after re-fill attempt */
+    if (rowid_buffer->read())
+      return HA_ERR_END_OF_FILE;
+
+    if (is_mrr_assoc)
+    {
+      memcpy(range_info, rowid_buffer->read_ptr2, sizeof(range_id_t));
+      if (index_reader->skip_record(*range_info, rowid_buffer->read_ptr1))
+        continue; /* skipped before paying for ha_rnd_pos() */
+    }
+
+    res= file->ha_rnd_pos(file->get_table()->record[0], 
+                          rowid_buffer->read_ptr1);
+
+    if (res == HA_ERR_RECORD_DELETED)
+    {
+      /* not likely to get this code with current storage engines, but still */
+      continue;
+    }
+
+    if (res)
+      return res; /* Some fatal error */
+
+    break; /* Got another record */
+  }
+
+  /* 
+    Check if subsequent buffer elements have the same rowid value as this
+    one. If yes, remember this fact so that we don't make any more rnd_pos()
+    calls with this value.
+
+    Note: this implies that SQL layer doesn't touch table->record[0]
+    between calls.
+  */
+  Lifo_buffer_iterator it;
+  it.init(rowid_buffer);
+  while (!it.read())
+  {
+    if (file->cmp_ref(it.read_ptr1, rowid_buffer->read_ptr1))
+      break; /* first element with a different rowid ends the run */
+    last_identical_rowid= it.read_ptr1; /* ends up at the last equal element */
+  }
+  return 0;
+}
+
+
+/****************************************************************************
+ * Top-level DS-MRR implementation functions (the ones called by storage engine)
  ***************************************************************************/
 
 /**
@@ -286,7 +767,7 @@ scan_it_again:
   Initialize and start the MRR scan. Depending on the mode parameter, this
   may use default or DS-MRR implementation.
 
-  @param h               Table handler to be used
+  @param h_arg           Table handler to be used
   @param key             Index to be used
   @param seq_funcs       Interval sequence enumeration functions
   @param seq_init_param  Interval sequence enumeration parameter
@@ -302,279 +783,591 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
                            void *seq_init_param, uint n_ranges, uint mode,
                            HANDLER_BUFFER *buf)
 {
-  uint elem_size;
-  Item *pushed_cond= NULL;
-  handler *new_h2= 0;
+  THD *thd= current_thd;
+  int res;
+  Key_parameters keypar;
+  uint key_buff_elem_size;
+  handler *h_idx;
+  Mrr_ordered_rndpos_reader *disk_strategy= NULL;
+  bool do_sort_keys= FALSE;
   DBUG_ENTER("DsMrr_impl::dsmrr_init");
-
+  LINT_INIT(key_buff_elem_size); /* set/used when do_sort_keys==TRUE */
   /*
     index_merge may invoke a scan on an object for which dsmrr_info[_const]
     has not been called, so set the owner handler here as well.
   */
-  h= h_arg;
-  if (mode & HA_MRR_USE_DEFAULT_IMPL || mode & HA_MRR_SORTED)
+  primary_file= h_arg;
+  is_mrr_assoc=    !test(mode & HA_MRR_NO_ASSOCIATION);
+
+  strategy_exhausted= FALSE;
+  
+  /* By default, have do-nothing buffer manager */
+  buf_manager.arg= this;
+  buf_manager.reset_buffer_sizes= do_nothing;
+  buf_manager.redistribute_buffer_space= do_nothing;
+
+  if (mode & (HA_MRR_USE_DEFAULT_IMPL | HA_MRR_SORTED))
+    goto use_default_impl;
+  
+  /*
+    Determine whether we'll need to do key sorting and/or rnd_pos() scan
+  */
+  index_strategy= NULL;
+  if ((mode & HA_MRR_SINGLE_POINT) &&
+      optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS))
   {
-    use_default_impl= TRUE;
-    const int retval=
-      h->handler::multi_range_read_init(seq_funcs, seq_init_param,
-                                        n_ranges, mode, buf);
-    DBUG_RETURN(retval);
+    do_sort_keys= TRUE;
+    index_strategy= &reader_factory.ordered_index_reader;
   }
-  rowids_buf= buf->buffer;
+  else
+    index_strategy= &reader_factory.simple_index_reader;
 
-  is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION);
+  strategy= index_strategy;
+  /*
+    We don't need a rowid-to-rndpos step if
+     - We're doing a scan on clustered primary key
+     - [In the future] We're doing an index_only read
+  */
+  DBUG_ASSERT(primary_file->inited == handler::INDEX || 
+              (primary_file->inited == handler::RND && 
+               secondary_file && 
+               secondary_file->inited == handler::INDEX));
+
+  h_idx= (primary_file->inited == handler::INDEX)? primary_file: secondary_file;
+  keyno= h_idx->active_index;
+
+  if (!(keyno == table->s->primary_key && h_idx->primary_key_is_clustered()))
+  {
+    strategy= disk_strategy= &reader_factory.ordered_rndpos_reader;
+  }
 
   if (is_mrr_assoc)
-    status_var_increment(table->in_use->status_var.ha_multi_range_read_init_count);
- 
-  rowids_buf_end= buf->buffer_end;
-  elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
-  rowids_buf_last= rowids_buf + 
-                      ((rowids_buf_end - rowids_buf)/ elem_size)*
-                      elem_size;
-  rowids_buf_end= rowids_buf_last;
+    status_var_increment(thd->status_var.ha_multi_range_read_init_count);
 
-    /*
-    There can be two cases:
-    - This is the first call since index_init(), h2==NULL
-       Need to setup h2 then.
-    - This is not the first call, h2 is initalized and set up appropriately.
-       The caller might have called h->index_init(), need to switch h to
-       rnd_pos calls.
+  full_buf= buf->buffer;
+  full_buf_end= buf->buffer_end;
+
+  if (do_sort_keys)
+  {
+    /* Pre-calculate some parameters of key sorting */
+    keypar.use_key_pointers= test(mode & HA_MRR_MATERIALIZED_KEYS);
+    seq_funcs->get_key_info(seq_init_param, &keypar.key_tuple_length, 
+                            &keypar.key_tuple_map);
+    keypar.key_size_in_keybuf= keypar.use_key_pointers? 
+                                 sizeof(char*) : keypar.key_tuple_length;
+    key_buff_elem_size= keypar.key_size_in_keybuf + (int)is_mrr_assoc * sizeof(void*);
+    
+    /* Ordered index reader needs some space to store an index tuple */
+    if (strategy != index_strategy)
+    {
+      uint saved_pk_length=0;
+      if (h_idx->primary_key_is_clustered())
+      {
+        uint pk= h_idx->get_table()->s->primary_key;
+        saved_pk_length= h_idx->get_table()->key_info[pk].key_length;
+      }
+
+      if (reader_factory.ordered_index_reader.
+            set_interruption_temp_buffer(primary_file->ref_length,
+                                         keypar.key_tuple_length,
+                                         saved_pk_length,
+                                         &full_buf, full_buf_end))
+        goto use_default_impl;
+    }
+    else
+      reader_factory.ordered_index_reader.set_no_interruption_temp_buffer();
+  }
+
+  if (strategy == index_strategy)
+  {
+    /* 
+      Index strategy alone handles the record retrieval. Give all buffer space
+      to it. Key buffer should have forward orientation so we can return the
+      end of it.
+    */
+    key_buffer= &forward_key_buf;
+    key_buffer->set_buffer_space(full_buf, full_buf_end);
+    
+    /* Safety: specify that rowid buffer has zero size: */
+    rowid_buffer.set_buffer_space(full_buf_end, full_buf_end);
+
+    if (do_sort_keys && !key_buffer->have_space_for(key_buff_elem_size))
+      goto use_default_impl;
+
+    if ((res= index_strategy->init(primary_file, seq_funcs, seq_init_param, n_ranges,
+                                   mode, &keypar, key_buffer, &buf_manager)))
+      goto error;
+  }
+  else
+  {
+    /* We'll have both index and rndpos strategies working together */
+    if (do_sort_keys)
+    {
+      /* Both strategies will need buffer space, share the buffer */
+      if (setup_buffer_sharing(keypar.key_size_in_keybuf, keypar.key_tuple_map))
+        goto use_default_impl;
+
+      buf_manager.reset_buffer_sizes= reset_buffer_sizes;
+      buf_manager.redistribute_buffer_space= redistribute_buffer_space;
+    }
+    else
+    {
+      /* index strategy doesn't need buffer, give all space to rowids*/
+      rowid_buffer.set_buffer_space(full_buf, full_buf_end);
+      if (!rowid_buffer.have_space_for(primary_file->ref_length + 
+                                       (int)is_mrr_assoc * sizeof(range_id_t)))
+        goto use_default_impl;
+    }
+
+    if ((res= setup_two_handlers()))
+      goto error;
+
+    if ((res= index_strategy->init(secondary_file, seq_funcs, seq_init_param,
+                                   n_ranges, mode, &keypar, key_buffer, 
+                                   &buf_manager)) || 
+        (res= disk_strategy->init(primary_file, index_strategy, mode, 
+                                  &rowid_buffer)))
+    {
+      goto error;
+    }
+  }
+
+  res= strategy->refill_buffer(TRUE);
+  if (res)
+  {
+    if (res != HA_ERR_END_OF_FILE)
+      goto error;
+    strategy_exhausted= TRUE;
+  }
+
+  /*
+    If we have scanned through all intervals in *seq, then adjust *buf to 
+    indicate that the remaining buffer space will not be used.
   */
-  if (!h2)
+//  if (dsmrr_eof) 
+//    buf->end_of_used_area= rowid_buffer.end_of_space();
+
+  
+  DBUG_RETURN(0);
+error:
+  close_second_handler();
+   /* Safety, not really needed but: */
+  strategy= NULL;
+  DBUG_RETURN(res);
+
+use_default_impl:
+  if (primary_file->inited != handler::INDEX)
   {
-    /* Create a separate handler object to do rndpos() calls. */
-    THD *thd= current_thd;
+    /* We can get here when 
+       - we've previously successfully done a DS-MRR scan (and so have 
+         secondary_file!= NULL, secondary_file->inited= INDEX, 
+         primary_file->inited=RND)
+       - for this invocation, we haven't got enough buffer space, and so we
+         have to use the default MRR implementation.
+
+      note: primary_file->ha_index_end() will call dsmrr_close() which will
+      close/destroy the secondary_file, this is intentional. 
+      (Yes this is slow, but one can't expect performance with join buffer 
+       so small that it can accommodate one rowid and one index tuple)
+    */
+    if ((res= primary_file->ha_rnd_end()) || 
+        (res= primary_file->ha_index_init(keyno, test(mode & HA_MRR_SORTED))))
+    {
+      DBUG_RETURN(res);
+    }
+  }
+  /* Call correct init function and assign to top level object */
+  Mrr_simple_index_reader *s= &reader_factory.simple_index_reader;
+  res= s->init(primary_file, seq_funcs, seq_init_param, n_ranges, mode, NULL, 
+               NULL, NULL);
+  strategy= s;
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Whatever the current state is, make it so that we have two handler objects:
+  - primary_file       -  initialized for rnd_pos() scan
+  - secondary_file     -  initialized for scanning the index specified in
+                          this->keyno
+  RETURN 
+    0        OK
+    HA_XXX   Error code
+*/
+
+int DsMrr_impl::setup_two_handlers()
+{
+  int res;
+  THD *thd= primary_file->get_table()->in_use;
+  DBUG_ENTER("DsMrr_impl::setup_two_handlers");
+  if (!secondary_file)
+  {
+    handler *new_h2;
+    Item *pushed_cond= NULL;
+    DBUG_ASSERT(primary_file->inited == handler::INDEX);
+    /* Create a separate handler object to do rnd_pos() calls. */
     /*
       ::clone() takes up a lot of stack, especially on 64 bit platforms.
       The constant 5 is an empiric result.
     */
     if (check_stack_overrun(thd, 5*STACK_MIN_SIZE, (uchar*) &new_h2))
       DBUG_RETURN(1);
-    DBUG_ASSERT(h->active_index != MAX_KEY);
-    uint mrr_keyno= h->active_index;
 
-    /* Create a separate handler object to do rndpos() calls. */
-    if (!(new_h2= h->clone(h->table->s->normalized_path.str, thd->mem_root)) || 
+    /* Create a separate handler object to do rnd_pos() calls. */
+    if (!(new_h2= primary_file->clone(primary_file->get_table()->s->
+                                      normalized_path.str,
+                                      thd->mem_root)) || 
         new_h2->ha_external_lock(thd, F_RDLCK))
     {
       delete new_h2;
       DBUG_RETURN(1);
     }
 
-    if (mrr_keyno == h->pushed_idx_cond_keyno)
-      pushed_cond= h->pushed_idx_cond;
-
+    if (keyno == primary_file->pushed_idx_cond_keyno)
+      pushed_cond= primary_file->pushed_idx_cond;
+    
+    Mrr_reader *save_strategy= strategy;
+    strategy= NULL;
     /*
       Caution: this call will invoke this->dsmrr_close(). Do not put the
-      created secondary table handler into this->h2 or it will delete it.
+      created secondary table handler new_h2 into this->secondary_file or it 
+      will delete it. Also, save the picked strategy
     */
-    if (h->ha_index_end())
-    {
-      h2=new_h2;
+    res= primary_file->ha_index_end();
+
+    strategy= save_strategy;
+    secondary_file= new_h2;
+
+    if (res || (res= (primary_file->ha_rnd_init(FALSE))))
       goto error;
-    }
 
-    h2= new_h2; /* Ok, now can put it into h2 */
     table->prepare_for_position();
-    h2->extra(HA_EXTRA_KEYREAD);
-  
-    if (h2->ha_index_init(mrr_keyno, FALSE))
+    secondary_file->extra(HA_EXTRA_KEYREAD);
+    secondary_file->mrr_iter= primary_file->mrr_iter;
+
+    if ((res= secondary_file->ha_index_init(keyno, FALSE)))
       goto error;
 
-    use_default_impl= FALSE;
     if (pushed_cond)
-      h2->idx_cond_push(mrr_keyno, pushed_cond);
+      secondary_file->idx_cond_push(keyno, pushed_cond);
   }
   else
   {
+    DBUG_ASSERT(secondary_file && secondary_file->inited==handler::INDEX);
     /* 
       We get here when the access alternates betwen MRR scan(s) and non-MRR
       scans.
 
-      Calling h->index_end() will invoke dsmrr_close() for this object,
-      which will delete h2. We need to keep it, so save put it away and dont
+      Calling primary_file->index_end() will invoke dsmrr_close() for this object,
+      which will delete secondary_file. We need to keep it, so put it away and don't
       let it be deleted:
     */
-    handler *save_h2= h2;
-    h2= NULL;
-    int res= (h->inited == handler::INDEX && h->ha_index_end());
-    h2= save_h2;
-    use_default_impl= FALSE;
-    if (res)
+    if (primary_file->inited == handler::INDEX)
+    {
+      handler *save_h2= secondary_file;
+      Mrr_reader *save_strategy= strategy;
+      secondary_file= NULL;
+      strategy= NULL;
+      res= primary_file->ha_index_end();
+      secondary_file= save_h2;
+      strategy= save_strategy;
+      if (res)
+        goto error;
+    }
+    if ((primary_file->inited != handler::RND) && 
+        (res= primary_file->ha_rnd_init(FALSE)))
       goto error;
   }
+  DBUG_RETURN(0);
 
-  if (h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges,
-                                          mode, buf) || 
-      dsmrr_fill_buffer())
-  {
-    goto error;
-  }
-  /*
-    If the above call has scanned through all intervals in *seq, then
-    adjust *buf to indicate that the remaining buffer space will not be used.
-  */
-  if (dsmrr_eof) 
-    buf->end_of_used_area= rowids_buf_last;
+error:
+  DBUG_RETURN(res);
+}
 
-  /*
-     h->inited == INDEX may occur when 'range checked for each record' is
-     used.
-  */
-  if ((h->inited != handler::RND) && 
-      ((h->inited==handler::INDEX? h->ha_index_end(): FALSE) || 
-       (h->ha_rnd_init(FALSE))))
-      goto error;
 
-  use_default_impl= FALSE;
-  h->mrr_funcs= *seq_funcs;
-  
-  DBUG_RETURN(0);
-error:
-  h2->ha_index_or_rnd_end();
-  h2->ha_external_lock(current_thd, F_UNLCK);
-  h2->close();
-  delete h2;
-  h2= NULL;
-  DBUG_RETURN(1);
+void DsMrr_impl::close_second_handler()
+{
+  if (secondary_file)
+  {
+    secondary_file->ha_index_or_rnd_end();
+    secondary_file->ha_external_lock(current_thd, F_UNLCK);
+    secondary_file->ha_close();
+    delete secondary_file;
+    secondary_file= NULL;
+  }
 }
 
 
 void DsMrr_impl::dsmrr_close()
 {
   DBUG_ENTER("DsMrr_impl::dsmrr_close");
-  if (h2)
+  close_second_handler();
+  strategy= NULL;
+  DBUG_VOID_RETURN;
+}
+
+
+/* 
+  my_qsort2-compatible static member function to compare key tuples 
+*/
+
+int Mrr_ordered_index_reader::compare_keys(void* arg, uchar* key1_arg, 
+                                           uchar* key2_arg)
+{
+  Mrr_ordered_index_reader *reader= (Mrr_ordered_index_reader*)arg;
+  TABLE *table= reader->file->get_table();
+  KEY_PART_INFO *part= table->key_info[reader->file->active_index].key_part;
+  uchar *key1, *key2;
+   
+  if (reader->keypar.use_key_pointers)
   {
-    h2->ha_index_or_rnd_end();
-    h2->ha_external_lock(current_thd, F_UNLCK);
-    h2->close();
-    delete h2;
-    h2= NULL;
+    /* the buffer stores pointers to keys, get to the keys */
+    memcpy(&key1, key1_arg, sizeof(char*));
+    memcpy(&key2, key2_arg, sizeof(char*));
   }
-  use_default_impl= TRUE;
-  DBUG_VOID_RETURN;
+  else
+  {
+    key1= key1_arg;
+    key2= key2_arg;
+  }
+
+  return key_tuple_cmp(part, key1, key2, reader->keypar.key_tuple_length);
 }
 
 
-static int rowid_cmp(void *h, uchar *a, uchar *b)
+int Mrr_ordered_index_reader::compare_keys_reverse(void* arg, uchar* key1, 
+                                                   uchar* key2)
 {
-  return ((handler*)h)->cmp_ref(a, b);
+  return -compare_keys(arg, key1, key2);
 }
 
 
 /**
-  DS-MRR: Fill the buffer with rowids and sort it by rowid
+  Set the buffer space to be shared between rowid and key buffer
+
+  @return FALSE  ok 
+  @return TRUE   There is so little buffer space that we won't be able to use
+                 the strategy. 
+                 This happens when we don't have enough space for one rowid 
+                 element and one key element so this is mainly targeted at
+                 testing.
+*/
 
-  {This is an internal function of DiskSweep MRR implementation}
-  Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into 
-  buffer. When the buffer is full or scan is completed, sort the buffer by 
-  rowid and return.
+bool DsMrr_impl::setup_buffer_sharing(uint key_size_in_keybuf, 
+                                      key_part_map key_tuple_map)
+{
+  long key_buff_elem_size= key_size_in_keybuf + 
+                           (int)is_mrr_assoc * sizeof(range_id_t);
   
-  The function assumes that rowids buffer is empty when it is invoked. 
+  KEY *key_info= &primary_file->get_table()->key_info[keyno];
+  /* 
+    Ok if we got here we need to allocate one part of the buffer 
+    for keys and another part for rowids.
+  */
+  ulonglong rowid_buf_elem_size= primary_file->ref_length + 
+                                 (int)is_mrr_assoc * sizeof(range_id_t);
   
-  @param h  Table handler
+  /*
+    Use rec_per_key statistics as a basis to find out how many rowids 
+    we'll get for each key value.
+     TODO: what should be the default value to use when there is no 
+           statistics?
+  */
+  uint parts= my_count_bits(key_tuple_map);
+  ulong rpc;
+  ulonglong rowids_size= rowid_buf_elem_size;
+  if ((rpc= key_info->rec_per_key[parts - 1]))
+    rowids_size= rowid_buf_elem_size * rpc;
+
+  double fraction_for_rowids=
+    (ulonglong2double(rowids_size) / 
+     (ulonglong2double(rowids_size) + key_buff_elem_size));
+
+  ptrdiff_t bytes_for_rowids= 
+    (ptrdiff_t)floor(0.5 + fraction_for_rowids * (full_buf_end - full_buf));
+  
+  ptrdiff_t bytes_for_keys= (full_buf_end - full_buf) - bytes_for_rowids;
 
-  @retval 0      OK, the next portion of rowids is in the buffer,
-                 properly ordered
-  @retval other  Error
-*/
+  if (bytes_for_keys < key_buff_elem_size + 1)
+  {
+    ptrdiff_t add= key_buff_elem_size + 1 - bytes_for_keys;
+    bytes_for_keys= key_buff_elem_size + 1;
+    bytes_for_rowids -= add;
+  }
+
+  if (bytes_for_rowids < (ptrdiff_t)rowid_buf_elem_size + 1)
+  {
+    ptrdiff_t add= (ptrdiff_t)(rowid_buf_elem_size + 1 - bytes_for_rowids);
+    bytes_for_rowids= (ptrdiff_t)rowid_buf_elem_size + 1;
+    bytes_for_keys -= add;
+  }
+
+  rowid_buffer_end= full_buf + bytes_for_rowids;
+  rowid_buffer.set_buffer_space(full_buf, rowid_buffer_end);
+  key_buffer= &backward_key_buf;
+  key_buffer->set_buffer_space(rowid_buffer_end, full_buf_end); 
 
-int DsMrr_impl::dsmrr_fill_buffer()
+  if (!key_buffer->have_space_for(key_buff_elem_size) ||
+      !rowid_buffer.have_space_for((size_t)rowid_buf_elem_size))
+    return TRUE; /* Failed to provide minimum space for one of the buffers */
+
+  return FALSE;
+}
+
+
+void DsMrr_impl::do_nothing(void *dsmrr_arg)
 {
-  char *range_info;
-  int res;
-  DBUG_ENTER("DsMrr_impl::dsmrr_fill_buffer");
+  /* Do nothing */
+}
 
-  rowids_buf_cur= rowids_buf;
-  while ((rowids_buf_cur < rowids_buf_end) && 
-         !(res= h2->handler::multi_range_read_next(&range_info)))
-  {
-    KEY_MULTI_RANGE *curr_range= &h2->handler::mrr_cur_range;
-    if (h2->mrr_funcs.skip_index_tuple &&
-        h2->mrr_funcs.skip_index_tuple(h2->mrr_iter, curr_range->ptr))
-      continue;
-    
-    /* Put rowid, or {rowid, range_id} pair into the buffer */
-    h2->position(table->record[0]);
-    memcpy(rowids_buf_cur, h2->ref, h2->ref_length);
-    rowids_buf_cur += h2->ref_length;
 
-    if (is_mrr_assoc)
-    {
-      memcpy(rowids_buf_cur, &range_info, sizeof(void*));
-      rowids_buf_cur += sizeof(void*);
-    }
-  }
+void DsMrr_impl::reset_buffer_sizes(void *dsmrr_arg)
+{
+  DsMrr_impl *dsmrr= (DsMrr_impl*)dsmrr_arg;
+  dsmrr->rowid_buffer.set_buffer_space(dsmrr->full_buf, 
+                                       dsmrr->rowid_buffer_end);
+  dsmrr->key_buffer->set_buffer_space(dsmrr->rowid_buffer_end, 
+                                      dsmrr->full_buf_end);
+}
 
-  if (res && res != HA_ERR_END_OF_FILE)
-    DBUG_RETURN(res); 
-  dsmrr_eof= test(res == HA_ERR_END_OF_FILE);
 
-  /* Sort the buffer contents by rowid */
-  uint elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*);
-  uint n_rowids= (rowids_buf_cur - rowids_buf) / elem_size;
-  
-  my_qsort2(rowids_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp,
-            (void*)h);
-  rowids_buf_last= rowids_buf_cur;
-  rowids_buf_cur=  rowids_buf;
-  DBUG_RETURN(0);
+/*
+  Take unused space from the key buffer and give it to the rowid buffer
+*/
+
+void DsMrr_impl::redistribute_buffer_space(void *dsmrr_arg)
+{
+  DsMrr_impl *dsmrr= (DsMrr_impl*)dsmrr_arg;
+  uchar *unused_start, *unused_end;
+  dsmrr->key_buffer->remove_unused_space(&unused_start, &unused_end);
+  dsmrr->rowid_buffer.grow(unused_start, unused_end);
 }
 
 
-/**
-  DS-MRR implementation: multi_range_read_next() function
+/*
+  @brief Initialize the iterator
+  
+  @note
+  Initialize the iterator to produce matches for the key of the first element 
+  in owner_arg->key_buffer
+
+  @retval  0                    OK
+  @retval  HA_ERR_END_OF_FILE   Either the owner->key_buffer is empty or 
+                                no matches for the key we've tried (check
+                                key_buffer->is_empty() to tell these apart)
+  @retval  other code           Fatal error
 */
 
-int DsMrr_impl::dsmrr_next(char **range_info)
+int Key_value_records_iterator::init(Mrr_ordered_index_reader *owner_arg)
 {
   int res;
-  uchar *cur_range_info= 0;
-  uchar *rowid;
+  owner= owner_arg;
+
+  identical_key_it.init(owner->key_buffer);
+  owner->key_buffer->setup_reading(owner->keypar.key_size_in_keybuf,
+                                   owner->is_mrr_assoc ? sizeof(void*) : 0);
+
+  if (identical_key_it.read())
+    return HA_ERR_END_OF_FILE;
 
-  if (use_default_impl)
-    return h->handler::multi_range_read_next(range_info);
+  uchar *key_in_buf= last_identical_key_ptr= identical_key_it.read_ptr1;
+
+  uchar *index_tuple= key_in_buf;
+  if (owner->keypar.use_key_pointers)
+    memcpy(&index_tuple, key_in_buf, sizeof(char*));
   
-  do
+  /* Check out how many more identical keys are following */
+  while (!identical_key_it.read())
+  {
+    if (Mrr_ordered_index_reader::compare_keys(owner, key_in_buf, 
+                                               identical_key_it.read_ptr1))
+      break;
+    last_identical_key_ptr= identical_key_it.read_ptr1;
+  }
+  identical_key_it.init(owner->key_buffer);
+  res= owner->file->ha_index_read_map(owner->file->get_table()->record[0], 
+                                      index_tuple, 
+                                      owner->keypar.key_tuple_map, 
+                                      HA_READ_KEY_EXACT);
+
+  if (res)
+  {
+    /* Failed to find any matching records */
+    move_to_next_key_value();
+    return res;
+  }
+  owner->have_saved_rowid= FALSE;
+  get_next_row= FALSE;
+  return 0;
+}
+
+
+int Key_value_records_iterator::get_next(range_id_t *range_info)
+{
+  int res;
+
+  if (get_next_row)
   {
-    if (rowids_buf_cur == rowids_buf_last)
+    if (owner->keypar.index_ranges_unique)
     {
-      if (dsmrr_eof)
-      {
-        res= HA_ERR_END_OF_FILE;
-        goto end;
-      }
-      res= dsmrr_fill_buffer();
-      if (res)
-        goto end;
+      /* We're using a full unique key, no point to call index_next_same */
+      return HA_ERR_END_OF_FILE;
     }
-   
-    /* return eof if there are no rowids in the buffer after re-fill attempt */
-    if (rowids_buf_cur == rowids_buf_last)
+    
+    handler *h= owner->file;
+    if ((res= h->ha_index_next_same(h->get_table()->record[0], 
+                                    identical_key_it.read_ptr1, 
+                                    owner->keypar.key_tuple_length)))
     {
-      res= HA_ERR_END_OF_FILE;
-      goto end;
+      /* It's either HA_ERR_END_OF_FILE or some other error */
+      return res; 
     }
-    rowid= rowids_buf_cur;
+    identical_key_it.init(owner->key_buffer);
+    owner->have_saved_rowid= FALSE;
+    get_next_row= FALSE;
+  }
 
-    if (is_mrr_assoc)
-      memcpy(&cur_range_info, rowids_buf_cur + h->ref_length, sizeof(uchar**));
+  identical_key_it.read(); /* This gets us next range_id */
+  memcpy(range_info, identical_key_it.read_ptr2, sizeof(range_id_t));
 
-    rowids_buf_cur += h->ref_length + sizeof(void*) * test(is_mrr_assoc);
-    if (h2->mrr_funcs.skip_record &&
-	h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid))
-      continue;
-    res= h->ha_rnd_pos(table->record[0], rowid);
-    break;
-  } while (true);
- 
-  if (is_mrr_assoc)
+  if (!last_identical_key_ptr || 
+      (identical_key_it.read_ptr1 == last_identical_key_ptr))
   {
-    memcpy(range_info, rowid + h->ref_length, sizeof(void*));
+    /* 
+      We've reached the last of the identical keys that current record is a
+      match for.  Set get_next_row=TRUE so that we read the next index record
+      on the next call to this function.
+    */
+    get_next_row= TRUE;
+  }
+  return 0;
+}
+
+
+void Key_value_records_iterator::move_to_next_key_value()
+{
+  while (!owner->key_buffer->read() && 
+         (owner->key_buffer->read_ptr1 != last_identical_key_ptr)) {}
+}
+
+
+/**
+  DS-MRR implementation: multi_range_read_next() function.
+
+  The calling convention is the same as for multi_range_read_next().
+*/
+
+int DsMrr_impl::dsmrr_next(range_id_t *range_info)
+{
+  int res;
+  if (strategy_exhausted)
+    return HA_ERR_END_OF_FILE;
+
+  while ((res= strategy->get_next(range_info)) == HA_ERR_END_OF_FILE)
+  {
+    if ((res= strategy->refill_buffer(FALSE)))
+      break; /* EOF or error */
   }
-end:
   return res;
 }
 
@@ -582,7 +1375,8 @@ end:
 /**
   DS-MRR implementation: multi_range_read_info() function
 */
-ha_rows DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows,
+ha_rows DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows, 
+                               uint key_parts,
                                uint *bufsz, uint *flags, COST_VECT *cost)
 {  
   ha_rows res;
@@ -590,12 +1384,13 @@ ha_rows DsMrr_impl::dsmrr_info(uint keyno, uint n_ranges, uint rows,
   uint def_bufsz= *bufsz;
 
   /* Get cost/flags/mem_usage of default MRR implementation */
-  res= h->handler::multi_range_read_info(keyno, n_ranges, rows, &def_bufsz,
-                                         &def_flags, cost);
+  res= primary_file->handler::multi_range_read_info(keyno, n_ranges, rows,
+                                                    key_parts, &def_bufsz, 
+                                                    &def_flags, cost);
   DBUG_ASSERT(!res);
 
   if ((*flags & HA_MRR_USE_DEFAULT_IMPL) || 
-      choose_mrr_impl(keyno, rows, &def_flags, &def_bufsz, cost))
+      choose_mrr_impl(keyno, rows, flags, bufsz, cost))
   {
     /* Default implementation is choosen */
     DBUG_PRINT("info", ("Default MRR implementation choosen"));
@@ -623,9 +1418,11 @@ ha_rows DsMrr_impl::dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq,
   uint def_flags= *flags;
   uint def_bufsz= *bufsz;
   /* Get cost/flags/mem_usage of default MRR implementation */
-  rows= h->handler::multi_range_read_info_const(keyno, seq, seq_init_param,
-                                                n_ranges, &def_bufsz, 
-                                                &def_flags, cost);
+  rows= primary_file->handler::multi_range_read_info_const(keyno, seq, 
+                                                           seq_init_param,
+                                                           n_ranges, 
+                                                           &def_bufsz, 
+                                                           &def_flags, cost);
   if (rows == HA_POS_ERROR)
   {
     /* Default implementation can't perform MRR scan => we can't either */
@@ -635,7 +1432,7 @@ ha_rows DsMrr_impl::dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq,
   /*
     If HA_MRR_USE_DEFAULT_IMPL has been passed to us, that is an order to
     use the default MRR implementation (we need it for UPDATE/DELETE).
-    Otherwise, make a choice based on cost and @@optimizer_use_mrr.
+    Otherwise, make a choice based on cost and @@optimizer_switch settings
   */
   if ((*flags & HA_MRR_USE_DEFAULT_IMPL) ||
       choose_mrr_impl(keyno, rows, flags, bufsz, cost))
@@ -683,7 +1480,28 @@ bool key_uses_partial_cols(TABLE *table, uint keyno)
   return FALSE;
 }
 
-/**
+
+/*
+  Check if key/flags allow DS-MRR/CPK strategy to be used
+  
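+  @param thd        Thread handle (its optimizer switch flags are checked)
+  @param keyno      Index that will be used
+  @param mrr_flags  MRR mode flags (HA_MRR_SINGLE_POINT must be set for CPK)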
+  
+  @retval TRUE   DS-MRR/CPK should be used
+  @retval FALSE  Otherwise
+*/
+
+bool DsMrr_impl::check_cpk_scan(THD *thd, uint keyno, uint mrr_flags)
+{
+  return test((mrr_flags & HA_MRR_SINGLE_POINT) &&
+              keyno == table->s->primary_key && 
+              primary_file->primary_key_is_clustered() && 
+              optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS));
+}
+
+
+/*
   DS-MRR Internals: Choose between Default MRR implementation and DS-MRR
 
   Make the choice between using Default MRR implementation and DS-MRR.
@@ -706,22 +1524,29 @@ bool key_uses_partial_cols(TABLE *table, uint keyno)
   @retval FALSE  DS-MRR implementation should be used
 */
 
+
 bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
                                  uint *bufsz, COST_VECT *cost)
 {
   COST_VECT dsmrr_cost;
   bool res;
   THD *thd= current_thd;
-  if (thd->variables.optimizer_use_mrr == 2 || *flags & HA_MRR_INDEX_ONLY ||
-      (keyno == table->s->primary_key && h->primary_key_is_clustered()) ||
-       key_uses_partial_cols(table, keyno))
+
+  bool doing_cpk_scan= check_cpk_scan(thd, keyno, *flags); 
+  bool using_cpk= test(keyno == table->s->primary_key &&
+                       primary_file->primary_key_is_clustered());
+  *flags &= ~HA_MRR_IMPLEMENTATION_FLAGS;
+  if (!optimizer_flag(thd, OPTIMIZER_SWITCH_MRR) ||
+      *flags & HA_MRR_INDEX_ONLY ||
+      (using_cpk && !doing_cpk_scan) || key_uses_partial_cols(table, keyno))
   {
     /* Use the default implementation */
     *flags |= HA_MRR_USE_DEFAULT_IMPL;
+    *flags &= ~HA_MRR_IMPLEMENTATION_FLAGS;
     return TRUE;
   }
-  
-  uint add_len= table->key_info[keyno].key_length + h->ref_length; 
+
+  uint add_len= table->key_info[keyno].key_length + primary_file->ref_length; 
   *bufsz -= add_len;
   if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))
     return TRUE;
@@ -729,12 +1554,12 @@ bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
   
   bool force_dsmrr;
   /* 
-    If @@optimizer_use_mrr==force, then set cost of DS-MRR to be minimum of
+    If mrr_cost_based flag is not set, then set cost of DS-MRR to be minimum of
     DS-MRR and Default implementations cost. This allows one to force use of
     DS-MRR whenever it is applicable without affecting other cost-based
     choices.
   */
-  if ((force_dsmrr= (thd->variables.optimizer_use_mrr == 1)) &&
+  if ((force_dsmrr= !optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_COST_BASED)) &&
       dsmrr_cost.total_cost() > cost->total_cost())
     dsmrr_cost= *cost;
 
@@ -744,6 +1569,25 @@ bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
     *flags &= ~HA_MRR_SORTED;          /* We will return unordered output */
     *cost= dsmrr_cost;
     res= FALSE;
+
+
+    if ((using_cpk && doing_cpk_scan) ||
+        (optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS) &&
+         *flags & HA_MRR_SINGLE_POINT))
+    {
+      *flags |= DSMRR_IMPL_SORT_KEYS;
+    }
+    
+    if (!(using_cpk && doing_cpk_scan) &&
+        !(*flags & HA_MRR_INDEX_ONLY))
+    {
+      *flags |= DSMRR_IMPL_SORT_ROWIDS;
+    }
+    /*
+    if ((*flags & HA_MRR_SINGLE_POINT) && 
+         optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS))
+      *flags |= HA_MRR_MATERIALIZED_KEYS;
+    */
   }
   else
   {
@@ -753,6 +1597,38 @@ bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
   return res;
 }
 
+/*
+  Write the name of the DS-MRR strategy encoded in mrr_mode ("Key-ordered
+  scan", "Rowid-ordered scan" or "Key-ordered Rowid-ordered scan") to str.
+  At most 'size' bytes are written and the text is not NUL-terminated.
+  Returns the number of bytes written, 0 for the default MRR implementation.
+*/
+
+int DsMrr_impl::dsmrr_explain_info(uint mrr_mode, char *str, size_t size)
+{
+  const char *key_ordered=   "Key-ordered scan";
+  const char *rowid_ordered= "Rowid-ordered scan";
+  const char *both_ordered=  "Key-ordered Rowid-ordered scan";
+  const char *used_str="";
+  const uint BOTH_FLAGS= (DSMRR_IMPL_SORT_KEYS | DSMRR_IMPL_SORT_ROWIDS);
+
+  if (!(mrr_mode & HA_MRR_USE_DEFAULT_IMPL))
+  {
+    if ((mrr_mode & BOTH_FLAGS) == BOTH_FLAGS)
+      used_str= both_ordered;
+    else if (mrr_mode & DSMRR_IMPL_SORT_KEYS)
+      used_str= key_ordered;
+    else if (mrr_mode & DSMRR_IMPL_SORT_ROWIDS)
+      used_str= rowid_ordered;
+    /* Copy only what fits in str AND exists in used_str (never 'size') */
+    uint used_str_len= strlen(used_str);
+    uint copy_len= min(used_str_len, size);
+    memcpy(str, used_str, copy_len);
+    return copy_len;
+  }
+  return 0;
+}
+
 
 static void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, COST_VECT *cost);
 
@@ -779,7 +1655,8 @@ bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
   uint n_full_steps;
   double index_read_cost;
 
-  elem_size= h->ref_length + sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION));
+  elem_size= primary_file->ref_length + 
+             sizeof(void*) * (!test(flags & HA_MRR_NO_ASSOCIATION));
   max_buff_entries = *buffer_size / elem_size;
 
   if (!max_buff_entries)
@@ -807,7 +1684,7 @@ bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
     cost->zero();
     *buffer_size= max(*buffer_size, 
                       (size_t)(1.2*rows_in_last_step) * elem_size + 
-                      h->ref_length + table->key_info[keynr].key_length);
+                      primary_file->ref_length + table->key_info[keynr].key_length);
   }
   
   COST_VECT last_step_cost;
@@ -820,7 +1697,7 @@ bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
     cost->mem_cost= (double)rows_in_last_step * elem_size;
   
   /* Total cost of all index accesses */
-  index_read_cost= h->keyread_time(keynr, 1, (double)rows);
+  index_read_cost= primary_file->keyread_time(keynr, 1, rows);
   cost->add_io(index_read_cost, 1 /* Random seeks */);
   return FALSE;
 }
@@ -828,17 +1705,14 @@ bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
 
 /* 
   Get cost of one sort-and-sweep step
+  
+  It consists of two parts:
+   - sort an array of #nrows ROWIDs using qsort
+   - read #nrows records from table in a sweep.
 
-  SYNOPSIS
-    get_sort_and_sweep_cost()
-      table       Table being accessed
-      nrows       Number of rows to be sorted and retrieved
-      cost   OUT  The cost
-
-  DESCRIPTION
-    Get cost of these operations:
-     - sort an array of #nrows ROWIDs using qsort
-     - read #nrows records from table in a sweep.
+  @param table       Table being accessed
+  @param nrows       Number of rows to be sorted and retrieved
+  @param cost   OUT  The cost of scan
 */
 
 static 
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 90e2e4c93d6..1b72e71944d 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -1,70 +1,637 @@
-/*
-  This file contains declarations for 
-   - Disk-Sweep MultiRangeRead (DS-MRR) implementation
+/**
+  @defgroup DS-MRR declarations
+  @{
 */
 
 /**
-  A Disk-Sweep MRR interface implementation
+  A Disk-Sweep implementation of MRR Interface (DS-MRR for short)
+
+  This is a "plugin"(*) for storage engines that allows to
+    1. When doing index scans, read table rows in rowid order;
+    2. when making many index lookups, do them in key order and don't
+       lookup the same key value multiple times;
+    3. Do both #1 and #2, when applicable.
+  These changes are expected to speed up query execution for disk-based 
+  storage engines running io-bound loads and "big" queries (ie. queries that
+  do joins and enumerate lots of records).
+
+  (*) - only conceptually. No dynamic loading or binary compatibility of any
+        kind.
+
+  General scheme of things:
+   
+      SQL Layer code
+       |   |   |
+       v   v   v 
+      -|---|---|---- handler->multi_range_read_XXX() function calls
+       |   |   |
+      _____________________________________
+     / DS-MRR module                       \
+     | (order/de-duplicate lookup keys,    |
+     | scan indexes in key order,          |
+     | order/de-duplicate rowids,          |
+     | retrieve full record reads in rowid |
+     | order)                              |
+     \_____________________________________/
+       |   |   |
+      -|---|---|----- handler->read_range_first()/read_range_next(), 
+       |   |   |      handler->index_read(), handler->rnd_pos() calls.
+       |   |   |
+       v   v   v
+      Storage engine internals
+
+
+  Currently DS-MRR is used by MyISAM, InnoDB/XtraDB and Maria storage engines.
+  Potentially it can be used with any table handler that has disk-based data
+  storage and has better performance when reading data in rowid order.
+*/
+
+#include "sql_lifo_buffer.h"
+
+class DsMrr_impl;
+class Mrr_ordered_index_reader;
+
+
+/* A structure with key parameters that's shared among several classes */
+class Key_parameters
+{
+public:
+  uint         key_tuple_length; /* Length of index lookup tuple, in bytes */
+  key_part_map key_tuple_map;    /* keyparts used in index lookup tuples */
+
+  /*
+    This is 
+      = key_tuple_length   if we copy keys to buffer
+      = sizeof(void*)      if we're using pointers to materialized keys.
+  */
+  uint key_size_in_keybuf;
+
+  /* TRUE <=> don't copy key values, use pointers to them instead.  */
+  bool use_key_pointers;
 
-  This implementation makes range (and, in the future, 'ref') scans to read
-  table rows in disk sweeps. 
+  /* TRUE <=> We can get at most one index tuple for a lookup key */
+  bool index_ranges_unique;
+};
+
+
+/**
+  A class to enumerate (record, range_id) pairs that match given key value.
   
-  Currently it is used by MyISAM and InnoDB. Potentially it can be used with
-  any table handler that has non-clustered indexes and on-disk rows.
+  @note
+
+  The idea is that we have a Lifo_buffer which holds (key, range_id) pairs
+  ordered by key value. From the front of the buffer we see
+
+    (key_val1, range_id1), (key_val1, range_id2) ... (key_val2, range_idN)
+
+  we take the first elements that have the same key value (key_val1 in the
+  example above), and make lookup into the table.  The table will have 
+  multiple matches for key_val1:
+ 
+                  == Table Index ==
+                   ...
+     key_val1 ->  key_val1, index_tuple1
+                  key_val1, index_tuple2
+                   ...
+                  key_val1, index_tupleN
+                   ...
+  
+  Our goal is to produce all possible combinations, i.e. we need:
+  
+    {(key_val1, index_tuple1), range_id1}
+    {(key_val1, index_tuple1), range_id2}
+       ...           ...               |
+    {(key_val1, index_tuple1), range_idN},
+                  
+    {(key_val1, index_tuple2), range_id1}
+    {(key_val1, index_tuple2), range_id2}
+        ...          ...               |
+    {(key_val1, index_tuple2), range_idN},
+
+        ...          ...          ...                          
+
+    {(key_val1, index_tupleK), range_idN}
 */
 
-class DsMrr_impl
+class Key_value_records_iterator
 {
+  /* Use this to get table handler, key buffer and other parameters */
+  Mrr_ordered_index_reader *owner;
+
+  /* Iterator to get (key, range_id) pairs from */
+  Lifo_buffer_iterator identical_key_it;
+  
+  /* 
+    Last of the identical key values (when we get this pointer from
+    identical_key_it, it will be time to stop).
+  */
+  uchar *last_identical_key_ptr;
+
+  /*
+    FALSE <=> we're right after the init() call, the record has been already
+    read with owner->file->index_read_map() call
+  */
+  bool get_next_row;
+  
 public:
-  typedef void (handler::*range_check_toggle_func_t)(bool on);
+  int init(Mrr_ordered_index_reader *owner_arg);
+  int get_next(range_id_t *range_info);
+  void move_to_next_key_value();
+};
 
-  DsMrr_impl()
-    : h2(NULL) {};
+
+/*
+  Buffer manager interface. Mrr_reader objects use it to ask DsMrr_impl
+  to manage buffer space for them.
+*/
+typedef struct st_buffer_manager
+{
+public:
+  /* Opaque value to be passed as the first argument to all member functions */
+  void *arg;
   
   /*
-    The "owner" handler object (the one that calls dsmrr_XXX functions.
-    It is used to retrieve full table rows by calling rnd_pos().
+    This is called when we've freed more space from the key buffer. The
+    callee will get the unused space from the key buffer and give it to the
+    rowid buffer.
+  */
+  void (*redistribute_buffer_space)(void *arg);
+
+  /* 
+    This is called when both key and rowid buffers are empty, and so it's time 
+    to reset them to their original size (They've lost their original size,
+    because we were dynamically growing rowid buffer and shrinking key buffer).
   */
-  handler *h;
-  TABLE *table; /* Always equal to h->table */
+  void (*reset_buffer_sizes)(void *arg);
+
+} Buffer_manager;
+
+
+/* 
+  Mrr_reader - DS-MRR execution strategy abstraction
+
+  A reader produces ([index]_record, range_info) pairs, and requires periodic
+  refill operations.
+
+  - one starts using the reader by calling reader->get_next(),
+  - when a get_next() call returns HA_ERR_END_OF_FILE, one must call 
+    refill_buffer() before they can make more get_next() calls.
+  - when refill_buffer() returns HA_ERR_END_OF_FILE, this means the real
+    end of stream and get_next() should not be called anymore.
+
+  Both functions can return other error codes, these mean unrecoverable errors
+  after which one cannot continue.
+*/
+
+class Mrr_reader 
+{
+public:
+  virtual int get_next(range_id_t *range_info) = 0;
+  virtual int refill_buffer(bool initial) = 0;
+  virtual ~Mrr_reader() {}; /* just to remove compiler warning */
+};
+
+
+/* 
+  A common base for readers that do index scans and produce index tuples 
+*/
+
+class Mrr_index_reader : public Mrr_reader
+{
+protected:
+  handler *file; /* Handler object to use */
+public:
+  virtual int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 
+                   void *seq_init_param, uint n_ranges,
+                   uint mode, Key_parameters *key_par, 
+                   Lifo_buffer *key_buffer, 
+                   Buffer_manager *buf_manager_arg) = 0;
+
+  /* Get pointer to place where every get_next() call will put rowid */
+  virtual uchar *get_rowid_ptr() = 0;
+  /* Get the rowid (call this after get_next() call) */
+  virtual void position();
+  virtual bool skip_record(range_id_t range_id, uchar *rowid) = 0;
+
+  virtual void interrupt_read() {}
+  virtual void resume_read() {}
+};
+
+
+/*
+  A "bypass" index reader that just does an index scan. The index scan is done
+  by calling default MRR implementation (i.e.  handler::multi_range_read_XXX())
+  functions.
+*/
+
+class Mrr_simple_index_reader : public Mrr_index_reader
+{
+public:
+  int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
+           void *seq_init_param, uint n_ranges,
+           uint mode, Key_parameters *key_par,
+           Lifo_buffer *key_buffer,
+           Buffer_manager *buf_manager_arg);
+  int get_next(range_id_t *range_info);
+  int refill_buffer(bool initial) { return initial? 0: HA_ERR_END_OF_FILE; }
+  uchar *get_rowid_ptr() { return file->ref; }
+  bool skip_record(range_id_t range_id, uchar *rowid)
+  {
+    return (file->mrr_funcs.skip_record &&
+            file->mrr_funcs.skip_record(file->mrr_iter, range_id, rowid));
+  }
+};
+
+
+/* 
+  A reader that sorts the key values before it makes the index lookups.
+*/
+
+class Mrr_ordered_index_reader : public Mrr_index_reader
+{
+public:
+  int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 
+           void *seq_init_param, uint n_ranges,
+           uint mode, Key_parameters *key_par,
+           Lifo_buffer *key_buffer,
+           Buffer_manager *buf_manager_arg);
+  int get_next(range_id_t *range_info);
+  int refill_buffer(bool initial);
+  uchar *get_rowid_ptr() { return file->ref; }
+  
+  bool skip_record(range_id_t range_info, uchar *rowid)
+  {
+    return (mrr_funcs.skip_record &&
+            mrr_funcs.skip_record(mrr_iter, range_info, rowid));
+  }
+
+  bool skip_index_tuple(range_id_t range_info)
+  {
+    return (mrr_funcs.skip_index_tuple &&
+            mrr_funcs.skip_index_tuple(mrr_iter, range_info));
+  }
+  
+  bool set_interruption_temp_buffer(uint rowid_length, uint key_len, 
+                                    uint saved_pk_len,
+                                    uchar **space_start, uchar *space_end);
+  void set_no_interruption_temp_buffer();
+
+  void interrupt_read();
+  void resume_read();
+  void position();
 private:
-  /* Secondary handler object.  It is used for scanning the index */
-  handler *h2;
+  Key_value_records_iterator kv_it;
+
+  bool scanning_key_val_iter;
+  
+  /* Buffer to store (key, range_id) pairs */
+  Lifo_buffer *key_buffer;
+  
+  /* This manages key buffer allocation and sizing for us */
+  Buffer_manager *buf_manager;
 
-  /* Buffer to store rowids, or (rowid, range_id) pairs */
-  uchar *rowids_buf;
-  uchar *rowids_buf_cur;   /* Current position when reading/writing */
-  uchar *rowids_buf_last;  /* When reading: end of used buffer space */
-  uchar *rowids_buf_end;   /* End of the buffer */
+  Key_parameters  keypar; /* index scan and lookup tuple parameters */
 
-  bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */
+  /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */
+  bool is_mrr_assoc;
+  
+  /* Range sequence iteration members */
+  RANGE_SEQ_IF mrr_funcs;
+  range_seq_t mrr_iter;
+  
+  /* TRUE == reached eof when enumerating ranges */
+  bool source_exhausted;
+   
+  /* 
+    Following members are for interrupt_read()/resume_read(). The idea is that 
+    in some cases index scan that is done by this object is interrupted by
+    rnd_pos() calls made by Mrr_ordered_rndpos_reader. The problem is that
+    we're sharing handler->record[0] with that object, and it destroys its
+    contents.
+    We need to save/restore our current
+    - index tuple (for pushed index condition checks)
+    - clustered primary key values (again, for pushed index condition checks)
+    - rowid of the last record we've retrieved (in case this rowid matches
+      multiple ranges and we'll need to return it again)
+  */ 
+  bool support_scan_interruptions;
+  /* Space where we save the rowid of the last record we've returned */
+  uchar *saved_rowid;
+  
+  /* TRUE <=> saved_rowid has the last saved rowid */
+  bool have_saved_rowid;
+  
+  uchar *saved_key_tuple; /* Saved current key tuple */
+  uchar *saved_primary_key; /* Saved current primary key tuple */
+
+  static int compare_keys(void* arg, uchar* key1, uchar* key2);
+  static int compare_keys_reverse(void* arg, uchar* key1, uchar* key2);
+  
+  friend class Key_value_records_iterator; 
+  friend class DsMrr_impl;
+  friend class Mrr_ordered_rndpos_reader;
+};
+
+
+/* 
+  A reader that gets rowids from an Mrr_index_reader, and then sorts them 
+  before getting full records with handler->rnd_pos() calls.
+*/
+
+class Mrr_ordered_rndpos_reader : public Mrr_reader 
+{
+public:
+  int init(handler *file, Mrr_index_reader *index_reader, uint mode,
+           Lifo_buffer *buf);
+  int get_next(range_id_t *range_info);
+  int refill_buffer(bool initial);
+private:
+  handler *file; /* Handler to use */
+  
+  /* This is what we get (rowid, range_info) pairs from */
+  Mrr_index_reader *index_reader;
+
+  /* index_reader->get_next() puts rowid here */
+  uchar *index_rowid;
+  
+  /* TRUE <=> index_reader->refill_buffer() call has returned EOF */
+  bool index_reader_exhausted;
+  
+  /* 
+    TRUE <=> We should call index_reader->refill_buffer(). This happens if
+    1. we've made index_reader->get_next() call which returned EOF
+    2. we haven't made any index_reader calls (and our first call should 
+       be index_reader->refill_buffer(initial=TRUE))
+  */
+  bool index_reader_needs_refill;
 
-  /* TRUE <=> need range association, buffer holds {rowid, range_id} pairs */
+  /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */
   bool is_mrr_assoc;
+  
+  /* 
+    When reading from ordered rowid buffer: the rowid element of the last
+    buffer element that has rowid identical to this one.
+  */
+  uchar *last_identical_rowid;
+
+  /* Buffer to store (rowid, range_id) pairs */
+  Lifo_buffer *rowid_buffer;
+  
+  int refill_from_index_reader();
+};
+
+
+/*
+  A primitive "factory" of various Mrr_*_reader classes (the point is to 
+  get various kinds of readers without having to allocate them on the heap)
+*/
+
+class Mrr_reader_factory
+{
+public:
+  Mrr_ordered_rndpos_reader ordered_rndpos_reader; /* sorts rowids, then reads full records */
+  Mrr_ordered_index_reader  ordered_index_reader;  /* sorts keys before index lookups */
+  Mrr_simple_index_reader   simple_index_reader;
+};
+
+
+#define DSMRR_IMPL_SORT_KEYS   HA_MRR_IMPLEMENTATION_FLAG1
+#define DSMRR_IMPL_SORT_ROWIDS HA_MRR_IMPLEMENTATION_FLAG2
+
+/*
+  DS-MRR implementation for one table. Create/use one object of this class for
+  each ha_{myisam/innobase/etc} object. That object will be further referred to
+  as "the handler"
+
+  DsMrr_impl supports the following execution strategies:
+
+  - Bypass DS-MRR, pass all calls to default MRR implementation, which is 
+    an MRR-to-non-MRR call converter.
+  - Key-Ordered Retrieval
+  - Rowid-Ordered Retrieval
+
+  DsMrr_impl will use one of the above strategies, or a combination of them, 
+  according to the following diagram:
+
+         (mrr function calls)
+                |
+                +----------------->-----------------+
+                |                                   |
+     ___________v______________      _______________v________________
+    / default: use lookup keys \    / KEY-ORDERED RETRIEVAL:         \
+    | (or ranges) in whatever  |    | sort lookup keys and then make | 
+    | order they are supplied  |    | index lookups in index order   |
+    \__________________________/    \________________________________/
+              | |  |                           |    |
+      +---<---+ |  +--------------->-----------|----+
+      |         |                              |    |
+      |         |              +---------------+    |
+      |   ______v___ ______    |     _______________v_______________
+      |  / default: read   \   |    / ROWID-ORDERED RETRIEVAL:      \
+      |  | table records   |   |    | Before reading table records, |
+      v  | in random order |   v    | sort their rowids and then    |
+      |  \_________________/   |    | read them in rowid order      |
+      |         |              |    \_______________________________/
+      |         |              |                    |
+      |         |              |                    |
+      +-->---+  |  +----<------+-----------<--------+
+             |  |  |                                
+             v  v  v
+      (table records and range_ids)
+
+  The choice of strategy depends on MRR scan properties, table properties
+  (whether we're scanning clustered primary key), and @@optimizer_switch
+  settings.
+  
+  Key-Ordered Retrieval
+  ---------------------
+  The idea is: if MRR scan is essentially a series of lookups on 
+   
+    tbl.key=value1 OR tbl.key=value2 OR ... OR tbl.key=valueN
+  
+  then it makes sense to collect and order the set of lookup values, i.e.
+   
+     sort(value1, value2, .. valueN)
+
+  and then do index lookups in index order. This results in fewer index page
+  fetch operations, and we also can avoid making multiple index lookups for the
+  same value. That is, if value1=valueN we can easily discover that after
+  sorting and make one index lookup for them instead of two.
+
+  Rowid-Ordered Retrieval
+  -----------------------
+  If we do a regular index scan or a series of index lookups, we'll be hitting
+  table records at random. For disk-based engines, this is much slower than 
+  reading the same records in disk order. We assume that disk ordering of
+  rows is the same as ordering of their rowids (which is provided by 
+  handler::cmp_ref())
+  In order to retrieve records in different order, we must separate index
+  scanning and record fetching, that is, MRR scan uses the following steps:
+
+    1. Scan the index (and only index, that is, with HA_EXTRA_KEYREAD on) and 
+        fill a buffer with {rowid, range_id} pairs
+    2. Sort the buffer by rowid value
+    3. for each {rowid, range_id} pair in the buffer
+         get record by rowid and return the {record, range_id} pair
+    4. Repeat the above steps until we've exhausted the list of ranges we're
+       scanning.
 
-  bool use_default_impl; /* TRUE <=> shortcut all calls to default MRR impl */
+  Buffer space management considerations
+  --------------------------------------
+  With regards to buffer/memory management, MRR interface specifies that 
+   - SQL layer provides multi_range_read_init() with buffer of certain size.
+   - MRR implementation may use (i.e. have at its disposal till the end of 
+     the MRR scan) all of the buffer, or return the unused end of the buffer 
+     to SQL layer.
+
+  DS-MRR needs buffer in order to accumulate and sort rowids and/or keys. When
+  we need to accumulate/sort only keys (or only rowids), it is fairly trivial.
+
+  When we need to accumulate/sort both keys and rowids, efficient buffer use
+  gets complicated. We need to:
+   - First, accumulate keys and sort them
+   - Then use the keys (smaller values go first) to obtain rowids. A key is not
+     needed after we've got matching rowids for it.
+   - Make sure that rowids are accumulated at the front of the buffer, so that we
+     can return the end part of the buffer to SQL layer, should there be too
+     few rowid values to occupy the buffer.
+
+  All of these goals are achieved by using the following scheme:
+
+     |                    |   We get an empty buffer from SQL layer.   
+
+     |                  *-|    
+     |               *----|   First, we fill the buffer with keys. Key_buffer
+     |            *-------|   part grows from end of the buffer space to start
+     |         *----------|   (In this picture, the buffer is big enough to
+     |      *-------------|    accommodate all keys and even have some space left)
+
+     |      *=============|   We want to do key-ordered index scan, so we sort
+                              the keys
+
+     |-x      *===========|   Then we use the keys to get rowids. Rowids are
+     |----x      *========|   stored from start of buffer space towards the end.
+     |--------x     *=====|   The part of the buffer occupied with keys
+     |------------x   *===|   gradually frees up space for rowids. In this
+     |--------------x   *=|   picture we run out of keys before we've run out
+     |----------------x   |   of buffer space (it can be other way as well).
+
+     |================x   |   Then we sort the rowids.
+                     
+     |                |~~~|   The unused part of the buffer is at the end, so
+                              we can return it to the SQL layer.
+
+     |================*       Sorted rowids are then used to read table records 
+                              in disk order
+
+*/
+
+class DsMrr_impl
+{
 public:
+  typedef void (handler::*range_check_toggle_func_t)(bool on);
+
+  DsMrr_impl()
+    : secondary_file(NULL) {};
+  
   void init(handler *h_arg, TABLE *table_arg)
   {
-    h= h_arg; 
+    primary_file= h_arg; 
     table= table_arg;
   }
-  int dsmrr_init(handler *h, RANGE_SEQ_IF *seq_funcs, void *seq_init_param, 
-                 uint n_ranges, uint mode, HANDLER_BUFFER *buf);
+  int dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, 
+                 void *seq_init_param, uint n_ranges, uint mode, 
+                 HANDLER_BUFFER *buf);
   void dsmrr_close();
-  int dsmrr_fill_buffer();
-  int dsmrr_next(char **range_info);
+  int dsmrr_next(range_id_t *range_info);
 
-  ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint *bufsz,
-                     uint *flags, COST_VECT *cost);
+  ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint key_parts, 
+                     uint *bufsz, uint *flags, COST_VECT *cost);
 
   ha_rows dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq, 
                             void *seq_init_param, uint n_ranges, uint *bufsz,
                             uint *flags, COST_VECT *cost);
+
+  int dsmrr_explain_info(uint mrr_mode, char *str, size_t size);
 private:
+  /* Buffer to store (key, range_id) pairs */
+  Lifo_buffer *key_buffer;
+
+  /*
+    The "owner" handler object (the one that is expected to "own" this object
+    and call its functions).
+  */
+  handler *primary_file;
+  TABLE *table; /* Always equal to primary_file->table */
+
+  /*
+    Secondary handler object. (created when needed, we need it when we need 
+    to run both index scan and rnd_pos() scan at the same time)
+  */
+  handler *secondary_file;
+  
+  uint keyno; /* index we're running the scan on */
+  /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */
+  bool is_mrr_assoc;
+
+  Mrr_reader_factory reader_factory;
+
+  Mrr_reader *strategy;
+  bool strategy_exhausted;
+
+  Mrr_index_reader *index_strategy;
+
+  /* The whole buffer space that we're using */
+  uchar *full_buf;
+  uchar *full_buf_end;
+  
+  /* 
+    When using both rowid and key buffers: the boundary between key and rowid
+    parts of the buffer. This is the "original" value, actual memory ranges 
+    used by key and rowid parts may be different because of dynamic space 
+    reallocation between them.
+  */
+  uchar *rowid_buffer_end;
+ 
+  /*
+    One of the following two is used for key buffer: forward is used when 
+    we only need key buffer, backward is used when we need both key and rowid
+    buffers.
+  */
+  Forward_lifo_buffer forward_key_buf;
+  Backward_lifo_buffer backward_key_buf;
+
+  /*
+    Buffer to store (rowid, range_id) pairs, or just rowids if 
+    is_mrr_assoc==FALSE
+  */
+  Forward_lifo_buffer rowid_buffer;
+  
   bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, 
                        COST_VECT *cost);
   bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, 
                                uint *buffer_size, COST_VECT *cost);
+  bool check_cpk_scan(THD *thd, uint keyno, uint mrr_flags);
+
+  bool setup_buffer_sharing(uint key_size_in_keybuf, key_part_map key_tuple_map);
+
+  /* Buffer_manager and its member functions */
+  Buffer_manager buf_manager;
+  static void redistribute_buffer_space(void *dsmrr_arg);
+  static void reset_buffer_sizes(void *dsmrr_arg);
+  static void do_nothing(void *dsmrr_arg);
+
+  Lifo_buffer* get_key_buffer() { return key_buffer; }
+
+  friend class Key_value_records_iterator;
+  friend class Mrr_ordered_index_reader;
+  friend class Mrr_ordered_rndpos_reader;
+
+  int  setup_two_handlers();
+  void close_second_handler();
 };
 
+/**
+  @} (end of group DS-MRR declarations)
+*/
+
diff --git a/sql/my_decimal.cc b/sql/my_decimal.cc
index 16449da701f..1a60d199b13 100644
--- a/sql/my_decimal.cc
+++ b/sql/my_decimal.cc
@@ -21,6 +21,10 @@
 #include "sql_class.h"                          // THD
 #endif
 
+#define DIG_BASE     1000000000
+#define DIG_PER_DEC1 9
+#define ROUND_UP(X)  (((X)+DIG_PER_DEC1-1)/DIG_PER_DEC1)
+
 #ifndef MYSQL_CLIENT
 /**
   report result of decimal operation.
@@ -34,21 +38,20 @@
     result
 */
 
-int decimal_operation_results(int result)
+int decimal_operation_results(int result, const char *value, const char *type)
 {
   switch (result) {
   case E_DEC_OK:
     break;
   case E_DEC_TRUNCATED:
     push_warning_printf(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-			WARN_DATA_TRUNCATED, ER(WARN_DATA_TRUNCATED),
-			"", (ulong) 0);
+			ER_DATA_TRUNCATED, ER(ER_DATA_TRUNCATED),
+			value, type);
     break;
   case E_DEC_OVERFLOW:
     push_warning_printf(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-                        ER_TRUNCATED_WRONG_VALUE,
-                        ER(ER_TRUNCATED_WRONG_VALUE),
-			"DECIMAL", "");
+                        ER_DATA_OVERFLOW, ER(ER_DATA_OVERFLOW),
+			value, type);
     break;
   case E_DEC_DIV_ZERO:
     push_warning_printf(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
@@ -56,9 +59,8 @@ int decimal_operation_results(int result)
     break;
   case E_DEC_BAD_NUM:
     push_warning_printf(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-			ER_TRUNCATED_WRONG_VALUE_FOR_FIELD,
-			ER(ER_TRUNCATED_WRONG_VALUE_FOR_FIELD),
-			"decimal", "", "", (ulong) 0);
+			ER_BAD_DATA, ER(ER_BAD_DATA),
+			value, type);
     break;
   case E_DEC_OOM:
     my_error(ER_OUT_OF_RESOURCES, MYF(0));
@@ -266,20 +268,69 @@ int str2my_decimal(uint mask, const char *from, uint length,
 }
 
 
+/**
+  converts a decimal into a pair of integers - for integer and fractional parts
+
+  Special version, for decimals representing a number of seconds.
+  The integer part cannot be larger than 1e18 (otherwise it's an overflow,
+  reported as *sec == LONGLONG_MAX); the fractional part is microseconds.
+  Returns d->sign().
+*/
+bool my_decimal2seconds(const my_decimal *d, ulonglong *sec, ulong *microsec)
+{
+  int pos= -1;                    /* stays -1 when there is no integer part */
+  *sec= 0;
+  if (d->intg)
+  {
+    pos= (d->intg-1)/DIG_PER_DEC1;
+    *sec= d->buf[pos];
+    if (pos > 0)
+      *sec+= static_cast<longlong>(d->buf[pos-1]) * DIG_BASE;
+  }
+
+  /* buf[pos+1] is the first word after the integer part; scale to microsec */
+  if (d->frac)
+    *microsec= static_cast<longlong>(d->buf[pos+1]) / (DIG_BASE/1000000);
+  else
+    *microsec= 0;
+
+  /* a non-zero word above the two that went into *sec means overflow */
+  for (int i=0; i < pos-1; i++)
+  {
+    if (d->buf[i])
+    {
+      *sec= LONGLONG_MAX;
+      break;
+    }
+  }
+  return d->sign();
+}
+
+
+/** Build a decimal from a sign and (seconds, microseconds); returns d. */
+my_decimal *seconds2my_decimal(bool sign,
+                               ulonglong sec, ulong microsec, my_decimal *d)
+{
+  decimal_t *raw= (decimal_t *)d; /* sign() is a method; the field is here */
+  d->init();
+  longlong2decimal(sec, d); // cannot fail
+  if (microsec)
+  {
+    int frac_pos= (raw->intg-1) / DIG_PER_DEC1 + 1;
+    raw->buf[frac_pos]= microsec * (DIG_BASE/1000000);
+    raw->frac= 6;
+  }
+  raw->sign= sign;
+  return d;
+}
+
+
 my_decimal *date2my_decimal(MYSQL_TIME *ltime, my_decimal *dec)
 {
-  longlong date;
-  date = (ltime->year*100L + ltime->month)*100L + ltime->day;
+  longlong date= (ltime->year*100L + ltime->month)*100L + ltime->day;
   if (ltime->time_type > MYSQL_TIMESTAMP_DATE)
     date= ((date*100L + ltime->hour)*100L+ ltime->minute)*100L + ltime->second;
-  if (int2my_decimal(E_DEC_FATAL_ERROR, ltime->neg ? -date : date, FALSE, dec))
-    return dec;
-  if (ltime->second_part)
-  {
-    dec->buf[(dec->intg-1) / 9 + 1]= ltime->second_part * 1000;
-    dec->frac= 6;
-  }
-  return dec;
+  return seconds2my_decimal(ltime->neg, date, ltime->second_part, dec);
 }
 
 
@@ -294,12 +345,37 @@ void my_decimal_trim(ulong *precision, uint *scale)
 }
 
 
+/*
+  Round a decimal to an integer and store it in *l as longlong (or ulonglong
+  if unsigned_flag); errors selected by mask are reported with the value.
+*/
+int my_decimal2int(uint mask, const decimal_t *d, bool unsigned_flag,
+		   longlong *l)
+{
+  my_decimal rounded;
+  /* decimal_round can return only E_DEC_TRUNCATED */
+  decimal_round(d, &rounded, 0, HALF_UP);
+  int res;
+  if (unsigned_flag)
+    res= decimal2ulonglong(&rounded, (ulonglong *) l);
+  else
+    res= decimal2longlong(&rounded, l);
+
+  if (!(res & mask))
+    return res;
+
+  /* the message shows the original, unrounded value */
+  char buff[DECIMAL_MAX_STR_LENGTH];
+  int length= sizeof(buff);
+  decimal2string(d, buff, &length, 0, 0, 0);
+  decimal_operation_results(res, buff, unsigned_flag ? "UNSIGNED INT" : "INT");
+  return res;
+}
+
+
 #ifndef DBUG_OFF
 /* routines for debugging print */
 
-#define DIG_PER_DEC1 9
-#define ROUND_UP(X)  (((X)+DIG_PER_DEC1-1)/DIG_PER_DEC1)
-
 /* print decimal */
 void
 print_decimal(const my_decimal *dec)
@@ -340,7 +416,6 @@ const char *dbug_decimal_as_string(char *buff, const my_decimal *val)
   return buff;
 }
 
-#endif /*DBUG_OFF*/
-
 
+#endif /*DBUG_OFF*/
 #endif /*MYSQL_CLIENT*/
diff --git a/sql/my_decimal.h b/sql/my_decimal.h
index f566cbe6f26..3a309209b90 100644
--- a/sql/my_decimal.h
+++ b/sql/my_decimal.h
@@ -1,4 +1,5 @@
-/* Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2005, 2011, Oracle and/or its affiliates.
+   Copyright (c) 2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -34,38 +35,12 @@
 
 C_MODE_START
 #include <decimal.h>
+#include <my_decimal_limits.h>
 C_MODE_END
 
 class String;
 typedef struct st_mysql_time MYSQL_TIME;
 
-#define DECIMAL_LONGLONG_DIGITS 22
-#define DECIMAL_LONG_DIGITS 10
-#define DECIMAL_LONG3_DIGITS 8
-
-/** maximum length of buffer in our big digits (uint32). */
-#define DECIMAL_BUFF_LENGTH 9
-
-/* the number of digits that my_decimal can possibly contain */
-#define DECIMAL_MAX_POSSIBLE_PRECISION (DECIMAL_BUFF_LENGTH * 9)
-
-
-/**
-  maximum guaranteed precision of number in decimal digits (number of our
-  digits * number of decimal digits in one our big digit - number of decimal
-  digits in one our big digit decreased by 1 (because we always put decimal
-  point on the border of our big digits))
-*/
-#define DECIMAL_MAX_PRECISION (DECIMAL_MAX_POSSIBLE_PRECISION - 8*2)
-#define DECIMAL_MAX_SCALE 30
-#define DECIMAL_NOT_SPECIFIED 31
-
-/**
-  maximum length of string representation (number of maximum decimal
-  digits + 1 position for sign + 1 position for decimal point, no terminator)
-*/
-#define DECIMAL_MAX_STR_LENGTH (DECIMAL_MAX_POSSIBLE_PRECISION + 2)
-
 /**
   maximum size of packet length.
 */
@@ -165,9 +140,10 @@ bool str_set_decimal(uint mask, const my_decimal *val, uint fixed_prec,
 extern my_decimal decimal_zero;
 
 #ifndef MYSQL_CLIENT
-int decimal_operation_results(int result);
+int decimal_operation_results(int result, const char *value, const char *type);
 #else
-inline int decimal_operation_results(int result)
+inline int decimal_operation_results(int result, const char *value,
+                                     const char *type)
 {
   return result;
 }
@@ -189,7 +165,7 @@ inline void max_internal_decimal(my_decimal *to)
 inline int check_result(uint mask, int result)
 {
   if (result & mask)
-    decimal_operation_results(result);
+    decimal_operation_results(result, "", "DECIMAL");
   return result;
 }
 
@@ -337,21 +313,16 @@ int my_decimal2string(uint mask, const my_decimal *d, uint fixed_prec,
 		      uint fixed_dec, char filler, String *str);
 #endif
 
-inline
-int my_decimal2int(uint mask, const my_decimal *d, my_bool unsigned_flag,
-		   longlong *l)
-{
-  my_decimal rounded;
-  /* decimal_round can return only E_DEC_TRUNCATED */
-  decimal_round(d, &rounded, 0, HALF_UP);
-  return check_result(mask, (unsigned_flag ?
-			     decimal2ulonglong(&rounded, (ulonglong *)l) :
-			     decimal2longlong(&rounded, l)));
-}
+bool my_decimal2seconds(const my_decimal *d, ulonglong *sec, ulong *microsec);
+
+my_decimal *seconds2my_decimal(bool sign, ulonglong sec, ulong microsec,
+                               my_decimal *d);
 
+int my_decimal2int(uint mask, const decimal_t *d, bool unsigned_flag,
+		   longlong *l);
 
 inline
-int my_decimal2double(uint, const my_decimal *d, double *result)
+int my_decimal2double(uint, const decimal_t *d, double *result)
 {
   /* No need to call check_result as this will always succeed */
   return decimal2double(d, result);
@@ -396,6 +367,16 @@ int int2my_decimal(uint mask, longlong i, my_bool unsigned_flag, my_decimal *d)
 			     longlong2decimal(i, d)));
 }
 
+inline /* Copy from into to; from's buffer may be shorter, so use its len */
+void decimal2my_decimal(decimal_t *from, my_decimal *to)
+{
+  DBUG_ASSERT(to->len >= from->len);
+  to->intg= from->intg;
+  to->frac= from->frac;
+  to->sign(from->sign);
+  memcpy(to->buf, from->buf, from->len*sizeof(decimal_digit_t));
+}
+
 
 inline
 void my_decimal_neg(decimal_t *arg)
@@ -458,7 +439,6 @@ int my_decimal_mod(uint mask, my_decimal *res, const my_decimal *a,
                                    res);
 }
 
-
 /**
   @return
     -1 if a<b, 1 if a>b and 0 if a==b
diff --git a/sql/mysql_install_db.cc b/sql/mysql_install_db.cc
new file mode 100644
index 00000000000..086dc292dec
--- /dev/null
+++ b/sql/mysql_install_db.cc
@@ -0,0 +1,637 @@
+/* Copyright (C) 2010-2011 Monty Program Ab & Vladislav Vaintroub
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  mysql_install_db creates a new database instance (optionally as service)
+  on Windows.
+*/
+#define DONT_DEFINE_VOID
+#include <my_global.h>
+#include <my_getopt.h>
+#include <my_sys.h>
+#include <m_string.h>
+
+#include <windows.h>
+#include <assert.h>
+#include <shellapi.h>
+#include <accctrl.h>
+#include <aclapi.h>
+
+#define USAGETEXT \
+"mysql_install_db.exe  Ver 1.00 for Windows\n" \
+"Copyright (C) 2010-2011 Monty Program Ab & Vladislav Vaintroub\n" \
+"This software comes with ABSOLUTELY NO WARRANTY. This is free software,\n" \
+"and you are welcome to modify and redistribute it under the GPL v2 license\n" \
+"Usage: mysql_install_db.exe [OPTIONS]\n" \
+"OPTIONS:"
+
+extern "C" const char mysql_bootstrap_sql[];
+
+char default_os_user[]= "NT AUTHORITY\\NetworkService";
+static int create_db_instance();
+static my_bool opt_silent; /* GET_BOOL option, my_bool like the other flags */
+static char datadir_buffer[FN_REFLEN]; /* absolute datadir, filled in main() */
+static char mysqld_path[FN_REFLEN]; /* path to mysqld.exe, set in main() */
+static char *opt_datadir;
+static char *opt_service;
+static char *opt_password;
+static int  opt_port;
+static char *opt_socket;
+static char *opt_os_user; /* main() falls back to default_os_user */
+static char *opt_os_password;
+static my_bool opt_default_user;
+static my_bool opt_allow_remote_root_access;
+static my_bool opt_skip_networking;
+static my_bool verbose_errors; /* die() prints extra help when set */
+
+
+static struct my_option my_long_options[]=
+{
+  {"help", '?', "Display this help message and exit.", 0, 0, 0, GET_NO_ARG,
+   NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"datadir", 'd', "Data directory of the new database",
+  &opt_datadir, &opt_datadir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"service", 'S', "Name of the Windows service",
+  &opt_service, &opt_service, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"password", 'p', "Root password",
+  &opt_password, &opt_password, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"port", 'P', "mysql port",
+  &opt_port, &opt_port, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"socket", 'W', 
+  "named pipe name (if missing, it will be set the same as service)",
+  &opt_socket, &opt_socket, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"default-user", 'D', "Create default user",
+  &opt_default_user, &opt_default_user, 0 , GET_BOOL, OPT_ARG, 0, 0, 0, 0, 0, 0},
+  {"allow-remote-root-access", 'R', 
+  "Allows remote access from network for user root",
+  &opt_allow_remote_root_access, &opt_allow_remote_root_access, 0 , GET_BOOL, 
+  OPT_ARG, 0, 0, 0, 0, 0, 0},
+  {"skip-networking", 'N', "Do not use TCP connections, use pipe instead",
+  &opt_skip_networking, &opt_skip_networking, 0 , GET_BOOL, OPT_ARG, 0, 0, 0, 0,
+  0, 0},
+  {"silent", 's', "Print less information", &opt_silent,
+   &opt_silent, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+
+/* handle_options() callback: only '?' needs work, it prints usage and exits */
+static my_bool get_one_option(int optid, 
+   const struct my_option *opt __attribute__ ((unused)),
+   char *argument __attribute__ ((unused)))
+{
+  DBUG_ENTER("get_one_option");
+  if (optid == '?')
+  {
+    /* usage banner first, then the generated option list */
+    printf("%s\n", USAGETEXT);
+    my_print_help(my_long_options);
+    exit(0);
+  }
+  DBUG_RETURN(0);
+}
+
+
+/* Report a fatal error on stderr (printf-style arguments) and exit(1). */
+static void die(const char *fmt, ...)
+{
+  va_list ap;
+  DBUG_ENTER("die");
+  /* "FATAL ERROR: " + formatted message + newline */
+  fputs("FATAL ERROR: ", stderr);
+  va_start(ap, fmt);
+  vfprintf(stderr, fmt, ap);
+  va_end(ap);
+  fputc('\n', stderr);
+
+  /* main() turns this on once the command line has been parsed */
+  if (verbose_errors)
+    fputs(
+   "http://kb.askmonty.org/v/installation-issues-on-windows contains some help\n"
+   "for solving the most common problems.  If this doesn't help you, please\n"
+   "leave a comment in the knowledge base or file a bug report at\n"
+   "https://bugs.launchpad.net/maria", stderr);
+  fflush(stderr);
+  my_end(0);
+  exit(1);
+}
+
+
+/* printf-style message on stdout, newline appended; no-op if opt_silent */
+static void verbose(const char *fmt, ...)
+{
+  if (!opt_silent)
+  {
+    va_list ap;
+
+    va_start(ap, fmt);
+    vfprintf(stdout, fmt, ap);
+    va_end(ap);
+    fputc('\n', stdout);
+    fflush(stdout);
+  }
+}
+
+
+/* Create a new database instance as described by the command line. */
+int main(int argc, char **argv)
+{
+  int error;
+  char self_name[FN_REFLEN];
+  char *p;
+  DWORD n;
+
+  MY_INIT(argv[0]);
+  /* mysqld.exe is looked up next to this executable; 0 or a full buffer is an error */
+  n= GetModuleFileName(NULL, self_name, FN_REFLEN);
+  if (n == 0 || n >= FN_REFLEN)
+    die("Cannot determine the path of this executable (%u)", GetLastError());
+  strcpy(mysqld_path,self_name);
+  p= strrchr(mysqld_path, FN_LIBCHAR);
+  if (p)
+    strcpy(p, "\\mysqld.exe");
+
+  if ((error= handle_options(&argc, &argv, my_long_options, get_one_option)))
+    exit(error);
+  if (!opt_datadir)
+  {
+    my_print_help(my_long_options);
+    die("parameter --datadir=# is mandatory");
+  }
+
+  /* Print some help on errors */
+  verbose_errors= TRUE;
+
+  if (!opt_os_user)
+  {
+    opt_os_user= default_os_user;
+    opt_os_password= NULL;
+  }
+  /* Workaround WiX bug (strip possible quote character at the end of path) */
+  size_t len= strlen(opt_datadir);
+  if (len > 0 && opt_datadir[len-1] == '"')
+    opt_datadir[len-1]= 0;
+
+  /* Use an absolute path from here on; 0 or >= buffer size means failure */
+  n= GetFullPathName(opt_datadir, FN_REFLEN, datadir_buffer, NULL);
+  if (n == 0 || n >= FN_REFLEN)
+    die("Cannot resolve data directory '%s' (%u)", opt_datadir, GetLastError());
+  opt_datadir= datadir_buffer;
+
+  if (create_db_instance())
+    die("database creation failed");
+
+  printf("Creation of the database was successful\n");
+  return 0;
+}
+
+
+
+/**
+  Replace every backslash in the NUL-terminated string s with a forward
+  slash, in place.
+*/
+static void convert_slashes(char *s)
+{
+  char *bs= s;
+  while ((bs= strchr(bs, '\\')) != NULL)
+    *bs= '/';
+}
+
+
+/**
+  Calculate basedir from mysqld.exe path.
+  Basedir is assumed to be one level up from the mysqld.exe directory.
+  E.g basedir for C:\my\bin\mysqld.exe would be C:/my (the result uses
+  forward slashes).
+*/
+static void get_basedir(char *basedir, int size, const char *mysqld_path)
+{
+  strcpy_s(basedir, size,  mysqld_path);
+  convert_slashes(basedir);
+  /* cut the last two path components: the file name, then its directory */
+  for (int levels= 0; levels < 2; levels++)
+  {
+    char *last_slash= strrchr(basedir, '/');
+    if (!last_slash)
+      break;
+    *last_slash= 0;
+  }
+}
+
+
+/**
+  Write the command line for mysqld --bootstrap into cmdline and return it.
+  The resulting string is passed to popen, so it needs a lot of quoting:
+  quoting around the full string plus quoting around parameters with spaces.
+*/
+
+static char *init_bootstrap_command_line(char *cmdline, size_t size)
+{
+  char basedir[MAX_PATH];
+  get_basedir(basedir, sizeof(basedir), mysqld_path);
+
+  my_snprintf(cmdline, size-1, 
+    "\"\"%s\" --no-defaults --bootstrap"
+    " \"--language=%s\\share\\english\""
+    " --basedir=. --datadir=. --default-storage-engine=myisam"
+    " --max_allowed_packet=9M --loose-skip-innodb --loose-skip-pbxt"
+    " --net-buffer-length=16k\"", mysqld_path, basedir);
+  return cmdline;
+}
+
+
+/**
+  Create my.ini in the current directory (this is assumed to be the
+  data directory as well).
+
+  Writes a [mysqld] and a [client] section from the command line options.
+  As a side effect, opt_socket defaults to the service name when
+  --skip-networking is given without --socket.
+
+  @return 0; any failure ends the program through die()
+*/
+
+static int create_myini()
+{
+  printf("Creating my.ini file\n");
+
+  /* The current directory becomes datadir; fail rather than write garbage */
+  char path_buf[MAX_PATH];
+  DWORD n= GetCurrentDirectory(MAX_PATH, path_buf);
+  if (n == 0 || n >= MAX_PATH)
+    die("Cannot get current directory (%u)", GetLastError());
+  convert_slashes(path_buf);
+
+  /* Create ini file. */
+  FILE *myini= fopen("my.ini","wt");
+  if (!myini)
+    die("Cannot create my.ini in data directory");
+
+  /* Without TCP the server is reached via a pipe named after the service */
+  if (opt_skip_networking && !opt_socket)
+    opt_socket= opt_service;
+  my_bool have_socket= (my_bool) (opt_socket && opt_socket[0]);
+
+  /* Write out server settings. */
+  fprintf(myini, "[mysqld]\n");
+  fprintf(myini, "datadir=%s\n", path_buf);
+  if (opt_skip_networking)
+    fprintf(myini,"skip-networking\n");
+  if (have_socket || opt_skip_networking)
+    fprintf(myini,"enable-named-pipe\n");
+  if (have_socket)
+    fprintf(myini, "socket=%s\n", opt_socket);
+  if (opt_port)
+    fprintf(myini,"port=%d\n", opt_port);
+
+  /* Write out client settings. */
+  fprintf(myini, "[client]\n");
+
+  /* Used for named pipes */
+  if (have_socket)
+    fprintf(myini,"socket=%s\n",opt_socket);
+  if (opt_skip_networking)
+    fprintf(myini,"protocol=pipe\n");
+  else if (opt_port)
+    fprintf(myini,"port=%d\n",opt_port);
+
+  /* my.ini is only usable if everything reached the disk */
+  if (fclose(myini))
+    die("Cannot write my.ini in data directory");
+  return 0;
+}
+
+
+static const char update_root_passwd_part1[]=
+  "UPDATE mysql.user SET Password = PASSWORD('";
+static const char update_root_passwd_part2[]=
+  "') where User='root';\n";
+static const char remove_default_user_cmd[]= 
+  "DELETE FROM mysql.user where User='';\n";
+static const char allow_remote_root_access_cmd[]=
+  "CREATE TEMPORARY TABLE tmp_user LIKE user;\n"
+  "INSERT INTO tmp_user SELECT * from user where user='root' "
+    " AND host='localhost';\n"
+  "UPDATE tmp_user SET host='%';\n"
+  "INSERT INTO user SELECT * FROM tmp_user;\n"
+  "DROP TABLE tmp_user;\n";
+static const char end_of_script[]="-- end.";
+
+/* Register opt_service as an auto-start service that runs mysqld_path with
+   my.ini from datadir (assumed to exist). Failures are reported via die(). */
+static int register_service()
+{
+  char buf[3*MAX_PATH +32]; /* path to mysqld.exe, to my.ini, service name */
+  SC_HANDLE sc_manager, sc_service;
+
+  size_t datadir_len= strlen(opt_datadir);
+  const char *backslash_after_datadir= "\\";
+
+  /* Avoid a doubled separator when datadir already ends with a backslash. */
+  if (datadir_len && opt_datadir[datadir_len-1] == '\\')
+    backslash_after_datadir= "";
+
+  verbose("Registering service '%s'", opt_service);
+  my_snprintf(buf, sizeof(buf)-1,
+    "\"%s\" \"--defaults-file=%s%smy.ini\" \"%s\"" ,  mysqld_path, opt_datadir,
+    backslash_after_datadir, opt_service);
+
+  /* Get a handle to the SCM database. */
+  sc_manager= OpenSCManager( NULL, NULL, SC_MANAGER_ALL_ACCESS);
+  if (!sc_manager)
+    die("OpenSCManager failed (%u)\n", GetLastError());
+
+  /* Create the service. */
+  sc_service= CreateService(sc_manager, opt_service,  opt_service,
+    SERVICE_ALL_ACCESS, SERVICE_WIN32_OWN_PROCESS, SERVICE_AUTO_START,
+    SERVICE_ERROR_NORMAL, buf, NULL, NULL, NULL, opt_os_user, opt_os_password);
+  if (!sc_service)
+  {
+    /* Save the error code first: CloseServiceHandle() may overwrite it. */
+    DWORD create_error= GetLastError();
+    CloseServiceHandle(sc_manager);
+    die("CreateService failed (%u)", create_error);
+  }
+
+  SERVICE_DESCRIPTION sd= { "MariaDB database server" };
+  ChangeServiceConfig2(sc_service, SERVICE_CONFIG_DESCRIPTION, &sd);
+  CloseServiceHandle(sc_service);
+  CloseServiceHandle(sc_manager);
+  return 0;
+}
+
+/* Delete 'dir' through the shell (FO_DELETE), silently and unprompted. */
+static void clean_directory(const char *dir)
+{
+  /* pFrom must be double-NUL terminated; strmake() writes its NUL at index
+     <= MAX_PATH, so the second NUL still fits in dir2. */
+  char dir2[MAX_PATH+2];
+  *(strmake(dir2, dir, MAX_PATH)+1)= 0;
+
+  SHFILEOPSTRUCT fileop;
+  fileop.hwnd= NULL;    /* no status display */
+  fileop.wFunc= FO_DELETE;  /* delete operation */
+  fileop.pFrom= dir2;  /* source file name as double null terminated string */
+  fileop.pTo= NULL;    /* no destination needed */
+  fileop.fFlags= FOF_NOCONFIRMATION|FOF_SILENT;  /* do not prompt the user */
+  fileop.fAnyOperationsAborted= FALSE;
+  fileop.lpszProgressTitle= NULL;
+  fileop.hNameMappings= NULL;
+
+  SHFileOperation(&fileop);
+}
+
+
+/*
+  Grant a user inheritable full access (GENERIC_ALL) to a directory.
+  os_user is an account name; the three "NT AUTHORITY" service accounts are
+  mapped to well-known SIDs. NULL means the user of the current process.
+  Returns -1 if dir cannot be opened, 1 if the well-known SID cannot be
+  created and 0 otherwise (remaining failures are ignored).
+*/
+
+static int set_directory_permissions(const char *dir, const char *os_user)
+{
+  struct{
+    TOKEN_USER tokenUser;
+    BYTE buffer[SECURITY_MAX_SID_SIZE];
+  } tokenInfoBuffer;
+  /* Declared up front so that no 'goto end' jumps over an initialization. */
+  ACL *pOldDACL= NULL;
+  ACL *pNewDACL= NULL;
+  SECURITY_DESCRIPTOR *pSD= NULL;
+  EXPLICIT_ACCESS ea={0};
+  WELL_KNOWN_SID_TYPE wellKnownSidType= WinNullSid;
+  PSID pSid= NULL;
+  int ret= 0;
+
+  HANDLE hDir= CreateFile(dir,READ_CONTROL|WRITE_DAC,0,NULL,OPEN_EXISTING,
+    FILE_FLAG_BACKUP_SEMANTICS,NULL);
+  if (hDir == INVALID_HANDLE_VALUE)
+    return -1;
+
+  /* Without the current DACL there is nothing safe to extend. */
+  if (GetSecurityInfo(hDir, SE_FILE_OBJECT, DACL_SECURITY_INFORMATION, NULL,
+        NULL, &pOldDACL, NULL, (void**)&pSD) != ERROR_SUCCESS)
+    goto end;
+
+  if (os_user)
+  {
+    /*
+      Check for 3 predefined service users.
+      They might have localized names in non-English Windows, thus they need
+      to be handled using well-known SIDs.
+    */
+    if (stricmp(os_user, "NT AUTHORITY\\NetworkService") == 0)
+      wellKnownSidType= WinNetworkServiceSid;
+    else if (stricmp(os_user, "NT AUTHORITY\\LocalService") == 0)
+      wellKnownSidType= WinLocalServiceSid;
+    else if (stricmp(os_user, "NT AUTHORITY\\LocalSystem") == 0)
+      wellKnownSidType= WinLocalSystemSid;
+
+    if (wellKnownSidType != WinNullSid)
+    {
+      DWORD size= SECURITY_MAX_SID_SIZE;
+      pSid= (PSID)tokenInfoBuffer.buffer;
+      if (!CreateWellKnownSid(wellKnownSidType, NULL, pSid, &size))
+      {
+        ret= 1;
+        goto end;
+      }
+      ea.Trustee.TrusteeForm= TRUSTEE_IS_SID;
+      ea.Trustee.ptstrName= (LPTSTR)pSid;
+    }
+    else
+    {
+      ea.Trustee.TrusteeForm= TRUSTEE_IS_NAME;
+      ea.Trustee.ptstrName= (LPSTR)os_user;
+    }
+  }
+  else
+  {
+    /* No user given: use the SID of the account running this process. */
+    HANDLE token;
+    if (OpenProcessToken(GetCurrentProcess(),TOKEN_QUERY, &token))
+    {
+      DWORD length= (DWORD) sizeof(tokenInfoBuffer);
+      if (GetTokenInformation(token, TokenUser, &tokenInfoBuffer,
+        length, &length))
+        pSid= tokenInfoBuffer.tokenUser.User.Sid;
+      CloseHandle(token);
+    }
+    if (!pSid)
+      goto end;
+    ea.Trustee.TrusteeForm= TRUSTEE_IS_SID;
+    ea.Trustee.ptstrName= (LPTSTR)pSid;
+  }
+  ea.grfAccessMode= GRANT_ACCESS;
+  ea.grfAccessPermissions= GENERIC_ALL;
+  ea.grfInheritance= CONTAINER_INHERIT_ACE|OBJECT_INHERIT_ACE;
+  ea.Trustee.TrusteeType= TRUSTEE_IS_UNKNOWN;
+  SetEntriesInAcl(1,&ea,pOldDACL,&pNewDACL);
+  if (pNewDACL)
+    SetSecurityInfo(hDir,SE_FILE_OBJECT,DACL_SECURITY_INFORMATION,NULL, NULL,
+      pNewDACL, NULL);
+
+end:
+  /* Single exit: release the descriptor, new ACL and handle on every path. */
+  if (pSD != NULL)
+    LocalFree((HLOCAL) pSD);
+  if (pNewDACL != NULL)
+    LocalFree((HLOCAL) pNewDACL);
+  CloseHandle(hDir);
+  return ret;
+}
+
+
+/*
+  Give directory permissions for special service user NT SERVICE\servicename
+  (available on Win7 and later only). Does nothing without a service name.
+*/
+void grant_directory_permissions_to_service()
+{
+  char service_user[MAX_PATH+ 12];
+  OSVERSIONINFO info;
+  info.dwOSVersionInfoSize= sizeof(info);
+  GetVersionEx(&info);
+  /* && binds tighter than ||: test opt_service for every version branch. */
+  if (opt_service &&
+      (info.dwMajorVersion > 6 ||
+       (info.dwMajorVersion == 6 && info.dwMinorVersion > 0)))
+  {
+    my_snprintf(service_user,sizeof(service_user), "NT SERVICE\\%s",
+      opt_service);
+    set_directory_permissions(opt_datadir, service_user);
+  }
+}
+
+
+/*
+  Create database instance (including registering as service etc).
+  Returns 0 on success. Failures past the chdir clean opt_datadir, return != 0.
+*/
+static int create_db_instance()
+{
+  int ret= 0;
+  char cwd[MAX_PATH];
+  DWORD cwd_len= MAX_PATH;
+  char cmdline[3*MAX_PATH];
+  FILE *in;
+
+  verbose("Running bootstrap");
+
+  GetCurrentDirectory(cwd_len, cwd);
+  CreateDirectory(opt_datadir, NULL); /*ignore error, it might already exist */
+
+  if (!SetCurrentDirectory(opt_datadir))
+  {
+    die("Cannot set current directory to '%s'\n",opt_datadir);
+    return -1;
+  }
+
+  CreateDirectory("mysql",NULL);
+  CreateDirectory("test", NULL);
+
+  /*
+    Set data directory permissions for both current user and
+    default_os_user (the one who runs services).
+  */
+  set_directory_permissions(opt_datadir, NULL);
+  set_directory_permissions(opt_datadir, default_os_user);
+
+  /* Do mysqld --bootstrap. */
+  init_bootstrap_command_line(cmdline, sizeof(cmdline));
+  /* verbose("Executing %s", cmdline); */
+
+  in= popen(cmdline, "wt");
+  if (!in)
+  {
+    /* Without setting ret this failure would be reported as success. */
+    verbose("ERROR: Cannot start mysqld bootstrap process");
+    ret= 1;
+    goto end;
+  }
+
+  /* Select the system database, then write the bootstrap script to stdin. */
+  if (fwrite("use mysql;\n",11,1, in) != 1 ||
+      fwrite(mysql_bootstrap_sql, strlen(mysql_bootstrap_sql), 1, in) != 1)
+  {
+    verbose("ERROR: Cannot write to mysqld's stdin");
+    pclose(in); /* do not leak the pipe to the child process */
+    ret= 1;
+    goto end;
+  }
+
+  /* Remove default user, if requested. */
+  if (!opt_default_user)
+  {
+    verbose("Removing default user");
+    fputs(remove_default_user_cmd, in);
+    fflush(in);
+  }
+
+  if (opt_allow_remote_root_access)
+  {
+    verbose("Allowing remote access for user root");
+    fputs(allow_remote_root_access_cmd,in);
+    fflush(in);
+  }
+
+  /* Change root password if requested. */
+  if (opt_password)
+  {
+    verbose("Changing root password");
+    fputs(update_root_passwd_part1, in);
+    fputs(opt_password, in); /* NOTE(review): written into the SQL unescaped */
+    fputs(update_root_passwd_part2, in);
+    fflush(in);
+  }
+
+  /*
+    On some reason, bootstrap chokes if last command sent via stdin ends with
+    newline, so we supply a dummy comment, that does not end with newline.
+  */
+  fputs(end_of_script, in);
+  fflush(in);
+
+  /* Check if bootstrap has completed successfully. */
+  ret= pclose(in);
+  if (ret)
+  {
+    verbose("mysqld returned error %d in pclose",ret);
+    goto end;
+  }
+
+  /* Create my.ini file in data directory.*/
+  ret= create_myini();
+  if (ret)
+    goto end;
+
+  /* Register service if requested. */
+  if (opt_service && opt_service[0])
+  {
+    ret= register_service();
+    grant_directory_permissions_to_service();
+  }
+
+end:
+  if (ret)
+  {
+    SetCurrentDirectory(cwd);
+    clean_directory(opt_datadir);
+  }
+  return ret;
+}
diff --git a/sql/mysql_upgrade_service.cc b/sql/mysql_upgrade_service.cc
new file mode 100644
index 00000000000..db916101eb1
--- /dev/null
+++ b/sql/mysql_upgrade_service.cc
@@ -0,0 +1,522 @@
+/* Copyright (C) 2010-2011 Monty Program Ab & Vladislav Vaintroub
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  mysql_upgrade_service upgrades mysql service on Windows.
+  It changes service definition to point to the new mysqld.exe, restarts the 
+  server and runs mysql_upgrade
+*/
+
+#define DONT_DEFINE_VOID
+#include <process.h>
+#include <my_global.h>
+#include <my_getopt.h>
+#include <my_sys.h>
+#include <m_string.h>
+#include <mysql_version.h>
+#include <winservice.h>
+
+#include <windows.h>
+
+/* We're using version APIs */
+#pragma comment(lib, "version")
+/* Banner printed for --help, ahead of the generated option list. */
+#define USAGETEXT \
+"mysql_upgrade_service.exe  Ver 1.00 for Windows\n" \
+"Copyright (C) 2010-2011 Monty Program Ab & Vladislav Vaintroub\n" \
+"This software comes with ABSOLUTELY NO WARRANTY. This is free software,\n" \
+"and you are welcome to modify and redistribute it under the GPL v2 license\n" \
+"Usage: mysql_upgrade_service.exe [OPTIONS]\n" \
+"OPTIONS:"
+
+static char mysqld_path[MAX_PATH];
+static char mysqladmin_path[MAX_PATH];
+static char mysqlupgrade_path[MAX_PATH];
+
+static char defaults_file_param[MAX_PATH + 16]; /*--defaults-file=<path> */
+static char logfile_path[MAX_PATH];
+static char *opt_service;
+static SC_HANDLE service;
+static SC_HANDLE scm;
+HANDLE mysqld_process; // mysqld.exe started for upgrade
+DWORD initial_service_state= -1; // initial state of the service
+HANDLE logfile_handle;
+
+/*
+  Startup and shutdown timeouts, in seconds. 
+  Maybe,they can be made parameters
+*/
+static unsigned int startup_timeout= 60;
+static unsigned int shutdown_timeout= 60;
+
+static struct my_option my_long_options[]=
+{
+  {"help", '?', "Display this help message and exit.", 0, 0, 0, GET_NO_ARG,
+   NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"service", 'S', "Name of the existing Windows service",
+  &opt_service, &opt_service, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+
+/* Option callback for handle_options(): only '?' (help) needs handling. */
+static my_bool
+get_one_option(int optid, 
+   const struct my_option *opt __attribute__ ((unused)),
+   char *argument __attribute__ ((unused)))
+{
+  DBUG_ENTER("get_one_option");
+  if (optid == '?')
+  {
+    /* Print the banner plus the generated option list, then stop. */
+    printf("%s\n", USAGETEXT);
+    my_print_help(my_long_options);
+    exit(0);
+  }
+  DBUG_RETURN(0);
+}
+
+
+/* Print a printf-style progress message plus a newline to stdout and flush. */
+static void log(const char *fmt, ...)
+{
+  va_list ap;
+
+  va_start(ap, fmt);
+  vprintf(fmt, ap);
+  va_end(ap);
+  putchar('\n');
+  fflush(stdout);
+}
+
+/* Report a fatal error on stderr, undo what was started and exit(1). */
+static void die(const char *fmt, ...)
+{
+  va_list args;
+  DBUG_ENTER("die");
+
+  /* Print the error message */
+  va_start(args, fmt);
+  fprintf(stderr, "FATAL ERROR: ");
+  vfprintf(stderr, fmt, args);
+  if (logfile_path[0])
+  {
+    /* Start on a new line so the hint does not run into the message. */
+    fprintf(stderr, "\nAdditional information can be found in the log file %s",
+      logfile_path);
+  }
+  va_end(args);
+  fputc('\n', stderr);
+  fflush(stdout);
+  /* Cleanup */
+
+  /*
+    Stop service that we started, if it was not initially running at
+    program start.
+  */
+  if (initial_service_state != -1 && initial_service_state != SERVICE_RUNNING)
+  {
+    SERVICE_STATUS service_status;
+    ControlService(service, SERVICE_CONTROL_STOP, &service_status);
+  }
+
+  if (scm)
+    CloseServiceHandle(scm);
+  if (service)
+    CloseServiceHandle(service);
+  /* Stop mysqld.exe, if it was started for upgrade */
+  if (mysqld_process)
+    TerminateProcess(mysqld_process, 3);
+  if (logfile_handle)
+    CloseHandle(logfile_handle);
+  my_end(0);
+
+  exit(1);
+}
+
+
+/*
+  spawn-like function to run subprocesses.
+  Output (stdout and stderr) of the child is redirected to the log file,
+  which is created on first use in the temporary directory.
+
+  Typical usage could be something like
+  run_tool(P_NOWAIT, "cmd.exe", "/c" , "echo", "foo", NULL)
+
+  @param wait_flag  P_WAIT or P_NOWAIT
+  @param program    program to run; the remaining arguments are a NULL
+                    terminated list of strings building the command line
+
+  @return process handle (caller closes it) if P_NOWAIT is used, otherwise
+          the exit code of the process. Failures end in die().
+*/
+
+static intptr_t run_tool(int wait_flag, const char *program,...)
+{
+  static char cmdline[32*1024];
+  char *end;
+  va_list args;
+  va_start(args, program);
+  if (!program)
+    die("Invalid call to run_tool");
+  end= strxmov(cmdline, "\"", program, "\"", NullS);
+
+  for(;;)
+  {
+    char *param= va_arg(args,char *);
+    if(!param)
+      break;
+    end= strxmov(end, " \"", param, "\"", NullS);
+  }
+  va_end(args);
+
+  /* Create output file if not already done */
+  if (!logfile_handle)
+  {
+    char tmpdir[FN_REFLEN];
+    /* The handle must be inheritable to serve as the child's stdout/stderr. */
+    SECURITY_ATTRIBUTES attr= {0};
+    attr.nLength= sizeof(attr);
+    attr.bInheritHandle= TRUE;
+    GetTempPath(FN_REFLEN, tmpdir);
+    sprintf_s(logfile_path, "%s\\mysql_upgrade_service.%s.log", tmpdir,
+      opt_service);
+    /* CREATE_ALWAYS: TRUNCATE_EXISTING fails when the file does not exist. */
+    HANDLE new_log= CreateFile(logfile_path, GENERIC_WRITE, FILE_SHARE_READ,
+      &attr, CREATE_ALWAYS, 0, NULL);
+    /* CreateFile reports failure as INVALID_HANDLE_VALUE, not NULL. */
+    if (new_log == INVALID_HANDLE_VALUE)
+      die("Cannot open log file %s, windows error %u",
+        logfile_path, GetLastError());
+    logfile_handle= new_log;
+  }
+
+  /* Start child process */
+  STARTUPINFO si= {0};
+  si.cb= sizeof(si);
+  si.hStdInput= GetStdHandle(STD_INPUT_HANDLE);
+  si.hStdError= logfile_handle;
+  si.hStdOutput= logfile_handle;
+  si.dwFlags= STARTF_USESTDHANDLES;
+  PROCESS_INFORMATION pi;
+  if (!CreateProcess(NULL, cmdline, NULL,
+       NULL, TRUE, 0, NULL, NULL, &si, &pi))
+    die("CreateProcess failed (commandline %s)", cmdline);
+  CloseHandle(pi.hThread);
+
+  if (wait_flag == P_NOWAIT)
+  {
+    /* Do not wait for process to complete, return handle. */
+    return (intptr_t)pi.hProcess;
+  }
+
+  /* Wait for process to complete. */
+  if (WaitForSingleObject(pi.hProcess, INFINITE) != WAIT_OBJECT_0)
+    die("WaitForSingleObject() failed");
+  DWORD exit_code;
+  if (!GetExitCodeProcess(pi.hProcess, &exit_code))
+    die("GetExitCodeProcess() failed");
+  CloseHandle(pi.hProcess); /* was leaked on every P_WAIT call */
+  return (intptr_t)exit_code;
+}
+
+/* Poll the service until it is stopped, requesting a stop if it is running. */
+void stop_mysqld_service()
+{
+  DWORD needed;
+  SERVICE_STATUS_PROCESS ssp;
+  int timeout= shutdown_timeout*1000; 
+  for(;;)
+  {
+    if (!QueryServiceStatusEx(service, SC_STATUS_PROCESS_INFO,
+          (LPBYTE)&ssp, 
+          sizeof(SERVICE_STATUS_PROCESS),
+          &needed))
+    {
+      die("QueryServiceStatusEx failed (%u)\n", GetLastError()); 
+    }
+
+    /*
+      Remember initial state of the service, we will restore it on
+      exit.
+    */
+    if(initial_service_state == -1)
+      initial_service_state= ssp.dwCurrentState;
+
+    switch(ssp.dwCurrentState)
+    {
+      case SERVICE_STOPPED:
+        return;
+      case SERVICE_RUNNING:
+        if(!ControlService(service, SERVICE_CONTROL_STOP, 
+             (SERVICE_STATUS *)&ssp))
+            die("ControlService failed, error %u\n", GetLastError());
+      case SERVICE_START_PENDING: /* SERVICE_RUNNING falls through to here */
+      case SERVICE_STOP_PENDING:
+        if(timeout < 0)
+          die("Service does not stop after %d seconds timeout",shutdown_timeout);
+        Sleep(100);
+        timeout -= 100;
+        break;
+      default:
+        die("Unexpected service state %d",ssp.dwCurrentState);
+    }
+  }
+}
+
+
+/*
+  Shutdown mysql server. Not using mysqladmin, since
+  our --skip-grant-tables do not work anymore after mysql_upgrade
+  that does "flush privileges". Instead, the shutdown event  is set.
+*/
+void initiate_mysqld_shutdown()
+{
+  char event_name[32];
+  DWORD pid= GetProcessId(mysqld_process);
+  sprintf_s(event_name, "MySQLShutdown%u", pid);
+  HANDLE shutdown_handle= OpenEvent(EVENT_MODIFY_STATE, FALSE, event_name);
+  if(!shutdown_handle)
+    die("OpenEvent() failed for shutdown event");
+
+  if(!SetEvent(shutdown_handle))
+  {
+    die("SetEvent() failed");
+  }
+  /* The handle is not needed once the event is set; it used to be leaked. */
+  CloseHandle(shutdown_handle);
+}
+
+
+/*
+  Change service configuration (binPath) to point to mysqld from
+  this installation. Also opens the global scm/service handles and adjusts
+  the service's ini file (datadir, basedir, default-character-set).
+  Failures of the service API calls end the program through die().
+*/
+static void change_service_config()
+{
+  char default_character_set[64];
+  char buf[MAX_PATH];
+  char commandline[3*MAX_PATH + 19];
+  int i;
+
+  scm= OpenSCManager(NULL, NULL, SC_MANAGER_ALL_ACCESS);
+  if(!scm)
+    die("OpenSCManager failed with %u", GetLastError());
+  service= OpenService(scm, opt_service, SERVICE_ALL_ACCESS);
+  if (!service)
+    die("OpenService failed with %u", GetLastError());
+
+  BYTE config_buffer[8*1024];
+  LPQUERY_SERVICE_CONFIGW config= (LPQUERY_SERVICE_CONFIGW)config_buffer;
+  DWORD size= sizeof(config_buffer);
+  DWORD needed;
+  if (!QueryServiceConfigW(service, config, size, &needed))
+    die("QueryServiceConfig failed with %u", GetLastError());
+
+  mysqld_service_properties props;
+  if (get_mysql_service_properties(config->lpBinaryPathName, &props))
+  {
+    die("Not a valid MySQL service");
+  }
+
+  int my_major= MYSQL_VERSION_ID/10000;
+  int my_minor= (MYSQL_VERSION_ID %10000)/100;
+  int my_patch= MYSQL_VERSION_ID%100;
+
+  if(my_major < props.version_major ||
+    (my_major == props.version_major && my_minor < props.version_minor))
+  {
+    die("Can not downgrade, the service is currently running as version %d.%d.%d"
+      ", my version is %d.%d.%d", props.version_major, props.version_minor,
+      props.version_patch, my_major, my_minor, my_patch);
+  }
+
+  if(props.inifile[0] == 0)
+  {
+    /*
+      Weird case, no --defaults-file in service definition, need to create one.
+    */
+    sprintf_s(props.inifile, MAX_PATH, "%s\\my.ini", props.datadir);
+  }
+
+  /*
+    Write datadir to my.ini, after converting  backslashes to
+    unix style slashes.
+  */
+  strcpy_s(buf, MAX_PATH, props.datadir);
+  for(i= 0; buf[i]; i++)
+  {
+    if (buf[i] == '\\')
+      buf[i]= '/';
+  }
+  WritePrivateProfileString("mysqld", "datadir",buf, props.inifile);
+
+  /*
+    Remove basedir from defaults file, otherwise the service wont come up in
+    the new version, and will complain about mismatched message file.
+  */
+  WritePrivateProfileString("mysqld", "basedir",NULL, props.inifile);
+
+  /*
+    Replace default-character-set  with character-set-server, to avoid
+    "default-character-set is deprecated and will be replaced ..."
+    message. The option is read from and written to props.inifile, the
+    same ini file that was updated above.
+  */
+  default_character_set[0]= 0;
+  GetPrivateProfileString("mysqld", "default-character-set", NULL,
+    default_character_set, sizeof(default_character_set), props.inifile);
+  if (default_character_set[0])
+  {
+    WritePrivateProfileString("mysqld", "default-character-set", NULL,
+      props.inifile);
+    WritePrivateProfileString("mysqld", "character-set-server",
+      default_character_set, props.inifile);
+  }
+
+  sprintf(defaults_file_param,"--defaults-file=%s", props.inifile);
+  sprintf_s(commandline, "\"%s\" \"%s\" \"%s\"", mysqld_path,
+   defaults_file_param, opt_service);
+  if (!ChangeServiceConfig(service, SERVICE_NO_CHANGE, SERVICE_NO_CHANGE,
+         SERVICE_NO_CHANGE, commandline, NULL, NULL, NULL, NULL, NULL, NULL))
+  {
+    die("ChangeServiceConfig failed with %u", GetLastError());
+  }
+}
+
+/* Entry point: runs the eight phases logged below; errors end in die(). */
+int main(int argc, char **argv)
+{
+  int error;
+  MY_INIT(argv[0]);
+  char bindir[FN_REFLEN];
+  char *p;
+
+  /* Parse options */
+  if ((error= handle_options(&argc, &argv, my_long_options, get_one_option)))
+    die("");
+  if (!opt_service)
+    die("--service=# parameter is mandatory");
+ 
+ /*
+    Get full path to mysqld, we need it when changing service configuration.
+    Assume installation layout, i.e mysqld.exe, mysqladmin.exe, mysqlupgrade.exe
+    and mysql_upgrade_service.exe are in the same directory.
+  */
+  GetModuleFileName(NULL, bindir, FN_REFLEN);
+  p= strrchr(bindir, FN_LIBCHAR);
+  if(p)
+  {
+    *p= 0;
+  }
+  sprintf_s(mysqld_path, "%s\\mysqld.exe", bindir);
+  sprintf_s(mysqladmin_path, "%s\\mysqladmin.exe", bindir);
+  sprintf_s(mysqlupgrade_path, "%s\\mysql_upgrade.exe", bindir);
+
+  char *paths[]= {mysqld_path, mysqladmin_path, mysqlupgrade_path};
+  for(int i= 0; i< 3;i++)
+  {
+    if(GetFileAttributes(paths[i]) == INVALID_FILE_ATTRIBUTES)
+      die("File %s does not exist", paths[i]);
+  }
+
+  /*
+    Messages written on stdout should not be buffered,  GUI upgrade program 
+    reads them from pipe and uses as progress indicator.
+  */
+  setvbuf(stdout, NULL, _IONBF, 0);
+
+  log("Phase 1/8: Changing service configuration");
+  change_service_config();
+
+  log("Phase 2/8: Stopping service");
+  stop_mysqld_service();
+
+  /* 
+    Start mysqld.exe as non-service skipping privileges (so we do not 
+    care about the password). But disable networking and enable pipe 
+    for communication, for security reasons.
+  */
+  char socket_param[FN_REFLEN];
+  sprintf_s(socket_param,"--socket=mysql_upgrade_service_%d", 
+    GetCurrentProcessId());
+
+  log("Phase 3/8: Starting mysqld for upgrade");
+  mysqld_process= (HANDLE)run_tool(P_NOWAIT, mysqld_path,
+    defaults_file_param, "--skip-networking",  "--skip-grant-tables", 
+    "--enable-named-pipe",  socket_param, NULL);
+
+  if (mysqld_process == INVALID_HANDLE_VALUE)
+  {
+    die("Cannot start mysqld.exe process, errno=%d", errno);
+  }
+
+  log("Phase 4/8: Waiting for startup to complete");
+  DWORD start_duration_ms= 0;
+  for(;;)
+  {
+    if (WaitForSingleObject(mysqld_process, 0) != WAIT_TIMEOUT)
+      die("mysqld.exe did not start");
+
+    /* Ping over the named pipe every 500 ms until the server answers. */
+    if (run_tool(P_WAIT, mysqladmin_path, "--protocol=pipe",
+      socket_param, "ping",  NULL) == 0)
+    {
+      break;
+    }
+    if (start_duration_ms > startup_timeout*1000)
+      die("Server did not come up in %d seconds",startup_timeout);
+    Sleep(500);
+    start_duration_ms+= 500;
+  }
+
+  log("Phase 5/8: Running mysql_upgrade");
+  int upgrade_err= (int) run_tool(P_WAIT,  mysqlupgrade_path, 
+    "--protocol=pipe", "--force",  socket_param,
+    NULL);
+
+  if (upgrade_err)
+    die("mysql_upgrade failed with error code %d\n", upgrade_err);
+
+  log("Phase 6/8: Initiating server shutdown");
+  initiate_mysqld_shutdown();
+
+  log("Phase 7/8: Waiting for shutdown to complete");
+  if (WaitForSingleObject(mysqld_process, shutdown_timeout*1000)
+      != WAIT_OBJECT_0)
+  {
+    /* Shutdown takes too long */
+    die("mysqld does not shutdown.");
+  }
+  CloseHandle(mysqld_process);
+  mysqld_process= NULL;
+
+  log("Phase 8/8: Starting service%s",
+    (initial_service_state == SERVICE_RUNNING)?"":" (skipped)");
+  if (initial_service_state == SERVICE_RUNNING)
+  {
+    StartService(service, NULL, NULL);
+  }
+
+  log("Service '%s' successfully upgraded.\nLog file is written to %s",
+    opt_service, logfile_path);
+  CloseServiceHandle(service);
+  CloseServiceHandle(scm);
+  if (logfile_handle)
+    CloseHandle(logfile_handle);
+  my_end(0);
+  exit(0);
+}
+\ No newline at end of file
diff --git a/sql/mysqld.cc b/sql/mysqld.cc
index 6f5d3466472..0df4a655191 100644
--- a/sql/mysqld.cc
+++ b/sql/mysqld.cc
@@ -298,14 +298,15 @@ static my_bool opt_autocommit; ///< for --autocommit command-line option
 /*
   Used with --help for detailed option
 */
-static my_bool opt_help= 0, opt_verbose= 0;
+static my_bool opt_verbose= 0;
 
-arg_cmp_func Arg_comparator::comparator_matrix[5][2] =
+arg_cmp_func Arg_comparator::comparator_matrix[6][2] =
 {{&Arg_comparator::compare_string,     &Arg_comparator::compare_e_string},
  {&Arg_comparator::compare_real,       &Arg_comparator::compare_e_real},
  {&Arg_comparator::compare_int_signed, &Arg_comparator::compare_e_int},
  {&Arg_comparator::compare_row,        &Arg_comparator::compare_e_row},
- {&Arg_comparator::compare_decimal,    &Arg_comparator::compare_e_decimal}};
+ {&Arg_comparator::compare_decimal,    &Arg_comparator::compare_e_decimal},
+ {&Arg_comparator::compare_datetime,   &Arg_comparator::compare_e_datetime}};
 
 /* static variables */
 
@@ -332,6 +333,8 @@ static PSI_rwlock_key key_rwlock_openssl;
 #endif
 #endif /* HAVE_PSI_INTERFACE */
 
+#undef SAFEMALLOC
+
 /* the default log output is log tables */
 static bool lower_case_table_names_used= 0;
 static bool max_long_data_size_used= false;
@@ -360,9 +363,9 @@ static mysql_cond_t COND_thread_cache, COND_flush_thread_cache;
 /* Global variables */
 
 bool opt_bin_log, opt_ignore_builtin_innodb= 0;
-my_bool opt_log, opt_slow_log;
+my_bool opt_log, opt_slow_log, debug_assert_if_crashed_table= 0, opt_help= 0;
 ulonglong log_output_options;
-my_bool opt_userstat_running;
+my_bool opt_userstat_running, opt_thread_alarm;
 my_bool opt_log_queries_not_using_indexes= 0;
 bool opt_error_log= IF_WIN(1,0);
 bool opt_disable_networking=0, opt_skip_show_db=0;
@@ -374,6 +377,7 @@ my_bool locked_in_memory;
 bool opt_using_transactions;
 bool volatile abort_loop;
 bool volatile shutdown_in_progress;
+uint volatile global_disable_checkpoint;
 /*
   True if the bootstrap thread is running. Protected by LOCK_thread_count,
   just like thread_count.
@@ -405,14 +409,9 @@ my_bool opt_local_infile, opt_slave_compressed_protocol;
 my_bool opt_safe_user_create = 0;
 my_bool opt_show_slave_auth_info;
 my_bool opt_log_slave_updates= 0;
+my_bool opt_replicate_annotate_rows_events= 0;
 char *opt_slave_skip_errors;
 
-/**
-  compatibility option:
-    - index usage hints (USE INDEX without a FOR clause) behave as in 5.0
-*/
-my_bool old_mode;
-
 /*
   Legacy global handlerton. These will be removed (please do not add more).
 */
@@ -428,6 +427,7 @@ my_bool opt_secure_auth= 0;
 char* opt_secure_file_priv;
 my_bool opt_log_slow_admin_statements= 0;
 my_bool opt_log_slow_slave_statements= 0;
+my_bool opt_query_cache_strip_comments = 0;
 my_bool lower_case_file_system= 0;
 my_bool opt_large_pages= 0;
 my_bool opt_super_large_pages= 0;
@@ -447,6 +447,8 @@ my_bool opt_noacl;
 my_bool sp_automatic_privileges= 1;
 
 ulong opt_binlog_rows_event_max_size;
+my_bool opt_master_verify_checksum= 0;
+my_bool opt_slave_sql_verify_checksum= 1;
 const char *binlog_format_names[]= {"MIXED", "STATEMENT", "ROW", NullS};
 #ifdef HAVE_INITGROUPS
 static bool calling_initgroups= FALSE; /**< Used in SIGSEGV handler. */
@@ -490,6 +492,7 @@ ulong binlog_stmt_cache_use= 0, binlog_stmt_cache_disk_use= 0;
 ulong max_connections, max_connect_errors;
 ulong extra_max_connections;
 ulonglong denied_connections;
+my_decimal decimal_zero;
 
 /*
   Maximum length of parameter value which can be set through
@@ -557,9 +560,8 @@ const double log_10[] = {
 time_t server_start_time, flush_status_time;
 
 char mysql_home[FN_REFLEN], pidfile_name[FN_REFLEN], system_time_zone[30];
-char default_logfile_name[FN_REFLEN];
 char *default_tz_name;
-char log_error_file[FN_REFLEN], glob_hostname[FN_REFLEN];
+char log_error_file[FN_REFLEN], glob_hostname[FN_REFLEN], *opt_log_basename;
 char mysql_real_data_home[FN_REFLEN],
      lc_messages_dir[FN_REFLEN], reg_ext[FN_EXTLEN],
      mysql_charsets_dir[FN_REFLEN],
@@ -587,7 +589,6 @@ const char *in_left_expr_name= "<left expr>";
 const char *in_additional_cond= "<IN COND>";
 const char *in_having_cond= "<IN HAVING>";
 
-my_decimal decimal_zero;
 /* classes for comparation parsing/processing */
 Eq_creator eq_creator;
 Ne_creator ne_creator;
@@ -629,7 +630,7 @@ pthread_key(MEM_ROOT**,THR_MALLOC);
 pthread_key(THD*, THR_THD);
 mysql_mutex_t LOCK_thread_count;
 mysql_mutex_t
-  LOCK_status, LOCK_error_log, LOCK_uuid_generator,
+  LOCK_status, LOCK_error_log, LOCK_short_uuid_generator,
   LOCK_delayed_insert, LOCK_delayed_status, LOCK_delayed_create,
   LOCK_crypt,
   LOCK_global_system_variables,
@@ -673,10 +674,11 @@ char *opt_logname, *opt_slow_logname;
 /* Static variables */
 
 static bool kill_in_progress, segfaulted;
+static my_bool opt_stack_trace;
 static my_bool opt_bootstrap, opt_myisam_log;
 static int cleanup_done;
 static ulong opt_specialflag;
-static char *opt_update_logname, *opt_binlog_index_name;
+static char *opt_binlog_index_name;
 char *mysql_home_ptr, *pidfile_name_ptr;
 /** Initial command line arguments (count), after load_defaults().*/
 static int defaults_argc;
@@ -727,7 +729,10 @@ PSI_mutex_key key_RELAYLOG_LOCK_index;
 
 PSI_mutex_key key_LOCK_stats,
   key_LOCK_global_user_client_stats, key_LOCK_global_table_stats,
-  key_LOCK_global_index_stats;
+  key_LOCK_global_index_stats,
+  key_LOCK_wakeup_ready;
+
+PSI_mutex_key key_LOCK_prepare_ordered, key_LOCK_commit_ordered;
 
 static PSI_mutex_info all_server_mutexes[]=
 {
@@ -767,9 +772,10 @@ static PSI_mutex_info all_server_mutexes[]=
   { &key_LOCK_global_user_client_stats, "LOCK_global_user_client_stats", PSI_FLAG_GLOBAL},
   { &key_LOCK_global_table_stats, "LOCK_global_table_stats", PSI_FLAG_GLOBAL},
   { &key_LOCK_global_index_stats, "LOCK_global_index_stats", PSI_FLAG_GLOBAL},
+  { &key_LOCK_wakeup_ready, "THD::LOCK_wakeup_ready", 0},
   { &key_LOCK_thd_data, "THD::LOCK_thd_data", 0},
   { &key_LOCK_user_conn, "LOCK_user_conn", PSI_FLAG_GLOBAL},
-  { &key_LOCK_uuid_short_generator, "LOCK_uuid_generator", PSI_FLAG_GLOBAL},
+  { &key_LOCK_uuid_short_generator, "LOCK_uuid_short_generator", PSI_FLAG_GLOBAL},
   { &key_LOG_LOCK_log, "LOG::LOCK_log", 0},
   { &key_master_info_data_lock, "Master_info::data_lock", 0},
   { &key_master_info_run_lock, "Master_info::run_lock", 0},
@@ -780,6 +786,8 @@ static PSI_mutex_info all_server_mutexes[]=
   { &key_structure_guard_mutex, "Query_cache::structure_guard_mutex", 0},
   { &key_TABLE_SHARE_LOCK_ha_data, "TABLE_SHARE::LOCK_ha_data", 0},
   { &key_LOCK_error_messages, "LOCK_error_messages", PSI_FLAG_GLOBAL},
+  { &key_LOCK_prepare_ordered, "LOCK_prepare_ordered", PSI_FLAG_GLOBAL},
+  { &key_LOCK_commit_ordered, "LOCK_commit_ordered", PSI_FLAG_GLOBAL},
   { &key_LOG_INFO_lock, "LOG_INFO::lock", 0},
   { &key_LOCK_thread_count, "LOCK_thread_count", PSI_FLAG_GLOBAL},
   { &key_PARTITION_LOCK_auto_inc, "HA_DATA_PARTITION::LOCK_auto_inc", 0}
@@ -803,7 +811,7 @@ static PSI_rwlock_info all_server_rwlocks[]=
 };
 
 #ifdef HAVE_MMAP
-PSI_cond_key key_PAGE_cond, key_COND_active, key_COND_pool;
+PSI_cond_key key_PAGE_cond, key_COND_active, key_COND_pool, key_COND_queue_busy;
 #endif /* HAVE_MMAP */
 
 PSI_cond_key key_BINLOG_COND_prep_xids, key_BINLOG_update_cond,
@@ -816,7 +824,7 @@ PSI_cond_key key_BINLOG_COND_prep_xids, key_BINLOG_update_cond,
   key_relay_log_info_start_cond, key_relay_log_info_stop_cond,
   key_TABLE_SHARE_cond, key_user_level_lock_cond,
   key_COND_thread_count, key_COND_thread_cache, key_COND_flush_thread_cache;
-PSI_cond_key key_RELAYLOG_update_cond;
+PSI_cond_key key_RELAYLOG_update_cond, key_COND_wakeup_ready;
 
 static PSI_cond_info all_server_conds[]=
 {
@@ -831,6 +839,7 @@ static PSI_cond_info all_server_conds[]=
   { &key_BINLOG_COND_prep_xids, "MYSQL_BIN_LOG::COND_prep_xids", 0},
   { &key_BINLOG_update_cond, "MYSQL_BIN_LOG::update_cond", 0},
   { &key_RELAYLOG_update_cond, "MYSQL_RELAY_LOG::update_cond", 0},
+  { &key_COND_wakeup_ready, "THD::COND_wakeup_ready", 0},
   { &key_COND_cache_status_changed, "Query_cache::COND_cache_status_changed", 0},
   { &key_COND_manager, "COND_manager", PSI_FLAG_GLOBAL},
   { &key_COND_rpl_status, "COND_rpl_status", PSI_FLAG_GLOBAL},
@@ -1242,7 +1251,6 @@ static void clean_up(bool print_message);
 static int test_if_case_insensitive(const char *dir_name);
 
 #ifndef EMBEDDED_LIBRARY
-static void register_mutex_order();
 static void usage(void);
 static void start_signal_handler(void);
 static void close_server_sock();
@@ -1384,9 +1392,19 @@ static void close_connections(void)
       mysql_mutex_lock(&tmp->mysys_var->mutex);
       if (tmp->mysys_var->current_cond)
       {
-        mysql_mutex_lock(tmp->mysys_var->current_mutex);
-        mysql_cond_broadcast(tmp->mysys_var->current_cond);
-        mysql_mutex_unlock(tmp->mysys_var->current_mutex);
+        uint i;
+        for (i=0; i < 2; i++)
+        {
+          int ret= mysql_mutex_trylock(tmp->mysys_var->current_mutex);
+          mysql_cond_broadcast(tmp->mysys_var->current_cond);
+          if (!ret)
+          {
+            /* Thread has surely got the signal, unlock and abort */
+            mysql_mutex_unlock(tmp->mysys_var->current_mutex);
+            break;
+          }
+          sleep(1);
+        }
       }
       mysql_mutex_unlock(&tmp->mysys_var->mutex);
     }
@@ -1397,8 +1415,9 @@ static void close_connections(void)
   Events::deinit();
   end_slave();
 
-  if (thread_count)
-    sleep(2);					// Give threads time to die
+  /* Give threads time to die. */
+  for (int i= 0; thread_count && i < 100; i++)
+    my_sleep(20000);
 
   /*
     Force remaining threads to die by closing the connection to the client
@@ -1770,7 +1789,6 @@ void clean_up(bool print_message)
   if (defaults_argv)
     free_defaults(defaults_argv);
   free_tmpdir(&mysql_tmpdir_list);
-  my_free(opt_bin_logname);
   bitmap_free(&temp_pool);
   free_max_user_conn();
   free_global_user_stats();
@@ -1780,6 +1798,7 @@ void clean_up(bool print_message)
 #ifdef HAVE_REPLICATION
   end_slave_list();
 #endif
+  my_uuid_end();
   delete binlog_filter;
   delete rpl_filter;
   end_ssl();
@@ -1842,7 +1861,7 @@ static void wait_for_signal_thread_to_end()
     my_sleep(100);				// Give it time to die
   }
 }
-
+#endif /*EMBEDDED_LIBRARY*/
 
 static void clean_up_mutexes()
 {
@@ -1867,18 +1886,18 @@ static void clean_up_mutexes()
   for (int i= 0; i < CRYPTO_num_locks(); ++i)
     mysql_rwlock_destroy(&openssl_stdlocks[i].lock);
   OPENSSL_free(openssl_stdlocks);
-#endif
-#endif
+#endif /* HAVE_YASSL */
+#endif /* HAVE_OPENSSL */
 #ifdef HAVE_REPLICATION
   mysql_mutex_destroy(&LOCK_rpl_status);
   mysql_cond_destroy(&COND_rpl_status);
-#endif
+#endif /* HAVE_REPLICATION */
   mysql_mutex_destroy(&LOCK_active_mi);
   mysql_rwlock_destroy(&LOCK_sys_init_connect);
   mysql_rwlock_destroy(&LOCK_sys_init_slave);
   mysql_mutex_destroy(&LOCK_global_system_variables);
   mysql_rwlock_destroy(&LOCK_system_variables_hash);
-  mysql_mutex_destroy(&LOCK_uuid_generator);
+  mysql_mutex_destroy(&LOCK_short_uuid_generator);
   mysql_mutex_destroy(&LOCK_prepared_stmt_count);
   mysql_mutex_destroy(&LOCK_error_messages);
   mysql_cond_destroy(&COND_thread_count);
@@ -1887,37 +1906,12 @@ static void clean_up_mutexes()
   mysql_cond_destroy(&COND_manager);
   mysql_mutex_destroy(&LOCK_server_started);
   mysql_cond_destroy(&COND_server_started);
+  mysql_mutex_destroy(&LOCK_prepare_ordered);
+  mysql_mutex_destroy(&LOCK_commit_ordered);
   DBUG_VOID_RETURN;
 }
 
 
-/**
-   Register order of mutex for wrong mutex deadlock detector
-
-   By aquiring all mutex in order here, the mutex order detector in
-   mysys/thr_mutex.c, will give a warning on first wrong mutex usage!
-*/
-
-#ifdef SAFE_MUTEX
-#define always_in_that_order(A,B)               \
-  mysql_mutex_lock(A); mysql_mutex_lock(B);     \
-  mysql_mutex_unlock(B); mysql_mutex_unlock(A)
-#else
-#define always_in_that_order(A,B)
-#endif
-
-static void register_mutex_order()
-{
-  /*
-    We must have LOCK_open before LOCK_global_system_variables because
-    LOCK_open is hold while sql_plugin.c::intern_sys_var_ptr() is called.
-  */
-  always_in_that_order(&LOCK_open, &LOCK_global_system_variables);
-}
-#undef always_in_that_order
-
-#endif /*EMBEDDED_LIBRARY*/
-
 /****************************************************************************
 ** Init IP and UNIX socket
 ****************************************************************************/
@@ -2180,9 +2174,12 @@ static my_socket activate_tcp_port(uint port)
   freeaddrinfo(ai);
   if (ret < 0)
   {
-    DBUG_PRINT("error",("Got error: %d from bind",socket_errno));
-    sql_perror("Can't start server: Bind on TCP/IP port");
-    sql_print_error("Do you already have another mysqld server running on port: %d ?",port);
+    char buff[100];
+    sprintf(buff, "Can't start server: Bind on TCP/IP port. Got error: %d",
+            (int) socket_errno);
+    sql_perror(buff);
+    sql_print_error("Do you already have another mysqld server running on "
+                    "port: %u ?", port);
     unireg_abort(1);
   }
   if (listen(ip_sock,(int) back_log) < 0)
@@ -2499,7 +2496,7 @@ static bool cache_thread()
         this thread for handling of new THD object/connection.
       */
       thd->mysys_var->abort= 0;
-      thd->thr_create_utime= my_micro_time();
+      thd->thr_create_utime= microsecond_interval_timer();
       threads.append(thd);
       return(1);
     }
@@ -2824,7 +2821,7 @@ the thread stack. Please read http://dev.mysql.com/doc/mysql/en/linux.html\n\n",
 
 #ifdef HAVE_STACKTRACE
 
-  if (!(test_flags & TEST_NO_STACKTRACE))
+  if (opt_stack_trace)
   {
     fprintf(stderr, "Thread pointer: 0x%lx\n", (long) thd);
     fprintf(stderr, "Attempting backtrace. You can use the following "
@@ -2857,10 +2854,21 @@ the thread stack. Please read http://dev.mysql.com/doc/mysql/en/linux.html\n\n",
     fprintf(stderr, "\nTrying to get some variables.\n"
                     "Some pointers may be invalid and cause the dump to abort.\n");
     fprintf(stderr, "Query (%p): ", thd->query());
-    my_safe_print_str(thd->query(), min(1024, thd->query_length()));
+    my_safe_print_str(thd->query(), min(65536,thd->query_length()));
     fprintf(stderr, "\nConnection ID (thread ID): %lu\n", (ulong) thd->thread_id);
     fprintf(stderr, "Status: %s\n", kreason);
-    fputc('\n', stderr);
+    fprintf(stderr, "Optimizer switch: ");
+
+    extern const char *optimizer_switch_names[];
+    ulonglong optsw= global_system_variables.optimizer_switch;
+    for (uint i= 0; optimizer_switch_names[i+1]; i++, optsw >>= 1)
+    {
+      if (i)
+        fputc(',', stderr);
+      fprintf(stderr, "%s=%s",
+              optimizer_switch_names[i], optsw & 1 ? "on" : "off");
+    }
+    fprintf(stderr, "\n\n");
   }
   fprintf(stderr, "\
 The manual page at http://dev.mysql.com/doc/mysql/en/crashing.html contains\n\
@@ -2936,7 +2944,7 @@ static void init_signals(void)
 
   my_sigset(THR_SERVER_ALARM,print_signal_warning); // Should never be called!
 
-  if (!(test_flags & TEST_NO_STACKTRACE) || (test_flags & TEST_CORE_ON_SIGNAL))
+  if (opt_stack_trace || (test_flags & TEST_CORE_ON_SIGNAL))
   {
     sa.sa_flags = SA_RESETHAND | SA_NODEFER;
     sigemptyset(&sa.sa_mask);
@@ -3282,6 +3290,7 @@ const char *load_default_groups[]= {
 #endif
 "mysqld", "server", MYSQL_BASE_VERSION,
 "mariadb", MARIADB_BASE_VERSION,
+"client-server",
 0, 0};
 
 #if defined(__WIN__) && !defined(EMBEDDED_LIBRARY)
@@ -3486,23 +3495,11 @@ SHOW_VAR com_status_vars[]= {
   {NullS, NullS, SHOW_LONG}
 };
 
-/**
-  Create the name of the default general log file
-  
-  @param[IN] buff    Location for building new string.
-  @param[IN] log_ext The extension for the file (e.g .log)
-  @returns Pointer to a new string containing the name
-*/
-static inline char *make_default_log_name(char *buff,const char* log_ext)
-{
-  return make_log_name(buff, default_logfile_name, log_ext);
-}
-
 static int init_common_variables()
 {
-  char buff[FN_REFLEN];
   umask(((~my_umask) & 0666));
   my_decimal_set_zero(&decimal_zero); // set decimal_zero constant;
+
   tzset();			// Set tzname
 
   max_system_variables.pseudo_thread_id= (ulong)~0;
@@ -3566,17 +3563,23 @@ static int init_common_variables()
 
   if (gethostname(glob_hostname,sizeof(glob_hostname)) < 0)
   {
+    /*
+      Get hostname of computer (used by 'show variables') and as default
+      basename for the pid file if --log-basename is not given.
+    */
     strmake(glob_hostname, STRING_WITH_LEN("localhost"));
     sql_print_warning("gethostname failed, using '%s' as hostname",
-                      glob_hostname);
-    strmake(default_logfile_name, STRING_WITH_LEN("mysql"));
+                        glob_hostname);
+    opt_log_basename= const_cast<char *>("mysql");
   }
   else
-    strmake(default_logfile_name, glob_hostname, 
-	    sizeof(default_logfile_name)-5);
+    opt_log_basename= glob_hostname;
 
-  strmake(pidfile_name, default_logfile_name, sizeof(pidfile_name)-5);
-  strmov(fn_ext(pidfile_name),".pid");		// Add proper extension
+  if (!*pidfile_name)
+  {
+    strmake(pidfile_name, opt_log_basename, sizeof(pidfile_name)-5);
+    strmov(fn_ext(pidfile_name),".pid");		// Add proper extension
+  }
 
   /*
     The default-storage-engine entry in my_long_options should have a
@@ -3837,6 +3840,7 @@ static int init_common_variables()
   global_system_variables.collation_connection=  default_charset_info;
   global_system_variables.character_set_results= default_charset_info;
   global_system_variables.character_set_client=  default_charset_info;
+
   if (!(character_set_filesystem=
         get_charset_by_csname(character_set_filesystem_name,
                               MY_CS_PRIMARY, MYF(MY_WME))))
@@ -3864,16 +3868,10 @@ static int init_common_variables()
                       "--log-slow-queries option, log tables are used. "
                       "To enable logging to files use the --log-output=file option.");
 
-#define FIX_LOG_VAR(VAR, ALT)                                   \
-  if (!VAR || !*VAR)                                            \
-  {                                                             \
-    my_free(VAR); /* it could be an allocated empty string "" */ \
-    VAR= my_strdup(ALT, MYF(0));                                \
-  }
-  FIX_LOG_VAR(opt_logname,
-              make_default_log_name(buff, ".log"));
-  FIX_LOG_VAR(opt_slow_logname,
-              make_default_log_name(buff, "-slow.log"));
+  if (!opt_logname || !*opt_logname)
+    make_default_log_name(&opt_logname, ".log", false);
+  if (!opt_slow_logname || !*opt_slow_logname)
+    make_default_log_name(&opt_slow_logname, "-slow.log", false);
 
 #if defined(ENABLED_DEBUG_SYNC)
   /* Initialize the debug sync facility. See debug_sync.cc. */
@@ -3918,8 +3916,7 @@ You should consider changing lower_case_table_names to 1 or 2",
     }
   }
   else if (lower_case_table_names == 2 &&
-           !(lower_case_file_system=
-             (test_if_case_insensitive(mysql_real_data_home) == 1)))
+           !(lower_case_file_system= (lower_case_file_system == 1)))
   {
     if (global_system_variables.log_warnings)
       sql_print_warning("lower_case_table_names was set to 2, even though your "
@@ -3930,8 +3927,7 @@ You should consider changing lower_case_table_names to 1 or 2",
   }
   else
   {
-    lower_case_file_system=
-      (test_if_case_insensitive(mysql_real_data_home) == 1);
+    lower_case_file_system= (lower_case_file_system == 1);
   }
 
   /* Reset table_alias_charset, now that lower_case_table_names is set. */
@@ -3967,7 +3963,7 @@ static int init_thread_environment()
   mysql_mutex_init(key_LOCK_error_messages,
                    &LOCK_error_messages, MY_MUTEX_INIT_FAST);
   mysql_mutex_init(key_LOCK_uuid_short_generator,
-                   &LOCK_uuid_generator, MY_MUTEX_INIT_FAST);
+                   &LOCK_short_uuid_generator, MY_MUTEX_INIT_FAST);
   mysql_mutex_init(key_LOCK_connection_count,
                    &LOCK_connection_count, MY_MUTEX_INIT_FAST);
   mysql_mutex_init(key_LOCK_stats, &LOCK_stats, MY_MUTEX_INIT_FAST);
@@ -3977,6 +3973,10 @@ static int init_thread_environment()
                    &LOCK_global_table_stats, MY_MUTEX_INIT_FAST);
   mysql_mutex_init(key_LOCK_global_index_stats,
                    &LOCK_global_index_stats, MY_MUTEX_INIT_FAST);
+  mysql_mutex_init(key_LOCK_prepare_ordered, &LOCK_prepare_ordered,
+                   MY_MUTEX_INIT_SLOW);
+  mysql_mutex_init(key_LOCK_commit_ordered, &LOCK_commit_ordered,
+                   MY_MUTEX_INIT_SLOW);
 
 #ifdef HAVE_OPENSSL
   mysql_mutex_init(key_LOCK_des_key_file,
@@ -4143,6 +4143,32 @@ static void end_ssl()
 #endif /* HAVE_OPENSSL */
 }
 
+#ifdef _WIN32
+/**
+  Registers a file to be collected when Windows Error Reporting creates a
+  crash report. Silently does nothing if WerRegisterFile() cannot be resolved
+  or if the file name cannot be converted to wide characters.
+  @param file  name of the file to register (converted with mbstowcs())
+  @note WerRegisterFile() is not available before Windows Vista.
+*/
+#include <werapi.h>
+static void add_file_to_crash_report(char *file)
+{
+  /* Look up WerRegisterFile in kernel32 at run time; it may be absent. */
+  HRESULT (WINAPI *pWerRegisterFile)(PCWSTR, WER_REGISTER_FILE_TYPE, DWORD)
+    =(HRESULT (WINAPI *) (PCWSTR, WER_REGISTER_FILE_TYPE, DWORD))
+    GetProcAddress(GetModuleHandle("kernel32"),"WerRegisterFile");
+
+  if (pWerRegisterFile)
+  {
+    wchar_t wfile[MAX_PATH+1]= {0};  /* zero-filled, so it stays terminated */
+    if (mbstowcs(wfile, file, MAX_PATH) != (size_t)-1)
+    {
+      pWerRegisterFile(wfile, WerRegFileTypeOther, WER_FILE_ANONYMOUS_DATA);
+    }
+  }
+}
+#endif
 
 static int init_server_components()
 {
@@ -4200,6 +4226,11 @@ static int init_server_components()
 
       if (!res)
         setbuf(stderr, NULL);
+
+#ifdef _WIN32
+      /* Add error log to windows crash reporting. */
+      add_file_to_crash_report(log_error_file);
+#endif
     }
   }
 
@@ -4306,8 +4337,7 @@ a file name for --log-bin-index option", opt_binlog_index_name);
     }
     if (ln == buf)
     {
-      my_free(opt_bin_logname);
-      opt_bin_logname=my_strdup(buf, MYF(0));
+      opt_bin_logname= my_once_strdup(buf, MYF(MY_WME));
     }
     if (mysql_bin_log.open_index_file(opt_binlog_index_name, ln, TRUE))
     {
@@ -4863,13 +4893,14 @@ int mysqld_main(int argc, char **argv)
 
 #ifndef DBUG_OFF
   test_lc_time_sz();
+  srand((uint) time(NULL)); 
 #endif
 
   /*
     We have enough space for fiddling with the argv, continue
   */
   check_data_home(mysql_real_data_home);
-  if (my_setwd(mysql_real_data_home,MYF(MY_WME)) && !opt_help)
+  if (my_setwd(mysql_real_data_home, opt_help ? 0 : MYF(MY_WME)) && !opt_help)
     unireg_abort(1);				/* purecov: inspected */
 
   if ((user_info= check_user(mysqld_user)))
@@ -5004,7 +5035,11 @@ int mysqld_main(int argc, char **argv)
       unireg_abort(1);
   }
 
-  register_mutex_order();
+  /*
+    We must have LOCK_open before LOCK_global_system_variables because
+    LOCK_open is held while sql_plugin.c::intern_sys_var_ptr() is called.
+  */
+  mysql_mutex_record_order(&LOCK_open, &LOCK_global_system_variables);
 
   create_shutdown_thread();
   start_handle_manager();
@@ -5377,7 +5412,7 @@ void handle_connection_in_main_thread(THD *thd)
   thread_cache_size=0;			// Safety
   threads.append(thd);
   mysql_mutex_unlock(&LOCK_thread_count);
-  thd->start_utime= my_micro_time();
+  thd->start_utime= microsecond_interval_timer();
   do_handle_one_connection(thd);
 }
 
@@ -5403,7 +5438,7 @@ void create_thread_to_handle_connection(THD *thd)
     thread_created++;
     threads.append(thd);
     DBUG_PRINT("info",(("creating thread %lu"), thd->thread_id));
-    thd->prior_thr_create_utime= thd->start_utime= my_micro_time();
+    thd->prior_thr_create_utime= thd->start_utime= microsecond_interval_timer();
     if ((error= mysql_thread_create(key_thread_one_connection,
                                     &thd->real_id, &connection_attrib,
                                     handle_one_connection,
@@ -6214,9 +6249,12 @@ struct my_option my_long_options[]=
    &disconnect_slave_event_count, &disconnect_slave_event_count,
    0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
 #endif /* HAVE_REPLICATION */
+#ifdef HAVE_STACKTRACE
+  {"stack-trace", 0 , "Print a symbolic stack trace on failure",
+   &opt_stack_trace, &opt_stack_trace, 0, GET_BOOL, NO_ARG, 1, 0, 0, 0, 0, 0},
+#endif /* HAVE_STACKTRACE */
   {"exit-info", 'T', "Used for debugging. Use at your own risk.", 0, 0, 0,
    GET_LONG, OPT_ARG, 0, 0, 0, 0, 0, 0},
-
   {"external-locking", 0, "Use system (external) locking (disabled by "
    "default).  With this option enabled you can run myisamchk to test "
    "(not repair) tables while the MySQL server is running. Disable with "
@@ -6249,11 +6287,24 @@ struct my_option my_long_options[]=
   {"log", 'l', "Log connections and queries to file (deprecated option, use "
    "--general-log/--general-log-file instead).", &opt_logname, &opt_logname,
    0, GET_STR_ALLOC, OPT_ARG, 0, 0, 0, 0, 0, 0},
+#if 0
+  {"log-basename", OPT_LOG_BASENAME,
+   "Basename for all log files and the .pid file. This sets all log file "
+   "names at once (in 'datadir') and is normally the only option you need "
+   "for specifying log files. Sets names for --log-bin, --log-bin-index, "
+   "--relay-log, --relay-log-index, --general-log-file, "
+   "--log-slow-query-log-file, --log-error-file, and --pid-file",
+   &opt_log_basename, &opt_log_basename, 0, GET_STR, REQUIRED_ARG,
+   0, 0, 0, 0, 0, 0},
+#endif
   {"log-bin", OPT_BIN_LOG,
-   "Log update queries in binary format. Optional (but strongly recommended "
-   "to avoid replication problems if server's hostname changes) argument "
-   "should be the chosen location for the binary log files.",
-   &opt_bin_logname, &opt_bin_logname, 0, GET_STR_ALLOC,
+   "Log update queries in binary format. Optional argument should be name for "
+   "binary log. If not given "
+   "datadir/'log-basename'-bin or 'datadir'/mysql-bin will be used (the later if "
+   "--log-basename is not specified). We strongly recommend to use either "
+   "--log-basename or specify a filename to ensure that replication doesn't "
+   "stop if the real hostname of the computer changes'.",
+   &opt_bin_logname, &opt_bin_logname, 0, GET_STR,
    OPT_ARG, 0, 0, 0, 0, 0, 0},
   {"log-bin-index", 0,
    "File that holds the names for last binary log files.",
@@ -6275,9 +6326,10 @@ struct my_option my_long_options[]=
   &opt_log_slow_slave_statements, &opt_log_slow_slave_statements,
   0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
   {"log-slow-queries", OPT_SLOW_QUERY_LOG,
-   "Log slow queries to a table or log file. Defaults logging to table "
-   "mysql.slow_log or hostname-slow.log if --log-output=file is used. "
-   "Must be enabled to activate other slow log options. "
+   "Enable logging of slow queries (longer than --long-query-time) to log file "
+   "or table. Optional argument is a file name for the slow log. If not given, "
+   "'log-basename'-slow.log will be used. Use --log-output=TABLE if you want "
+   "to have the log in the table 'mysql.slow_log'. "
    "Deprecated option, use --slow-query-log/--slow-query-log-file instead.",
    &opt_slow_logname, &opt_slow_logname, 0, GET_STR_ALLOC, OPT_ARG,
    0, 0, 0, 0, 0, 0},
@@ -6294,7 +6346,8 @@ struct my_option my_long_options[]=
 #endif
   {"master-info-file", 0,
    "The location and name of the file that remembers the master and where "
-   "the I/O replication thread is in the master's binlogs.",
+   "the I/O replication thread is in the master's binlogs. Defaults to "
+   "master.info",
    &master_info_file, &master_info_file, 0, GET_STR,
    REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
   {"master-retry-count", 0,
@@ -6403,9 +6456,6 @@ struct my_option my_long_options[]=
   {"skip-slave-start", 0,
    "If set, slave is not autostarted.", &opt_skip_slave_start,
    &opt_skip_slave_start, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
-  {"skip-stack-trace", OPT_SKIP_STACK_TRACE,
-   "Don't print a stack trace on failure.", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0,
-   0, 0, 0, 0},
   {"skip-thread-priority", OPT_SKIP_PRIOR,
    "Don't give threads different priorities. This option is deprecated "
    "because it has no effect; the implied behavior is already the default.",
@@ -6486,6 +6536,12 @@ struct my_option my_long_options[]=
   {"table_cache", 0, "Deprecated; use --table-open-cache instead.",
    &table_cache_size, &table_cache_size, 0, GET_ULONG,
    REQUIRED_ARG, TABLE_OPEN_CACHE_DEFAULT, 1, 512*1024L, 0, 1, 0},
+#ifndef DBUG_OFF
+  {"debug-assert-if-crashed-table", 0,
+   "Do an assert in handler::print_error() if we get a crashed table",
+   &debug_assert_if_crashed_table, &debug_assert_if_crashed_table,
+   0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+#endif
   {0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
 };
 
@@ -6968,6 +7024,8 @@ SHOW_VAR status_vars[]= {
   {"Handler_savepoint_rollback",(char*) offsetof(STATUS_VAR, ha_savepoint_rollback_count), SHOW_LONG_STATUS},
   {"Handler_update",           (char*) offsetof(STATUS_VAR, ha_update_count), SHOW_LONG_STATUS},
   {"Handler_write",            (char*) offsetof(STATUS_VAR, ha_write_count), SHOW_LONG_STATUS},
+  {"Handler_tmp_update",       (char*) offsetof(STATUS_VAR, ha_tmp_update_count), SHOW_LONG_STATUS},
+  {"Handler_tmp_write",        (char*) offsetof(STATUS_VAR, ha_tmp_write_count), SHOW_LONG_STATUS},
   {"Key",                      (char*) &show_default_keycache, SHOW_FUNC},
   {"Last_query_cost",          (char*) offsetof(STATUS_VAR, last_query_cost), SHOW_DOUBLE_STATUS},
   {"Max_used_connections",     (char*) &max_used_connections,  SHOW_LONG},
@@ -6980,8 +7038,9 @@ SHOW_VAR status_vars[]= {
   {"Opened_tables",            (char*) offsetof(STATUS_VAR, opened_tables), SHOW_LONG_STATUS},
   {"Opened_table_definitions", (char*) offsetof(STATUS_VAR, opened_shares), SHOW_LONG_STATUS},
   {"Prepared_stmt_count",      (char*) &show_prepared_stmt_count, SHOW_FUNC},
-  {"Rows_sent",                (char*) offsetof(STATUS_VAR, rows_sent), SHOW_LONG_STATUS},
-  {"Rows_read",                (char*) offsetof(STATUS_VAR, rows_read), SHOW_LONG_STATUS},
+  {"Rows_sent",                (char*) offsetof(STATUS_VAR, rows_sent), SHOW_LONGLONG_STATUS},
+  {"Rows_read",                (char*) offsetof(STATUS_VAR, rows_read), SHOW_LONGLONG_STATUS},
+  {"Rows_tmp_read",            (char*) offsetof(STATUS_VAR, rows_tmp_read), SHOW_LONGLONG_STATUS},
 #ifdef HAVE_QUERY_CACHE
   {"Qcache_free_blocks",       (char*) &query_cache.free_memory_blocks, SHOW_LONG_NOFLUSH},
   {"Qcache_free_memory",       (char*) &query_cache.free_memory, SHOW_LONG_NOFLUSH},
@@ -7205,7 +7264,8 @@ static int mysql_init_variables(void)
   opt_disable_networking= opt_skip_show_db=0;
   opt_skip_name_resolve= 0;
   opt_ignore_builtin_innodb= 0;
-  opt_logname= opt_update_logname= opt_binlog_index_name= opt_slow_logname= 0;
+  opt_logname= opt_binlog_index_name= opt_slow_logname= 0;
+  opt_log_basename= 0;
   opt_tc_log_file= (char *)"tc.log";      // no hostname in tc_log file name !
   opt_secure_auth= 0;
   opt_bootstrap= opt_myisam_log= 0;
@@ -7468,6 +7528,34 @@ mysqld_get_one_option(int optid,
   case (int) OPT_BIN_LOG:
     opt_bin_log= test(argument != disabled_my_option);
     break;
+  case (int) OPT_LOG_BASENAME:
+  {
+    if (opt_log_basename[0] == 0 || strchr(opt_log_basename, FN_EXTCHAR) ||
+        strchr(opt_log_basename,FN_LIBCHAR))
+    {
+      sql_print_error("Wrong argument for --log-basename. It can't be empty or contain '.' or '" FN_DIRSEP "'");
+      return 1;
+    }
+    if (log_error_file_ptr != disabled_my_option)
+      log_error_file_ptr= opt_log_basename;
+
+    make_default_log_name(&opt_logname, ".log", false);
+    make_default_log_name(&opt_slow_logname, "-slow.log", false);
+    make_default_log_name(&opt_bin_logname, "-bin", true);
+    make_default_log_name(&opt_binlog_index_name, "-bin.index", true);
+    make_default_log_name(&opt_relay_logname, "-relay-bin", true);
+    make_default_log_name(&opt_relaylog_index_name, "-relay-bin.index", true);
+
+    pidfile_name_ptr= pidfile_name;
+    strmake(pidfile_name, argument, sizeof(pidfile_name)-5);
+    strmov(fn_ext(pidfile_name),".pid");
+
+    /* check for errors */
+    if (!opt_bin_logname || !opt_relaylog_index_name || ! opt_logname ||
+        ! opt_slow_logname || !pidfile_name_ptr)
+      return 1;                                 // out of memory error
+    break;
+  }
 #ifdef HAVE_REPLICATION
   case (int)OPT_REPLICATE_IGNORE_DB:
   {
@@ -7587,9 +7675,6 @@ mysqld_get_one_option(int optid,
   case (int) OPT_WANT_CORE:
     test_flags |= TEST_CORE_ON_SIGNAL;
     break;
-  case (int) OPT_SKIP_STACK_TRACE:
-    test_flags|=TEST_NO_STACKTRACE;
-    break;
   case (int) OPT_BIND_ADDRESS:
     {
       struct addrinfo *res_lst, hints;    
@@ -7666,7 +7751,6 @@ mysqld_get_one_option(int optid,
     break;
   case OPT_MAX_LONG_DATA_SIZE:
     max_long_data_size_used= true;
-    WARN_DEPRECATED(NULL, 5, 6, "--max_long_data_size", "'--max_allowed_packet'");
     break;
   }
   return 0;
@@ -7852,7 +7936,8 @@ static int get_options(int *argc_ptr, char ***argv_ptr)
   if (opt_debugging)
   {
     /* Allow break with SIGINT, no core or stack trace */
-    test_flags|= TEST_SIGINT | TEST_NO_STACKTRACE;
+    test_flags|= TEST_SIGINT;
+    opt_stack_trace= 1;
     test_flags&= ~TEST_CORE_ON_SIGNAL;
   }
   /* Set global MyISAM variables from delay_key_write_options */
@@ -7875,6 +7960,7 @@ static int get_options(int *argc_ptr, char ***argv_ptr)
   */
   my_disable_locking= myisam_single_user= test(opt_external_locking == 0);
   my_disable_sync= opt_sync == 0;
+  my_disable_thr_alarm= opt_thread_alarm == 0;
   my_default_record_cache_size=global_system_variables.read_buff_size;
 
   global_system_variables.long_query_time= (ulonglong)
@@ -8046,6 +8132,7 @@ static int fix_paths(void)
   (void) my_load_path(opt_plugin_dir, opt_plugin_dir_ptr ? opt_plugin_dir_ptr :
                                       get_relative_path(PLUGINDIR), mysql_home);
   opt_plugin_dir_ptr= opt_plugin_dir;
+  pidfile_name_ptr= pidfile_name;
 
   my_realpath(mysql_unpacked_real_data_home, mysql_real_data_home, MYF(0));
   mysql_unpacked_real_data_home_len= 
@@ -8134,7 +8221,8 @@ static int test_if_case_insensitive(const char *dir_name)
   if ((file= mysql_file_create(key_file_casetest,
                                buff, 0666, O_RDWR, MYF(0))) < 0)
   {
-    sql_print_warning("Can't create test file %s", buff);
+    if (!opt_help)
+      sql_print_warning("Can't create test file %s", buff);
     DBUG_RETURN(-1);
   }
   mysql_file_close(file, MYF(0));
diff --git a/sql/mysqld.h b/sql/mysqld.h
index de2db372327..281082e2d7e 100644
--- a/sql/mysqld.h
+++ b/sql/mysqld.h
@@ -204,7 +204,6 @@ extern int bootstrap_error;
 extern I_List<THD> threads;
 extern char err_shared_dir[];
 extern TYPELIB thread_handling_typelib;
-extern my_decimal decimal_zero;
 
 /*
   THR_MALLOC is a key which will be used to set/get MEM_ROOT** for a thread,
@@ -241,14 +240,15 @@ extern PSI_mutex_key key_RELAYLOG_LOCK_index;
 
 extern PSI_mutex_key key_LOCK_stats,
   key_LOCK_global_user_client_stats, key_LOCK_global_table_stats,
-  key_LOCK_global_index_stats;
+  key_LOCK_global_index_stats, key_LOCK_wakeup_ready;
 
 extern PSI_rwlock_key key_rwlock_LOCK_grant, key_rwlock_LOCK_logger,
   key_rwlock_LOCK_sys_init_connect, key_rwlock_LOCK_sys_init_slave,
   key_rwlock_LOCK_system_variables_hash, key_rwlock_query_cache_query_lock;
 
 #ifdef HAVE_MMAP
-extern PSI_cond_key key_PAGE_cond, key_COND_active, key_COND_pool;
+extern PSI_cond_key key_PAGE_cond, key_COND_active, key_COND_pool,
+                    key_COND_queue_busy;
 #endif /* HAVE_MMAP */
 
 extern PSI_cond_key key_BINLOG_COND_prep_xids, key_BINLOG_update_cond,
@@ -261,7 +261,7 @@ extern PSI_cond_key key_BINLOG_COND_prep_xids, key_BINLOG_update_cond,
   key_relay_log_info_start_cond, key_relay_log_info_stop_cond,
   key_TABLE_SHARE_cond, key_user_level_lock_cond,
   key_COND_thread_count, key_COND_thread_cache, key_COND_flush_thread_cache;
-extern PSI_cond_key key_RELAYLOG_update_cond;
+extern PSI_cond_key key_RELAYLOG_update_cond, key_COND_wakeup_ready;
 
 extern PSI_thread_key key_thread_bootstrap, key_thread_delayed_insert,
   key_thread_handle_manager, key_thread_kill_server, key_thread_main,
@@ -321,7 +321,7 @@ extern MYSQL_PLUGIN_IMPORT key_map key_map_full;          /* Should be threaded
  */
 extern mysql_mutex_t
        LOCK_user_locks, LOCK_status,
-       LOCK_error_log, LOCK_delayed_insert, LOCK_uuid_generator,
+       LOCK_error_log, LOCK_delayed_insert, LOCK_short_uuid_generator,
        LOCK_delayed_status, LOCK_delayed_create, LOCK_crypt, LOCK_timezone,
        LOCK_slave_list, LOCK_active_mi, LOCK_manager,
        LOCK_global_system_variables, LOCK_user_conn,
@@ -358,6 +358,7 @@ enum options_mysqld
   OPT_BINLOG_FORMAT,
   OPT_BINLOG_IGNORE_DB,
   OPT_BIN_LOG,
+  OPT_LOG_BASENAME,
   OPT_BOOTSTRAP,
   OPT_CONSOLE,
   OPT_DEBUG_SYNC_TIMEOUT,
@@ -411,7 +412,9 @@ enum enum_query_type
   /// In utf8.
   QT_TO_SYSTEM_CHARSET= (1 << 0),
   /// Without character set introducers.
-  QT_WITHOUT_INTRODUCERS= (1 << 1)
+  QT_WITHOUT_INTRODUCERS= (1 << 1),
+  /// view internal representation (like QT_ORDINARY except ORDER BY clause)
+  QT_VIEW_INTERNAL= (1 << 2)
 };
 
 /* query_id */
@@ -519,10 +522,20 @@ inline THD *_current_thd(void)
 extern handlerton *maria_hton;
 
 extern uint extra_connection_count;
-extern my_bool opt_userstat_running;
+extern my_bool opt_userstat_running, debug_assert_if_crashed_table;
 extern uint mysqld_extra_port;
+extern ulong opt_progress_report_time;
 extern ulong extra_max_connections;
 extern ulonglong denied_connections;
 extern ulong thread_created;
 extern scheduler_functions *thread_scheduler, *extra_thread_scheduler;
+extern char *opt_log_basename;
+extern my_bool opt_master_verify_checksum;
+extern my_bool opt_slave_sql_verify_checksum;
+extern ulong binlog_checksum_options;
+
+extern uint volatile global_disable_checkpoint;
+extern my_bool opt_help, opt_thread_alarm;
+extern my_bool opt_query_cache_strip_comments;
+
 #endif /* MYSQLD_INCLUDED */
diff --git a/sql/net_serv.cc b/sql/net_serv.cc
index c8a839571ee..f8a55868113 100644
--- a/sql/net_serv.cc
+++ b/sql/net_serv.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2011, Oracle and/or its affiliates.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -125,7 +125,7 @@ my_bool my_net_init(NET *net, Vio* vio)
   net->vio = vio;
   my_net_local_init(net);			/* Set some limits */
   if (!(net->buff=(uchar*) my_malloc((size_t) net->max_packet+
-				     NET_HEADER_SIZE + COMP_HEADER_SIZE,
+				     NET_HEADER_SIZE + COMP_HEADER_SIZE +1,
 				     MYF(MY_WME))))
     DBUG_RETURN(1);
   net->buff_end=net->buff+net->max_packet;
@@ -602,7 +602,7 @@ net_real_write(NET *net,const uchar *packet, size_t len)
     uchar *b;
     uint header_length=NET_HEADER_SIZE+COMP_HEADER_SIZE;
     if (!(b= (uchar*) my_malloc(len + NET_HEADER_SIZE +
-                                COMP_HEADER_SIZE, MYF(MY_WME))))
+                                COMP_HEADER_SIZE + 1, MYF(MY_WME))))
     {
       net->error= 2;
       net->last_errno= ER_OUT_OF_RESOURCES;
@@ -701,7 +701,8 @@ net_real_write(NET *net,const uchar *packet, size_t len)
   {
     my_bool old_mode;
     thr_end_alarm(&alarmed);
-    vio_blocking(net->vio, net_blocking, &old_mode);
+    if (!net_blocking)
+      vio_blocking(net->vio, net_blocking, &old_mode);
   }
   net->reading_or_writing=0;
   DBUG_RETURN(((int) (pos != end)));
@@ -982,7 +983,8 @@ end:
   {
     my_bool old_mode;
     thr_end_alarm(&alarmed);
-    vio_blocking(net->vio, net_blocking, &old_mode);
+    if (!net_blocking)
+      vio_blocking(net->vio, net_blocking, &old_mode);
   }
   net->reading_or_writing=0;
 #ifdef DEBUG_DATA_PACKETS
diff --git a/sql/opt_index_cond_pushdown.cc b/sql/opt_index_cond_pushdown.cc
index 572f314162a..cee96d88438 100644
--- a/sql/opt_index_cond_pushdown.cc
+++ b/sql/opt_index_cond_pushdown.cc
@@ -26,7 +26,7 @@
     FALSE  No
 */
 
-bool uses_index_fields_only(Item *item, TABLE *tbl, uint keyno, 
+bool uses_index_fields_only(Item *item, TABLE *tbl, uint keyno,
                             bool other_tbls_ok)
 {
   if (item->const_item())
@@ -155,7 +155,11 @@ Item *make_cond_for_index(Item *cond, TABLE *table, uint keyno,
 	  new_cond->argument_list()->push_back(fix);
           used_tables|= fix->used_tables();
         }
-        n_marked += test(item->marker == ICP_COND_USES_INDEX_ONLY);
+        if (test(item->marker == ICP_COND_USES_INDEX_ONLY))
+        {
+          n_marked++;
+          item->marker= 0;
+        } 
       }
       if (n_marked ==((Item_cond*)cond)->argument_list()->elements)
         cond->marker= ICP_COND_USES_INDEX_ONLY;
@@ -184,7 +188,11 @@ Item *make_cond_for_index(Item *cond, TABLE *table, uint keyno,
 	if (!fix)
 	  return (COND*) 0;
 	new_cond->argument_list()->push_back(fix);
-        n_marked += test(item->marker == ICP_COND_USES_INDEX_ONLY);
+        if (test(item->marker == ICP_COND_USES_INDEX_ONLY))
+        {
+          n_marked++;
+          item->marker= 0;
+        } 
       }
       if (n_marked ==((Item_cond*)cond)->argument_list()->elements)
         cond->marker= ICP_COND_USES_INDEX_ONLY;
@@ -271,13 +279,12 @@ Item *make_cond_remainder(Item *cond, bool exclude_index)
       tab            A join tab that has tab->table->file and its condition
                      in tab->select_cond
       keyno          Index for which extract and push the condition
-      other_tbls_ok  TRUE <=> Fields of other non-const tables are allowed
 
   DESCRIPTION
     Try to extract and push the index condition down to table handler
 */
 
-void push_index_cond(JOIN_TAB *tab, uint keyno, bool other_tbls_ok)
+void push_index_cond(JOIN_TAB *tab, uint keyno)
 {
   DBUG_ENTER("push_index_cond");
   Item *idx_cond;
@@ -310,7 +317,7 @@ void push_index_cond(JOIN_TAB *tab, uint keyno, bool other_tbls_ok)
                  print_where(tab->select_cond, "full cond", QT_ORDINARY););
 
     idx_cond= make_cond_for_index(tab->select_cond, tab->table, keyno,
-                                  other_tbls_ok);
+                                  tab->icp_other_tables_ok);
 
     DBUG_EXECUTE("where",
                  print_where(idx_cond, "idx cond", QT_ORDINARY););
@@ -329,10 +336,8 @@ void push_index_cond(JOIN_TAB *tab, uint keyno, bool other_tbls_ok)
           /*
             if cache is used then the value is TRUE only 
             for BKA[_UNIQUE] cache (see check_join_cache_usage func).
-            In this case other_tbls_ok is an equivalent of
-            cache->is_key_access().
           */
-          other_tbls_ok &&
+          tab->icp_other_tables_ok &&
           (idx_cond->used_tables() &
            ~(tab->table->map | tab->join->const_table_map)))
         tab->cache_idx_cond= idx_cond;
@@ -350,7 +355,9 @@ void push_index_cond(JOIN_TAB *tab, uint keyno, bool other_tbls_ok)
       if (idx_remainder_cond != idx_cond)
         tab->ref.disable_cache= TRUE;
 
-      Item *row_cond= make_cond_remainder(tab->select_cond, TRUE);
+      Item *row_cond= tab->idx_cond_fact_out ? 
+                        make_cond_remainder(tab->select_cond, TRUE) :
+	                tab->pre_idx_push_select_cond;
 
       DBUG_EXECUTE("where",
                    print_where(row_cond, "remainder cond", QT_ORDINARY););
@@ -378,6 +385,7 @@ void push_index_cond(JOIN_TAB *tab, uint keyno, bool other_tbls_ok)
                                  QT_ORDINARY););
 
         tab->select->cond= tab->select_cond;
+        tab->select->pre_idx_push_select_cond= tab->pre_idx_push_select_cond;
       }
     }
   }
diff --git a/sql/opt_range.cc b/sql/opt_range.cc
index e0505e2d65c..e5f544747e5 100644
--- a/sql/opt_range.cc
+++ b/sql/opt_range.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2010, Oracle and/or its affiliates.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -310,6 +310,11 @@ public:
   uint8 part;					// Which key part
   uint8 maybe_null;
   /* 
+    The ordinal number of the least significant component encountered in
+    the ranges of the SEL_ARG tree (the first component has number 1) 
+  */
+  uint16 max_part_no; 
+  /* 
     Number of children of this element in the RB-tree, plus 1 for this
     element itself.
   */
@@ -342,8 +347,9 @@ public:
   SEL_ARG(Field *field, uint8 part, uchar *min_value, uchar *max_value,
 	  uint8 min_flag, uint8 max_flag, uint8 maybe_flag);
   SEL_ARG(enum Type type_arg)
-    :min_flag(0),elements(1),use_count(1),left(0),right(0),next_key_part(0),
-    color(BLACK), type(type_arg)
+    :min_flag(0), max_part_no(0) /* first key part means 1. 0 mean 'no parts'*/, 
+     elements(1),use_count(1),left(0),right(0),
+     next_key_part(0), color(BLACK), type(type_arg)
   {}
   inline bool is_same(SEL_ARG *arg)
   {
@@ -566,6 +572,11 @@ public:
 	  pos->increment_use_count(count);
     }
   }
+  void incr_refs()
+  {
+    increment_use_count(1);
+    use_count++;
+  }
   void free_tree()
   {
     for (SEL_ARG *pos=first(); pos ; pos=pos->next)
@@ -627,7 +638,100 @@ public:
 
 class SEL_IMERGE;
 
+#define CLONE_KEY1_MAYBE 1
+#define CLONE_KEY2_MAYBE 2
+#define swap_clone_flag(A) ((A & 1) << 1) | ((A & 2) >> 1)
 
+
+/*
+  While objects of the class SEL_ARG represent ranges for indexes or
+  index infixes (including ranges for index prefixes and index suffixes),
+  objects of the class SEL_TREE represent AND/OR formulas of such ranges.
+  Currently an AND/OR formula represented by a SEL_TREE object can have
+  at most three levels: 
+
+    <SEL_TREE formula> ::= 
+      [ <SEL_RANGE_TREE formula> AND ]
+      [ <SEL_IMERGE formula> [ AND <SEL_IMERGE formula> ...] ]
+
+    <SEL_RANGE_TREE formula> ::=
+      <SEL_ARG formula> [ AND  <SEL_ARG formula> ... ]
+
+    <SEL_IMERGE formula> ::=  
+      <SEL_RANGE_TREE formula> [ OR <SEL_RANGE_TREE formula> ]
+
+  As we can see from the above definitions:
+   - SEL_RANGE_TREE formula is a conjunction of SEL_ARG formulas
+   - SEL_IMERGE formula is a disjunction of SEL_RANGE_TREE formulas
+   - SEL_TREE formula is a conjunction of a SEL_RANGE_TREE formula
+     and SEL_IMERGE formulas. 
+  It's required above that a SEL_TREE formula has at least one conjunct.
+
+  Usually we will consider normalized SEL_RANGE_TREE formulas where we use
+  TRUE as conjunct members for those indexes whose SEL_ARG trees are empty.
+  
+  We will call an SEL_TREE object simply 'tree'. 
+  The part of a tree that represents SEL_RANGE_TREE formula is called
+  'range part' of the tree while the remaining part is called 'imerge part'. 
+  If a tree contains only a range part then we call such a tree 'range tree'.
+  Components of a range tree that represent SEL_ARG formulas are called ranges.
+  If a tree does not contain any range part we call such a tree 'imerge tree'.
+  Components of the imerge part of a tree that represent SEL_IMERGE formula
+  are called imerges.
+
+  Usually we'll designate:
+    SEL_TREE formulas         by T_1,...,T_k
+    SEL_ARG formulas          by R_1,...,R_k
+    SEL_RANGE_TREE formulas   by RT_1,...,RT_k
+    SEL_IMERGE formulas       by M_1,...,M_k
+  Accordingly we'll use:
+    t_1,...,t_k - to designate trees representing T_1,...,T_k
+    r_1,...,r_k - to designate ranges representing R_1,...,R_k 
+    rt_1,...,rt_k - to designate range trees representing RT_1,...,RT_k
+    m_1,...,m_k - to designate imerges representing M_1,...,M_k
+
+  SEL_TREE objects are usually built from WHERE conditions or
+  ON expressions.
+  A SEL_TREE object always represents an inference of the condition it is
+  built from. Therefore, if a row satisfies the condition it also
+  satisfies the SEL_TREE formula built from it.
+
+  The following transformations of tree t representing SEL_TREE formula T
+  yield a new tree t1 that represents an inference of T: T=>T1.
+    (1) remove any of SEL_ARG tree from the range part of t
+    (2) remove any imerge from the tree t 
+    (3) remove any of SEL_ARG tree from any range tree contained
+        in any imerge of tree   
+ 
+  Since the basic blocks of any SEL_TREE objects are ranges, SEL_TREE
+  objects in many cases can be effectively used to filter out a big part
+  of table rows that do not satisfy WHERE/ON conditions utilizing
+  only single or multiple range index scans.
+
+  A single range index scan is constructed for a range tree that contains
+  only one SEL_ARG object for an index or an index prefix.
+  An index intersection scan can be constructed for a range tree
+  that contains several SEL_ARG objects. Currently index intersection
+  scans are constructed only for single-point ranges.
+  An index merge scan is constructed for an imerge tree that contains only
+  one imerge. If range trees of this imerge contain only single-point ranges
+  then a union of index intersections can be built.
+
+  Usually the tree built by the range optimizer for a query table contains
+  more than one range in the range part, and additionally may contain some
+  imerges in the imerge part. The range optimizer evaluates all of them one
+  by one and chooses the range or the imerge that provides the cheapest
+  single or multiple range index scan of the table.  According to rules 
+  (1)-(3) this scan always filters out only those rows that do not satisfy
+  the query conditions. 
+
+  For any condition the SEL_TREE object for it is built in a bottom up
+  manner starting from the range trees for the predicates. The tree_and
+  function builds a tree for any conjunction of formulas from the trees
+  for its conjuncts. The tree_or function builds a tree for any disjunction
+  of formulas from the trees for its disjuncts.    
+*/ 
+  
 class SEL_TREE :public Sql_alloc
 {
 public:
@@ -643,7 +747,7 @@ public:
     keys_map.clear_all();
     bzero((char*) keys,sizeof(keys));
   }
-  SEL_TREE(SEL_TREE *arg, RANGE_OPT_PARAM *param);
+  SEL_TREE(SEL_TREE *arg, bool without_merges, RANGE_OPT_PARAM *param);
   /*
     Note: there may exist SEL_TREE objects with sel_tree->type=KEY and
     keys[i]=0 for all i. (SergeyP: it is not clear whether there is any
@@ -663,9 +767,15 @@ public:
   key_map ror_scans_map;   /* bitmask of ROR scan-able elements in keys */
   uint    n_ror_scans;     /* number of set bits in ror_scans_map */
 
+  struct st_index_scan_info **index_scans;     /* list of index scans */
+  struct st_index_scan_info **index_scans_end; /* last index scan */
+
   struct st_ror_scan_info **ror_scans;     /* list of ROR key scans */
   struct st_ror_scan_info **ror_scans_end; /* last ROR scan */
   /* Note that #records for each key scan is stored in table->quick_rows */
+
+  bool without_ranges() { return keys_map.is_clear_all(); }
+  bool without_imerges() { return merges.is_empty(); }
 };
 
 class RANGE_OPT_PARAM
@@ -719,12 +829,13 @@ public:
   /* Number of SEL_ARG objects allocated by SEL_ARG::clone_tree operations */
   uint alloced_sel_args; 
   bool force_default_mrr;
+  KEY_PART *key[MAX_KEY]; /* First key parts of keys used in the query */
 };
 
 class PARAM : public RANGE_OPT_PARAM
 {
 public:
-  KEY_PART *key[MAX_KEY]; /* First key parts of keys used in the query */
+  ha_rows quick_rows[MAX_KEY];
   longlong baseflag;
   uint max_key_part, range_count;
 
@@ -751,9 +862,11 @@ class TABLE_READ_PLAN;
   class TRP_RANGE;
   class TRP_ROR_INTERSECT;
   class TRP_ROR_UNION;
-  class TRP_ROR_INDEX_MERGE;
+  class TRP_INDEX_INTERSECT;
+  class TRP_INDEX_MERGE;
   class TRP_GROUP_MIN_MAX;
 
+struct st_index_scan_info;
 struct st_ror_scan_info;
 
 static SEL_TREE * get_mm_parts(RANGE_OPT_PARAM *param,COND *cond_func,Field *field,
@@ -778,6 +891,9 @@ static TRP_RANGE *get_key_scans_params(PARAM *param, SEL_TREE *tree,
                                        bool update_tbl_stats,
                                        double read_time);
 static
+TRP_INDEX_INTERSECT *get_best_index_intersect(PARAM *param, SEL_TREE *tree,
+                                              double read_time);
+static
 TRP_ROR_INTERSECT *get_best_ror_intersect(const PARAM *param, SEL_TREE *tree,
                                           double read_time,
                                           bool *are_all_covering);
@@ -789,6 +905,10 @@ static
 TABLE_READ_PLAN *get_best_disjunct_quick(PARAM *param, SEL_IMERGE *imerge,
                                          double read_time);
 static
+TABLE_READ_PLAN *merge_same_index_scans(PARAM *param, SEL_IMERGE *imerge,
+                                        TRP_INDEX_MERGE *imerge_trp,
+                                        double read_time);
+static
 TRP_GROUP_MIN_MAX *get_best_group_min_max(PARAM *param, SEL_TREE *tree,
                                           double read_time);
 
@@ -801,11 +921,15 @@ static void print_ror_scans_arr(TABLE *table, const char *msg,
 static void print_quick(QUICK_SELECT_I *quick, const key_map *needed_reg);
 #endif
 
-static SEL_TREE *tree_and(RANGE_OPT_PARAM *param,SEL_TREE *tree1,SEL_TREE *tree2);
-static SEL_TREE *tree_or(RANGE_OPT_PARAM *param,SEL_TREE *tree1,SEL_TREE *tree2);
+static SEL_TREE *tree_and(RANGE_OPT_PARAM *param,
+                          SEL_TREE *tree1, SEL_TREE *tree2);
+static SEL_TREE *tree_or(RANGE_OPT_PARAM *param,
+                         SEL_TREE *tree1,SEL_TREE *tree2);
 static SEL_ARG *sel_add(SEL_ARG *key1,SEL_ARG *key2);
-static SEL_ARG *key_or(RANGE_OPT_PARAM *param, SEL_ARG *key1, SEL_ARG *key2);
-static SEL_ARG *key_and(RANGE_OPT_PARAM *param, SEL_ARG *key1, SEL_ARG *key2,
+static SEL_ARG *key_or(RANGE_OPT_PARAM *param,
+                       SEL_ARG *key1, SEL_ARG *key2);
+static SEL_ARG *key_and(RANGE_OPT_PARAM *param,
+                        SEL_ARG *key1, SEL_ARG *key2,
                         uint clone_flag);
 static bool get_range(SEL_ARG **e1,SEL_ARG **e2,SEL_ARG *root1);
 bool get_quick_keys(PARAM *param,QUICK_RANGE_SELECT *quick,KEY_PART *key,
@@ -816,11 +940,27 @@ static bool eq_tree(SEL_ARG* a,SEL_ARG *b);
 static SEL_ARG null_element(SEL_ARG::IMPOSSIBLE);
 static bool null_part_in_key(KEY_PART *key_part, const uchar *key,
                              uint length);
-bool sel_trees_can_be_ored(SEL_TREE *tree1, SEL_TREE *tree2, RANGE_OPT_PARAM* param);
 static bool is_key_scan_ror(PARAM *param, uint keynr, uint8 nparts);
 
 #include "opt_range_mrr.cc"
 
+static bool sel_trees_have_common_keys(SEL_TREE *tree1, SEL_TREE *tree2, 
+                                       key_map *common_keys);
+static void eliminate_single_tree_imerges(RANGE_OPT_PARAM *param,
+                                          SEL_TREE *tree);
+
+static bool sel_trees_can_be_ored(RANGE_OPT_PARAM* param,
+                                  SEL_TREE *tree1, SEL_TREE *tree2, 
+                                  key_map *common_keys);
+static bool sel_trees_must_be_ored(RANGE_OPT_PARAM* param,
+                                   SEL_TREE *tree1, SEL_TREE *tree2,
+                                   key_map common_keys);
+static int and_range_trees(RANGE_OPT_PARAM *param,
+                           SEL_TREE *tree1, SEL_TREE *tree2,
+                           SEL_TREE *result);
+static bool remove_nonrange_trees(RANGE_OPT_PARAM *param, SEL_TREE *tree);
+
+
 /*
   SEL_IMERGE is a list of possible ways to do index merge, i.e. it is
   a condition in the following form:
@@ -850,23 +990,39 @@ public:
     trees_next(trees),
     trees_end(trees + PREALLOCED_TREES)
   {}
-  SEL_IMERGE (SEL_IMERGE *arg, RANGE_OPT_PARAM *param);
+  SEL_IMERGE (SEL_IMERGE *arg, uint cnt, RANGE_OPT_PARAM *param);
   int or_sel_tree(RANGE_OPT_PARAM *param, SEL_TREE *tree);
-  int or_sel_tree_with_checks(RANGE_OPT_PARAM *param, SEL_TREE *new_tree);
-  int or_sel_imerge_with_checks(RANGE_OPT_PARAM *param, SEL_IMERGE* imerge);
+  bool have_common_keys(RANGE_OPT_PARAM *param, SEL_TREE *tree);
+  int and_sel_tree(RANGE_OPT_PARAM *param, SEL_TREE *tree, 
+                   SEL_IMERGE *new_imerge);
+  int or_sel_tree_with_checks(RANGE_OPT_PARAM *param,
+                              uint n_init_trees, 
+                              SEL_TREE *new_tree,
+                              bool is_first_check_pass,
+                              bool *is_last_check_pass);
+  int or_sel_imerge_with_checks(RANGE_OPT_PARAM *param,
+                                uint n_init_trees,
+                                SEL_IMERGE* imerge,
+                                bool is_first_check_pass,
+                                bool *is_last_check_pass);
 };
 
 
 /*
-  Add SEL_TREE to this index_merge without any checks,
+  Add a range tree to the range trees of this imerge 
 
-  NOTES
-    This function implements the following:
-      (x_1||...||x_N) || t = (x_1||...||x_N||t), where x_i, t are SEL_TREEs
+  SYNOPSIS
+    or_sel_tree()
+      param                  Context info for the operation         
+      tree                   SEL_TREE to add to this imerge 
+
+  DESCRIPTION 
+    The function just adds the range tree 'tree' to the range trees
+    of this imerge.
 
   RETURN
-     0 - OK
-    -1 - Out of memory.
+     0   if the operation is a success
+    -1   if the function runs out of memory
 */
 
 int SEL_IMERGE::or_sel_tree(RANGE_OPT_PARAM *param, SEL_TREE *tree)
@@ -891,96 +1047,303 @@ int SEL_IMERGE::or_sel_tree(RANGE_OPT_PARAM *param, SEL_TREE *tree)
 
 
 /*
-  Perform OR operation on this SEL_IMERGE and supplied SEL_TREE new_tree,
-  combining new_tree with one of the trees in this SEL_IMERGE if they both
-  have SEL_ARGs for the same key.
+  Check if any of the range trees of this imerge intersects with a given tree 
 
   SYNOPSIS
-    or_sel_tree_with_checks()
-      param    PARAM from SQL_SELECT::test_quick_select
-      new_tree SEL_TREE with type KEY or KEY_SMALLER.
+    have_common_keys()
+      param    Context info for the function
+      tree     SEL_TREE whose intersection with the imerge range trees is checked
 
-  NOTES
-    This does the following:
-    (t_1||...||t_k)||new_tree =
-     either
-       = (t_1||...||t_k||new_tree)
-     or
-       = (t_1||....||(t_j|| new_tree)||...||t_k),
-
-     where t_i, y are SEL_TREEs.
-    new_tree is combined with the first t_j it has a SEL_ARG on common
-    key with. As a consequence of this, choice of keys to do index_merge
-    read may depend on the order of conditions in WHERE part of the query.
+  DESCRIPTION
+    The function checks whether there is any range tree rt_i in this imerge
+    such that there are some indexes for which ranges are defined in both
+    rt_i and the range part of the SEL_TREE tree.  
+    To check this the function calls the function sel_trees_have_common_keys.
 
+  RETURN 
+    TRUE    if there are such range trees in this imerge
+    FALSE   otherwise
+*/
+
+bool SEL_IMERGE::have_common_keys(RANGE_OPT_PARAM *param, SEL_TREE *tree)
+{
+  for (SEL_TREE** or_tree= trees, **bound= trees_next;
+       or_tree != bound; or_tree++)
+  {
+    key_map common_keys;
+    if (sel_trees_have_common_keys(*or_tree, tree, &common_keys))
+      return TRUE;
+  }
+  return FALSE;
+}
+
+
+/* 
+  Perform AND operation for this imerge and the range part of a tree
+
+  SYNOPSIS
+    and_sel_tree()
+      param           Context info for the operation
+      tree            SEL_TREE for the second operand of the operation
+      new_imerge  OUT imerge for the result of the operation
+
+  DESCRIPTION
+    This function performs AND operation for this imerge m and the
+    range part of the SEL_TREE tree rt. In other words the function
+    pushes rt into this imerge. The resulting imerge is returned in
+    the parameter new_imerge.
+    If this imerge m represents the formula
+      RT_1 OR ... OR RT_k
+    then the resulting imerge of the function represents the formula
+      (RT_1 AND RT) OR ... OR (RT_k AND RT)
+    The function calls the function and_range_trees to construct the
+    range tree representing (RT_i AND RT).
+    
+  NOTE
+    The function may return an empty imerge without any range trees.
+    This happens when each call of and_range_trees returns an 
+    impossible range tree (SEL_TREE::IMPOSSIBLE).
+    Example: (key1 < 2 AND key2 > 10) AND (key1 > 4 OR key2 < 6).
+         
   RETURN
-    0  OK
-    1  One of the trees was combined with new_tree to SEL_TREE::ALWAYS,
-       and (*this) should be discarded.
-   -1  An error occurred.
+     0  if the operation is a success
+    -1  otherwise: there is not enough memory to perform the operation
 */
 
-int SEL_IMERGE::or_sel_tree_with_checks(RANGE_OPT_PARAM *param, SEL_TREE *new_tree)
+int SEL_IMERGE::and_sel_tree(RANGE_OPT_PARAM *param, SEL_TREE *tree,
+                             SEL_IMERGE *new_imerge)
 {
-  for (SEL_TREE** tree = trees;
-       tree != trees_next;
-       tree++)
+  for (SEL_TREE** or_tree= trees; or_tree != trees_next; or_tree++) 
   {
-    if (sel_trees_can_be_ored(*tree, new_tree, param))
+    SEL_TREE *res_or_tree= 0;
+    if (!(res_or_tree= new SEL_TREE()))
+      return (-1);
+    if (!and_range_trees(param, *or_tree, tree, res_or_tree))
     {
-      *tree = tree_or(param, *tree, new_tree);
-      if (!*tree)
-        return 1;
-      if (((*tree)->type == SEL_TREE::MAYBE) ||
-          ((*tree)->type == SEL_TREE::ALWAYS))
+      if (new_imerge->or_sel_tree(param, res_or_tree))
+        return (-1);
+    }        
+  }
+  return 0;
+}      
+
+
+/*
+  Perform OR operation on this imerge and the range part of a tree
+
+  SYNOPSIS
+    or_sel_tree_with_checks()
+      param                  Context info for the operation 
+      n_trees                Number of trees in this imerge to check for oring        
+      tree                   SEL_TREE whose range part is to be ored 
+      is_first_check_pass    <=> the first call of the function for this imerge  
+      is_last_check_pass OUT <=> no more calls of the function for this imerge
+
+  DESCRIPTION
+    The function performs OR operation on this imerge m and the range part
+    of the SEL_TREE tree rt. It always replaces this imerge with the result
+    of the operation.
+ 
+    The operation can be performed in two different modes: with
+    is_first_check_pass==TRUE and is_first_check_pass==FALSE, transforming
+    this imerge differently.
+
+    Given this imerge represents the formula
+      RT_1 OR ... OR RT_k:
+
+    1. In the first mode, when is_first_check_pass==TRUE :
+      1.1. If rt must be ored (see the function sel_trees_must_be_ored) with
+           some rt_j (there may be only one such range tree in the imerge)
+           then the function produces an imerge representing the formula
+             RT_1 OR ... OR (RT_j OR RT) OR ... OR RT_k,
+           where the tree for (RT_j OR RT) is built by oring the pairs
+           of SEL_ARG trees for the corresponding indexes
+      1.2. Otherwise the function produces the imerge representing the formula:
+           RT_1 OR ... OR RT_k OR RT.
+
+    2. In the second mode, when is_first_check_pass==FALSE :
+      2.1. For each rt_j in the imerge that can be ored (see the function
+           sel_trees_can_be_ored), but not must be ored, with rt the function
+           replaces rt_j with a range tree such that for each index for which
+           ranges are defined in both rt_j and rt the tree contains the
+           result of oring of these ranges.
+      2.2. In other cases the function does not produce any imerge.
+
+    When is_first_check_pass==TRUE the function returns FALSE in the parameter
+    is_last_check_pass if there is some rt_j that can be ored with rt, but
+    no rt_j that must be ored with rt.
+    When is_first_check_pass==FALSE the function always returns TRUE in the
+    parameter is_last_check_pass.
+          
+  RETURN
+    1  The result of oring of rt_j and rt that must be ored returns
+       the range tree with type==SEL_TREE::ALWAYS
+       (in this case the imerge m should be discarded)
+   -1  The function runs out of memory
+    0  in all other cases 
+*/
+
+int SEL_IMERGE::or_sel_tree_with_checks(RANGE_OPT_PARAM *param,
+                                        uint n_trees,
+                                        SEL_TREE *tree,
+                                        bool is_first_check_pass,
+                                        bool *is_last_check_pass)
+{
+  bool was_ored= FALSE;
+  *is_last_check_pass= TRUE;
+  SEL_TREE** or_tree = trees;
+  for (uint i= 0; i < n_trees; i++, or_tree++)
+  {
+    SEL_TREE *result= 0;
+    key_map result_keys;
+    key_map ored_keys;
+    if (sel_trees_can_be_ored(param, *or_tree, tree, &ored_keys))
+    {
+      bool must_be_ored= sel_trees_must_be_ored(param, *or_tree, tree,
+                                                ored_keys);
+      if (must_be_ored || !is_first_check_pass) 
+      {
+        result_keys.clear_all();
+        result= *or_tree;
+        for (uint key_no= 0; key_no < param->keys; key_no++)
+        {
+          if (!ored_keys.is_set(key_no))
+	  {
+            result->keys[key_no]= 0;
+	    continue;
+          }
+          SEL_ARG *key1= (*or_tree)->keys[key_no];
+          SEL_ARG *key2= tree->keys[key_no];
+          key2->incr_refs();
+          if ((result->keys[key_no]= key_or(param, key1, key2)))
+          {
+            
+            result_keys.set_bit(key_no);
+#ifdef EXTRA_DEBUG
+            if (param->alloced_sel_args < SEL_ARG::MAX_SEL_ARGS)
+	    {
+              key1= result->keys[key_no]; 
+              (key1)->test_use_count(key1);
+            }
+#endif
+          }       
+        }
+      }
+      else if(is_first_check_pass) 
+        *is_last_check_pass= FALSE;
+    } 
+
+    if (result)
+    {
+      if (result_keys.is_clear_all())
+        result->type= SEL_TREE::ALWAYS;
+      *is_last_check_pass= TRUE;
+      if ((result->type == SEL_TREE::MAYBE) ||
+          (result->type == SEL_TREE::ALWAYS))
         return 1;
       /* SEL_TREE::IMPOSSIBLE is impossible here */
-      return 0;
+      result->keys_map= result_keys; 
+      *or_tree= result;
+      if (is_first_check_pass)
+        return 0;
+      was_ored= TRUE;
     }
   }
+  if (was_ored)
+    return 0;
 
-  /* New tree cannot be combined with any of existing trees. */
-  return or_sel_tree(param, new_tree);
+  if (!*is_last_check_pass && 
+      !(tree= new SEL_TREE(tree, FALSE, param)))
+    return (-1);
+  return or_sel_tree(param, tree);
 }
 
 
 /*
-  Perform OR operation on this index_merge and supplied index_merge list.
+  Perform OR operation on this imerge and another imerge
+
+  SYNOPSIS
+    or_sel_imerge_with_checks()
+      param                  Context info for the operation 
+      n_trees           Number of trees in this imerge to check for oring        
+      imerge                 The second operand of the operation 
+      is_first_check_pass    <=> the first call of the function for this imerge  
+      is_last_check_pass OUT <=> no more calls of the function for this imerge
 
+  DESCRIPTION
+    For each range tree rt from 'imerge' the function calls the method
+    SEL_IMERGE::or_sel_tree_with_checks that performs OR operation on this
+    SEL_IMERGE object m and the tree rt. The mode of the operation is
+    specified by the parameter is_first_check_pass. Each call of
+    SEL_IMERGE::or_sel_tree_with_checks transforms this SEL_IMERGE object m.
+    The function returns FALSE in the parameter is_last_check_pass if
+    at least one of the calls of SEL_IMERGE::or_sel_tree_with_checks
+    returns FALSE as the value of its last parameter. 
+    
   RETURN
-    0 - OK
-    1 - One of conditions in result is always TRUE and this SEL_IMERGE
-        should be discarded.
-   -1 - An error occurred
+    1  One of the calls of SEL_IMERGE::or_sel_tree_with_checks returns 1.
+       (in this case the imerge m should be discarded)
+   -1  The function runs out of memory
+    0  in all other cases 
 */
 
-int SEL_IMERGE::or_sel_imerge_with_checks(RANGE_OPT_PARAM *param, SEL_IMERGE* imerge)
-{
-  for (SEL_TREE** tree= imerge->trees;
-       tree != imerge->trees_next;
-       tree++)
-  {
-    if (or_sel_tree_with_checks(param, *tree))
-      return 1;
+int SEL_IMERGE::or_sel_imerge_with_checks(RANGE_OPT_PARAM *param,
+                                          uint n_trees,
+                                          SEL_IMERGE* imerge,
+                                          bool is_first_check_pass,
+                                          bool *is_last_check_pass)
+{
+  *is_last_check_pass= TRUE;
+  SEL_TREE** tree= imerge->trees;
+  SEL_TREE** tree_end= imerge->trees_next;
+  for ( ; tree < tree_end; tree++)
+  {
+    uint rc;
+    bool is_last= TRUE; 
+    rc= or_sel_tree_with_checks(param, n_trees, *tree, 
+                               is_first_check_pass, &is_last);
+    if (!is_last)
+      *is_last_check_pass= FALSE;
+    if (rc)
+      return rc;
   }
   return 0;
 }
 
 
-SEL_TREE::SEL_TREE(SEL_TREE *arg, RANGE_OPT_PARAM *param): Sql_alloc()
+/*
+  Copy constructor for SEL_TREE objects
+
+  SYNOPSIS
+    SEL_TREE
+      arg            The source tree for the constructor
+      without_merges <=> only the range part of the tree arg is copied
+      param          Context info for the operation
+
+  DESCRIPTION
+    The constructor creates a full copy of the SEL_TREE arg if
+    the parameter without_merges==FALSE. Otherwise a tree is created
+    that contains the copy only of the range part of the tree arg. 
+*/ 
+
+SEL_TREE::SEL_TREE(SEL_TREE *arg, bool without_merges,
+                   RANGE_OPT_PARAM *param): Sql_alloc()
 {
   keys_map= arg->keys_map;
   type= arg->type;
-  for (int idx= 0; idx < MAX_KEY; idx++)
+  for (uint idx= 0; idx < param->keys; idx++)
   {
     if ((keys[idx]= arg->keys[idx]))
-      keys[idx]->increment_use_count(1);
+      keys[idx]->incr_refs();
   }
 
+  if (without_merges)
+    return;
+
   List_iterator<SEL_IMERGE> it(arg->merges);
   for (SEL_IMERGE *el= it++; el; el= it++)
   {
-    SEL_IMERGE *merge= new SEL_IMERGE(el, param);
+    SEL_IMERGE *merge= new SEL_IMERGE(el, 0, param);
     if (!merge || merge->trees == merge->trees_next)
     {
       merges.empty();
@@ -991,7 +1354,23 @@ SEL_TREE::SEL_TREE(SEL_TREE *arg, RANGE_OPT_PARAM *param): Sql_alloc()
 }
 
 
-SEL_IMERGE::SEL_IMERGE (SEL_IMERGE *arg, RANGE_OPT_PARAM *param) : Sql_alloc()
+/*
+  Copy constructor for SEL_IMERGE objects
+
+  SYNOPSIS
+    SEL_IMERGE
+      arg         The source imerge for the constructor
+      cnt         How many trees from arg are to be copied
+      param       Context info for the operation
+
+  DESCRIPTION
+    If cnt==0 then the constructor creates a full copy of the
+    imerge arg. Otherwise only the first cnt trees of the imerge
+    are copied.
+*/ 
+
+SEL_IMERGE::SEL_IMERGE(SEL_IMERGE *arg, uint cnt,
+                       RANGE_OPT_PARAM *param) : Sql_alloc()
 {
   uint elements= (arg->trees_end - arg->trees);
   if (elements > PREALLOCED_TREES)
@@ -1003,13 +1382,13 @@ SEL_IMERGE::SEL_IMERGE (SEL_IMERGE *arg, RANGE_OPT_PARAM *param) : Sql_alloc()
   else
     trees= &trees_prealloced[0];
 
-  trees_next= trees;
+  trees_next= trees + (cnt ? cnt : arg->trees_next-arg->trees);
   trees_end= trees + elements;
 
-  for (SEL_TREE **tree = trees, **arg_tree= arg->trees; tree < trees_end; 
+  for (SEL_TREE **tree = trees, **arg_tree= arg->trees; tree < trees_next; 
        tree++, arg_tree++)
   {
-    if (!(*tree= new SEL_TREE(*arg_tree, param)))
+    if (!(*tree= new SEL_TREE(*arg_tree, FALSE, param)))
       goto mem_err;
   }
 
@@ -1023,7 +1402,19 @@ mem_err:
 
 
 /*
-  Perform AND operation on two index_merge lists and store result in *im1.
+  Perform AND operation on two imerge lists
+
+  SYNOPSIS
+    imerge_list_and_list()
+      param             Context info for the operation         
+      im1               The first imerge list for the operation
+      im2               The second imerge list for the operation
+
+  DESCRIPTION
+    The function just appends the imerge list im2 to the imerge list im1  
+    
+  RETURN VALUE
+    none
 */
 
 inline void imerge_list_and_list(List<SEL_IMERGE> *im1, List<SEL_IMERGE> *im2)
@@ -1033,73 +1424,242 @@ inline void imerge_list_and_list(List<SEL_IMERGE> *im1, List<SEL_IMERGE> *im2)
 
 
 /*
-  Perform OR operation on 2 index_merge lists, storing result in first list.
-
-  NOTES
-    The following conversion is implemented:
-     (a_1 &&...&& a_N)||(b_1 &&...&& b_K) = AND_i,j(a_i || b_j) =>
-      => (a_1||b_1).
-
-    i.e. all conjuncts except the first one are currently dropped.
-    This is done to avoid producing N*K ways to do index_merge.
-
-    If (a_1||b_1) produce a condition that is always TRUE, NULL is returned
-    and index_merge is discarded (while it is actually possible to try
-    harder).
-
-    As a consequence of this, choice of keys to do index_merge read may depend
-    on the order of conditions in WHERE part of the query.
+  Perform OR operation on two imerge lists
 
+  SYNOPSIS
+    imerge_list_or_list()
+      param             Context info for the operation         
+      im1               The first imerge list for the operation
+      im2               The second imerge list for the operation
+     
+  DESCRIPTION
+    Assuming that the first imerge list represents the formula
+      F1= M1_1 AND ... AND M1_k1 
+    while the second imerge list represents the formula 
+      F2= M2_1 AND ... AND M2_k2,
+    where M1_i= RT1_i_1 OR ... OR RT1_i_l1i (i in [1..k1])
+    and M2_i = RT2_i_1 OR ... OR RT2_i_l2i (i in [1..k2]),
+    the function builds a list of imerges for some formula that can be 
+    inferred from the formula (F1 OR F2).
+
+    More exactly the function builds imerges for the formula (M1_1 OR M2_1).
+    Note that
+      (F1 OR F2) = (M1_1 AND ... AND M1_k1) OR (M2_1 AND ... AND M2_k2) =
+      AND (M1_i OR M2_j) (i in [1..k1], j in [1..k2]) =>
+      M1_1 OR M2_1.
+    So (M1_1 OR M2_1) is indeed an inference formula for (F1 OR F2).
+
+    To build imerges for the formula (M1_1 OR M2_1) the function invokes,
+    possibly twice, the method SEL_IMERGE::or_sel_imerge_with_checks
+    for the imerge m1_1.
+    At its first invocation the method SEL_IMERGE::or_sel_imerge_with_checks
+    performs OR operation on the imerge m1_1 and the range tree rt2_1_1 by
+    calling SEL_IMERGE::or_sel_tree_with_checks with is_first_pass_check==TRUE.
+    The resulting imerge of the operation is ored with the next range tree of
+    the imerge m2_1. This oring continues until the last range tree from
+    m2_1 has been ored. 
+    At its second invocation the method SEL_IMERGE::or_sel_imerge_with_checks
+    performs the same sequence of OR operations, but now calling
+    SEL_IMERGE::or_sel_tree_with_checks with is_first_pass_check==FALSE.
+
+    The imerges that the operation produces replace those in the list im1   
+       
   RETURN
-    0     OK, result is stored in *im1
-    other Error, both passed lists are unusable
+    0     if the operation is a success 
+   -1     if the function has run out of memory 
 */
 
 int imerge_list_or_list(RANGE_OPT_PARAM *param,
                         List<SEL_IMERGE> *im1,
                         List<SEL_IMERGE> *im2)
 {
+
+  uint rc;
+  bool is_last_check_pass= FALSE;
+
   SEL_IMERGE *imerge= im1->head();
+  uint elems= imerge->trees_next-imerge->trees;
   im1->empty();
   im1->push_back(imerge);
 
-  return imerge->or_sel_imerge_with_checks(param, im2->head());
+  rc= imerge->or_sel_imerge_with_checks(param, elems, im2->head(),
+                                        TRUE, &is_last_check_pass);
+  if (rc)
+  {
+    if (rc == 1)
+    {
+      im1->empty();
+      rc= 0;
+    }
+    return rc;
+  }
+
+  if (!is_last_check_pass)
+  {
+    SEL_IMERGE* new_imerge= new SEL_IMERGE(imerge, elems, param);
+    if (new_imerge)
+    {
+      is_last_check_pass= TRUE;
+      rc= new_imerge->or_sel_imerge_with_checks(param, elems, im2->head(),
+                                                 FALSE, &is_last_check_pass);
+      if (!rc)
+        im1->push_back(new_imerge); 
+    }
+  }
+  return rc;  
 }
 
 
 /*
-  Perform OR operation on index_merge list and key tree.
+  Perform OR operation for each imerge from a list and the range part of a tree
 
+  SYNOPSIS
+    imerge_list_or_tree()
+      param       Context info for the operation
+      merges      The list of imerges to be ored with the range part of tree          
+      tree        SEL_TREE whose range part is to be ored with the imerges
+
+  DESCRIPTION
+    For each imerge mi from the list 'merges' the function performs OR
+    operation with mi and the range part of 'tree' rt, producing one or
+    two imerges.
+
+    Given that the imerge mi represents the formula RTi_1 OR ... OR RTi_k, 
+    the function forms the merges by the following rules:
+ 
+    1. If rt cannot be ored with any of the trees rti the function just
+       produces an imerge that represents the formula
+         RTi_1 OR ... OR RTi_k OR RT.
+    2. If there exists a tree rtj that must be ored with rt the function
+       produces an imerge that represents the formula
+         RTi_1 OR ... OR (RTi_j OR RT) OR ... OR RTi_k,
+       where the range tree for (RTi_j OR RT) is constructed by oring the
+       SEL_ARG trees that must be ored.
+    3. For each rti_j that can be ored with rt the function produces
+       the new tree rti_j' and substitutes rti_j for this new range tree.
+
+    In any case the function removes mi from the list and then adds all
+    produced imerges.
+
+    To build imerges by rules 1-3 the function calls the method
+    SEL_IMERGE::or_sel_tree_with_checks, possibly twice. With the first
+    call it passes TRUE for the fourth parameter of the method.
+    At this first call imerges by rules 1-2 are built. If the call
+    returns FALSE through its fifth (output) parameter then the
+    method is called for the second time. At this call the imerge
+    of rule 3 is produced.
+
+    If a call of SEL_IMERGE::or_sel_tree_with_checks returns 1
+    then it means that the produced tree contains an always true
+    range tree and the whole imerge can be discarded.
+    
   RETURN
-    0     OK, result is stored in *im1.
-    other Error
+    1     if no imerges are produced
+    0     otherwise
 */
 
+static
 int imerge_list_or_tree(RANGE_OPT_PARAM *param,
-                        List<SEL_IMERGE> *im1,
+                        List<SEL_IMERGE> *merges,
                         SEL_TREE *tree)
 {
+
   SEL_IMERGE *imerge;
-  List_iterator<SEL_IMERGE> it(*im1);
-  bool tree_used= FALSE;
+  List<SEL_IMERGE> additional_merges;
+  List_iterator<SEL_IMERGE> it(*merges);
+  
   while ((imerge= it++))
   {
-    SEL_TREE *or_tree;
-    if (tree_used)
+    bool is_last_check_pass;
+    int rc= 0;
+    int rc1= 0;
+    SEL_TREE *or_tree= new SEL_TREE (tree, FALSE, param);
+    if (or_tree)
     {
-      or_tree= new SEL_TREE (tree, param);
-      if (!or_tree ||
-          (or_tree->keys_map.is_clear_all() && or_tree->merges.is_empty()))
-        return FALSE;
+      uint elems= imerge->trees_next-imerge->trees;
+      rc= imerge->or_sel_tree_with_checks(param, elems, or_tree,
+                                          TRUE, &is_last_check_pass);
+      if (!is_last_check_pass)
+      {
+        SEL_IMERGE *new_imerge= new SEL_IMERGE(imerge, elems, param);
+        if (new_imerge)
+	{ 
+          rc1= new_imerge->or_sel_tree_with_checks(param, elems, or_tree,
+                                                   FALSE, &is_last_check_pass);
+          if (!rc1)
+            additional_merges.push_back(new_imerge);
+        }
+      }
     }
-    else
-      or_tree= tree;
-
-    if (imerge->or_sel_tree_with_checks(param, or_tree))
+    if (rc || rc1 || !or_tree)
       it.remove();
-    tree_used= TRUE;
   }
-  return im1->is_empty();
+
+  merges->concat(&additional_merges);  
+  return merges->is_empty();
+}
+
+
+/*
+  Perform pushdown operation of the range part of a tree into given imerges 
+
+  SYNOPSIS
+    imerge_list_and_tree()
+      param           Context info for the operation
+      merges   IN/OUT List of imerges to push the range part of 'tree' into
+      tree            SEL_TREE whose range part is to be pushed into imerges
+
+  DESCRIPTION
+    For each imerge from the list merges the function pushes the range part
+    rt of 'tree' into the imerge. 
+    More exactly if the imerge mi from the list represents the formula
+      RTi_1 OR ... OR RTi_k 
+    the function builds a new imerge that represents the formula
+      (RTi_1 AND RT) OR ... OR (RTi_k AND RT)
+    and adds this imerge to the list merges.
+    To perform this pushdown operation the function calls the method
+    SEL_IMERGE::and_sel_tree. 
+    For any imerge mi the new imerge is not created if for each pair of
+    trees rti_j and rt the intersection of the indexes with defined ranges
+    is empty.
+    If the result of the pushdown operation for the imerge mi returns an
+    imerge with no trees then not only nothing is added to the list 
+    merges but mi itself is removed from the list. 
+     
+  RETURN
+    1    if no imerges are left in the list merges             
+    0    otherwise
+*/
+
+static
+int imerge_list_and_tree(RANGE_OPT_PARAM *param,
+                         List<SEL_IMERGE> *merges,
+                         SEL_TREE *tree)
+{
+  SEL_IMERGE *imerge;
+  SEL_IMERGE *new_imerge= NULL;
+  List<SEL_IMERGE> new_merges;
+  List_iterator<SEL_IMERGE> it(*merges);
+  
+  while ((imerge= it++))
+  {
+    if (!new_imerge)
+       new_imerge= new SEL_IMERGE();
+    if (imerge->have_common_keys(param, tree) && 
+        new_imerge && !imerge->and_sel_tree(param, tree, new_imerge))
+    {
+      if (new_imerge->trees == new_imerge->trees_next)
+        it.remove();
+      else
+      {         
+        new_merges.push_back(new_imerge);
+        new_imerge= NULL;
+      }
+    }
+  }
+  imerge_list_and_list(&new_merges, merges);
+  *merges= new_merges;
+  return merges->is_empty();
 }
 
 
@@ -1133,7 +1693,7 @@ SQL_SELECT *make_select(TABLE *head, table_map const_tables,
   select->read_tables=read_tables;
   select->const_tables=const_tables;
   select->head=head;
-  select->cond=conds;
+  select->cond= conds;
 
   if (head->sort.io_cache)
   {
@@ -1147,7 +1707,7 @@ SQL_SELECT *make_select(TABLE *head, table_map const_tables,
 }
 
 
-SQL_SELECT::SQL_SELECT() :quick(0),cond(0),free_cond(0)
+SQL_SELECT::SQL_SELECT() :quick(0),cond(0),pre_idx_push_select_cond(NULL),free_cond(0)
 {
   quick_keys.clear_all(); needed_reg.clear_all();
   my_b_clear(&file);
@@ -1271,7 +1831,7 @@ QUICK_RANGE_SELECT::~QUICK_RANGE_SELECT()
         DBUG_PRINT("info", ("Freeing separate handler 0x%lx (free: %d)", (long) file,
                             free_file));
         file->ha_external_lock(current_thd, F_UNLCK);
-        file->close();
+        file->ha_close();
         delete file;
       }
     }
@@ -1284,12 +1844,18 @@ QUICK_RANGE_SELECT::~QUICK_RANGE_SELECT()
   DBUG_VOID_RETURN;
 }
 
+/*
+  QUICK_INDEX_SORT_SELECT works as follows:
+  - Do index scans, accumulate rowids in the Unique object 
+    (Unique will also sort and de-duplicate rowids)
+  - Use rowids from unique to run a disk-ordered sweep
+*/
 
-QUICK_INDEX_MERGE_SELECT::QUICK_INDEX_MERGE_SELECT(THD *thd_param,
-                                                   TABLE *table)
+QUICK_INDEX_SORT_SELECT::QUICK_INDEX_SORT_SELECT(THD *thd_param,
+                                                 TABLE *table)
   :unique(NULL), pk_quick_select(NULL), thd(thd_param)
 {
-  DBUG_ENTER("QUICK_INDEX_MERGE_SELECT::QUICK_INDEX_MERGE_SELECT");
+  DBUG_ENTER("QUICK_INDEX_SORT_SELECT::QUICK_INDEX_SORT_SELECT");
   index= MAX_KEY;
   head= table;
   bzero(&read_record, sizeof(read_record));
@@ -1297,38 +1863,48 @@ QUICK_INDEX_MERGE_SELECT::QUICK_INDEX_MERGE_SELECT(THD *thd_param,
   DBUG_VOID_RETURN;
 }
 
-int QUICK_INDEX_MERGE_SELECT::init()
+int QUICK_INDEX_SORT_SELECT::init()
 {
-  DBUG_ENTER("QUICK_INDEX_MERGE_SELECT::init");
+  DBUG_ENTER("QUICK_INDEX_SORT_SELECT::init");
   DBUG_RETURN(0);
 }
 
-int QUICK_INDEX_MERGE_SELECT::reset()
+int QUICK_INDEX_SORT_SELECT::reset()
 {
-  DBUG_ENTER("QUICK_INDEX_MERGE_SELECT::reset");
+  DBUG_ENTER("QUICK_INDEX_SORT_SELECT::reset");
   DBUG_RETURN(read_keys_and_merge());
 }
 
 bool
-QUICK_INDEX_MERGE_SELECT::push_quick_back(QUICK_RANGE_SELECT *quick_sel_range)
+QUICK_INDEX_SORT_SELECT::push_quick_back(QUICK_RANGE_SELECT *quick_sel_range)
 {
-  /*
-    Save quick_select that does scan on clustered primary key as it will be
-    processed separately.
-  */
+  DBUG_ENTER("QUICK_INDEX_SORT_SELECT::push_quick_back");
   if (head->file->primary_key_is_clustered() &&
       quick_sel_range->index == head->s->primary_key)
+  {
+   /*
+     A quick_select over a clustered primary key is handled specifically
+     Here we assume:
+     - PK columns are included in any other merged index
+     - Scan on the PK is disk-ordered.
+       (not meeting #2 will only cause performance degradation)
+
+       We could treat clustered PK as any other index, but that would
+       be inefficient. There is no point in doing scan on
+       CPK, remembering the rowid, then making rnd_pos() call with
+       that rowid.
+    */
     pk_quick_select= quick_sel_range;
-  else
-    return quick_selects.push_back(quick_sel_range);
-  return 0;
+    DBUG_RETURN(0);
+  }
+  DBUG_RETURN(quick_selects.push_back(quick_sel_range));
 }
 
-QUICK_INDEX_MERGE_SELECT::~QUICK_INDEX_MERGE_SELECT()
+QUICK_INDEX_SORT_SELECT::~QUICK_INDEX_SORT_SELECT()
 {
   List_iterator_fast<QUICK_RANGE_SELECT> quick_it(quick_selects);
   QUICK_RANGE_SELECT* quick;
-  DBUG_ENTER("QUICK_INDEX_MERGE_SELECT::~QUICK_INDEX_MERGE_SELECT");
+  DBUG_ENTER("QUICK_INDEX_SORT_SELECT::~QUICK_INDEX_SORT_SELECT");
   delete unique;
   quick_it.rewind();
   while ((quick= quick_it++))
@@ -1342,7 +1918,6 @@ QUICK_INDEX_MERGE_SELECT::~QUICK_INDEX_MERGE_SELECT()
   DBUG_VOID_RETURN;
 }
 
-
 QUICK_ROR_INTERSECT_SELECT::QUICK_ROR_INTERSECT_SELECT(THD *thd_param,
                                                        TABLE *table,
                                                        bool retrieve_full_rows,
@@ -1451,7 +2026,7 @@ int QUICK_RANGE_SELECT::init_ror_merged_scan(bool reuse_handler)
   if (init() || reset())
   {
     file->ha_external_lock(thd, F_UNLCK);
-    file->close();
+    file->ha_close();
     goto failure;
   }
   free_file= TRUE;
@@ -1501,15 +2076,17 @@ failure:
 */
 int QUICK_ROR_INTERSECT_SELECT::init_ror_merged_scan(bool reuse_handler)
 {
-  List_iterator_fast<QUICK_RANGE_SELECT> quick_it(quick_selects);
-  QUICK_RANGE_SELECT* quick;
+  List_iterator_fast<QUICK_SELECT_WITH_RECORD> quick_it(quick_selects);
+  QUICK_SELECT_WITH_RECORD *cur;
+  QUICK_RANGE_SELECT *quick;
   DBUG_ENTER("QUICK_ROR_INTERSECT_SELECT::init_ror_merged_scan");
 
   /* Initialize all merged "children" quick selects */
   DBUG_ASSERT(!need_to_fetch_row || reuse_handler);
   if (!need_to_fetch_row && reuse_handler)
   {
-    quick= quick_it++;
+    cur= quick_it++;
+    quick= cur->quick;
     /*
       There is no use of this->file. Use it for the first of merged range
       selects.
@@ -1518,8 +2095,9 @@ int QUICK_ROR_INTERSECT_SELECT::init_ror_merged_scan(bool reuse_handler)
       DBUG_RETURN(1);
     quick->file->extra(HA_EXTRA_KEYREAD_PRESERVE_FIELDS);
   }
-  while ((quick= quick_it++))
+  while ((cur= quick_it++))
   {
+    quick= cur->quick;
     if (quick->init_ror_merged_scan(FALSE))
       DBUG_RETURN(1);
     quick->file->extra(HA_EXTRA_KEYREAD_PRESERVE_FIELDS);
@@ -1551,10 +2129,10 @@ int QUICK_ROR_INTERSECT_SELECT::reset()
   if (!scans_inited && init_ror_merged_scan(TRUE))
     DBUG_RETURN(1);
   scans_inited= TRUE;
-  List_iterator_fast<QUICK_RANGE_SELECT> it(quick_selects);
-  QUICK_RANGE_SELECT *quick;
-  while ((quick= it++))
-    quick->reset();
+  List_iterator_fast<QUICK_SELECT_WITH_RECORD> it(quick_selects);
+  QUICK_SELECT_WITH_RECORD *qr;
+  while ((qr= it++))
+    qr->quick->reset();
   DBUG_RETURN(0);
 }
 
@@ -1564,6 +2142,7 @@ int QUICK_ROR_INTERSECT_SELECT::reset()
 
   SYNOPSIS
     QUICK_ROR_INTERSECT_SELECT::push_quick_back()
+      alloc Mem root to create auxiliary structures on
       quick Quick select to be added. The quick select must return
             rows in rowid order.
   NOTES
@@ -1575,11 +2154,17 @@ int QUICK_ROR_INTERSECT_SELECT::reset()
 */
 
 bool
-QUICK_ROR_INTERSECT_SELECT::push_quick_back(QUICK_RANGE_SELECT *quick)
+QUICK_ROR_INTERSECT_SELECT::push_quick_back(MEM_ROOT *alloc, QUICK_RANGE_SELECT *quick)
 {
-  return quick_selects.push_back(quick);
+  QUICK_SELECT_WITH_RECORD *qr;
+  if (!(qr= new QUICK_SELECT_WITH_RECORD) || 
+      !(qr->key_tuple= (uchar*)alloc_root(alloc, quick->max_used_key_length)))
+    return TRUE;
+  qr->quick= quick;
+  return quick_selects.push_back(qr);
 }
 
+
 QUICK_ROR_INTERSECT_SELECT::~QUICK_ROR_INTERSECT_SELECT()
 {
   DBUG_ENTER("QUICK_ROR_INTERSECT_SELECT::~QUICK_ROR_INTERSECT_SELECT");
@@ -1748,6 +2333,7 @@ SEL_ARG::SEL_ARG(SEL_ARG &arg) :Sql_alloc()
   min_value=arg.min_value;
   max_value=arg.max_value;
   next_key_part=arg.next_key_part;
+  max_part_no= arg.max_part_no;
   use_count=1; elements=1;
 }
 
@@ -1765,9 +2351,10 @@ SEL_ARG::SEL_ARG(Field *f,const uchar *min_value_arg,
   :min_flag(0), max_flag(0), maybe_flag(0), maybe_null(f->real_maybe_null()),
    elements(1), use_count(1), field(f), min_value((uchar*) min_value_arg),
    max_value((uchar*) max_value_arg), next(0),prev(0),
-   next_key_part(0),color(BLACK),type(KEY_RANGE)
+   next_key_part(0), color(BLACK), type(KEY_RANGE)
 {
   left=right= &null_element;
+  max_part_no= 1;
 }
 
 SEL_ARG::SEL_ARG(Field *field_,uint8 part_,
@@ -1778,6 +2365,7 @@ SEL_ARG::SEL_ARG(Field *field_,uint8 part_,
    field(field_), min_value(min_value_), max_value(max_value_),
    next(0),prev(0),next_key_part(0),color(BLACK),type(KEY_RANGE)
 {
+  max_part_no= part+1;
   left=right= &null_element;
 }
 
@@ -1821,6 +2409,7 @@ SEL_ARG *SEL_ARG::clone(RANGE_OPT_PARAM *param, SEL_ARG *new_parent,
   increment_use_count(1);
   tmp->color= color;
   tmp->elements= this->elements;
+  tmp->max_part_no= max_part_no;
   return tmp;
 }
 
@@ -2036,6 +2625,26 @@ public:
 
 
 /*
+  Plan for QUICK_INDEX_INTERSECT_SELECT scan.
+  QUICK_INDEX_INTERSECT_SELECT always retrieves full rows, so retrieve_full_rows
+  is ignored by make_quick.
+*/
+
+class TRP_INDEX_INTERSECT : public TABLE_READ_PLAN
+{
+public:
+  TRP_INDEX_INTERSECT() {}                        /* Remove gcc warning */
+  virtual ~TRP_INDEX_INTERSECT() {}               /* Remove gcc warning */
+  QUICK_SELECT_I *make_quick(PARAM *param, bool retrieve_full_rows,
+                             MEM_ROOT *parent_alloc);
+  TRP_RANGE **range_scans; /* array of ptrs to plans of intersected scans */
+  TRP_RANGE **range_scans_end; /* end of the array */
+  /* keys whose scans are to be filtered by cpk conditions */
+  key_map filtered_scans;  
+};
+
+
+/*
   Plan for QUICK_INDEX_MERGE_SELECT scan.
   QUICK_ROR_INTERSECT_SELECT always retrieves full rows, so retrieve_full_rows
   is ignored by make_quick.
@@ -2106,6 +2715,38 @@ public:
 };
 
 
+typedef struct st_index_scan_info
+{
+  uint      idx;      /* # of used key in param->keys */
+  uint      keynr;    /* # of used key in table */
+  uint      range_count;
+  ha_rows   records;  /* estimate of # records this scan will return */
+
+  /* Set of intervals over key fields that will be used for row retrieval. */
+  SEL_ARG   *sel_arg;
+
+  KEY *key_info;
+  uint used_key_parts;
+
+  /* Estimate of # records filtered out by intersection with cpk */
+  ha_rows   filtered_out;
+  /* Bitmap of fields used in index intersection */ 
+  MY_BITMAP used_fields;
+
+  /* Fields used in the query and covered by ROR scan. */
+  MY_BITMAP covered_fields;
+  uint      used_fields_covered; /* # of set bits in covered_fields */
+  int       key_rec_length; /* length of key record (including rowid) */
+
+  /*
+    Cost of reading all index records with values in sel_arg intervals set
+    (assuming there is no need to access full table records)
+  */
+  double    index_read_cost;
+  uint      first_uncovered_field; /* first unused bit in covered_fields */
+  uint      key_components; /* # of parts in the key */
+} INDEX_SCAN_INFO;
+
 /*
   Fill param->needed_fields with bitmap of fields used in the query.
   SYNOPSIS
@@ -2231,7 +2872,8 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use,
   quick=0;
   needed_reg.clear_all();
   quick_keys.clear_all();
-  if (keys_to_use.is_clear_all())
+  DBUG_ASSERT(!head->is_filled_at_execution());
+  if (keys_to_use.is_clear_all() || head->is_filled_at_execution())
     DBUG_RETURN(0);
   records= head->file->stats.records;
   if (!records)
@@ -2381,72 +3023,92 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use,
         It is possible to use a range-based quick select (but it might be
         slower than 'all' table scan).
       */
-      if (tree->merges.is_empty())
-      {
-        TRP_RANGE         *range_trp;
-        TRP_ROR_INTERSECT *rori_trp;
-        bool can_build_covering= FALSE;
+      TRP_RANGE         *range_trp;
+      TRP_ROR_INTERSECT *rori_trp;
+      TRP_INDEX_INTERSECT *intersect_trp;
+      bool can_build_covering= FALSE;
+      
+      remove_nonrange_trees(&param, tree);
 
-        /* Get best 'range' plan and prepare data for making other plans */
-        if ((range_trp= get_key_scans_params(&param, tree, FALSE, TRUE,
-                                             best_read_time)))
-        {
-          best_trp= range_trp;
-          best_read_time= best_trp->read_cost;
-        }
+      /* Get best 'range' plan and prepare data for making other plans */
+      if ((range_trp= get_key_scans_params(&param, tree, FALSE, TRUE,
+                                           best_read_time)))
+      {
+        best_trp= range_trp;
+        best_read_time= best_trp->read_cost;
+      }
 
+      /*
+        Simultaneous key scans and row deletes on several handler
+        objects are not allowed so don't use ROR-intersection for
+        table deletes.
+      */
+      if ((thd->lex->sql_command != SQLCOM_DELETE) && 
+           optimizer_flag(thd, OPTIMIZER_SWITCH_INDEX_MERGE))
+      {
         /*
-          Simultaneous key scans and row deletes on several handler
-          objects are not allowed so don't use ROR-intersection for
-          table deletes.
+          Get best non-covering ROR-intersection plan and prepare data for
+          building covering ROR-intersection.
         */
-        if ((thd->lex->sql_command != SQLCOM_DELETE) && 
-             optimizer_flag(thd, OPTIMIZER_SWITCH_INDEX_MERGE))
+        if ((rori_trp= get_best_ror_intersect(&param, tree, best_read_time,
+                                              &can_build_covering)))
         {
+          best_trp= rori_trp;
+          best_read_time= best_trp->read_cost;
           /*
-            Get best non-covering ROR-intersection plan and prepare data for
-            building covering ROR-intersection.
+            Try constructing covering ROR-intersect only if it looks possible
+            and worth doing.
           */
-          if ((rori_trp= get_best_ror_intersect(&param, tree, best_read_time,
-                                                &can_build_covering)))
-          {
+          if (!rori_trp->is_covering && can_build_covering &&
+              (rori_trp= get_best_covering_ror_intersect(&param, tree,
+                                                         best_read_time)))
             best_trp= rori_trp;
-            best_read_time= best_trp->read_cost;
-            /*
-              Try constructing covering ROR-intersect only if it looks possible
-              and worth doing.
-            */
-            if (!rori_trp->is_covering && can_build_covering &&
-                (rori_trp= get_best_covering_ror_intersect(&param, tree,
-                                                           best_read_time)))
-              best_trp= rori_trp;
-          }
         }
       }
-      else
+      /*
+        Do not look for an index intersection  plan if there is a covering
+        index. The scan by this covering index will be always cheaper than
+        any index intersection.
+      */
+      if (param.table->covering_keys.is_clear_all() &&
+          optimizer_flag(thd, OPTIMIZER_SWITCH_INDEX_MERGE) &&
+          optimizer_flag(thd, OPTIMIZER_SWITCH_INDEX_MERGE_SORT_INTERSECT))
       {
-        if (optimizer_flag(thd, OPTIMIZER_SWITCH_INDEX_MERGE))
+        if ((intersect_trp= get_best_index_intersect(&param, tree,
+                                                    best_read_time)))
         {
-          /* Try creating index_merge/ROR-union scan. */
-          SEL_IMERGE *imerge;
-          TABLE_READ_PLAN *best_conj_trp= NULL, *new_conj_trp;
-          LINT_INIT(new_conj_trp); /* no empty index_merge lists possible */
-          DBUG_PRINT("info",("No range reads possible,"
-                             " trying to construct index_merge"));
-          List_iterator_fast<SEL_IMERGE> it(tree->merges);
-          while ((imerge= it++))
+          best_trp= intersect_trp;
+          best_read_time= best_trp->read_cost; 
+          set_if_smaller(param.table->quick_condition_rows, 
+                         intersect_trp->records);
+        }
+      }
+
+      if (optimizer_flag(thd, OPTIMIZER_SWITCH_INDEX_MERGE))
+      {
+        /* Try creating index_merge/ROR-union scan. */
+        SEL_IMERGE *imerge;
+        TABLE_READ_PLAN *best_conj_trp= NULL, *new_conj_trp;
+        LINT_INIT(new_conj_trp); /* no empty index_merge lists possible */
+        DBUG_PRINT("info",("No range reads possible,"
+                           " trying to construct index_merge"));
+        List_iterator_fast<SEL_IMERGE> it(tree->merges);
+        while ((imerge= it++))
+        {
+          new_conj_trp= get_best_disjunct_quick(&param, imerge, best_read_time);
+          if (new_conj_trp)
+            set_if_smaller(param.table->quick_condition_rows, 
+                           new_conj_trp->records);
+          if (new_conj_trp &&
+              (!best_conj_trp || 
+               new_conj_trp->read_cost < best_conj_trp->read_cost))
           {
-            new_conj_trp= get_best_disjunct_quick(&param, imerge, best_read_time);
-            if (new_conj_trp)
-              set_if_smaller(param.table->quick_condition_rows, 
-                             new_conj_trp->records);
-            if (!best_conj_trp || (new_conj_trp && new_conj_trp->read_cost <
-                                   best_conj_trp->read_cost))
-              best_conj_trp= new_conj_trp;
+            best_conj_trp= new_conj_trp;
+            best_read_time= best_conj_trp->read_cost;
           }
-          if (best_conj_trp)
-            best_trp= best_conj_trp;
         }
+        if (best_conj_trp)
+          best_trp= best_conj_trp;
       }
     }
 
@@ -3743,11 +4405,19 @@ double get_sweep_read_cost(const PARAM *param, ha_rows records)
   DBUG_ENTER("get_sweep_read_cost");
   if (param->table->file->primary_key_is_clustered())
   {
+    /*
+      We are using the primary key to find the rows.
+      Calculate the cost for this.
+    */
     result= param->table->file->read_time(param->table->s->primary_key,
                                           (uint)records, records);
   }
   else
   {
+    /*
+      Rows will be retrieved with rnd_pos(). Calculate the expected
+      cost for this.
+    */
     double n_blocks=
       ceil(ulonglong2double(param->table->file->stats.data_file_length) /
            IO_SIZE);
@@ -3764,7 +4434,7 @@ double get_sweep_read_cost(const PARAM *param, ha_rows records)
       return 1;
     */
     JOIN *join= param->thd->lex->select_lex.join;
-    if (!join || join->tables == 1)
+    if (!join || join->table_count == 1)
     {
       /* No join, assume reading is done in one 'sweep' */
       result= busy_blocks*(DISK_SEEK_BASE_COST +
@@ -3855,7 +4525,6 @@ TABLE_READ_PLAN *get_best_disjunct_quick(PARAM *param, SEL_IMERGE *imerge,
 {
   SEL_TREE **ptree;
   TRP_INDEX_MERGE *imerge_trp= NULL;
-  uint n_child_scans= imerge->trees_next - imerge->trees;
   TRP_RANGE **range_scans;
   TRP_RANGE **cur_child;
   TRP_RANGE **cpk_scan= NULL;
@@ -3875,6 +4544,24 @@ TABLE_READ_PLAN *get_best_disjunct_quick(PARAM *param, SEL_IMERGE *imerge,
   DBUG_ENTER("get_best_disjunct_quick");
   DBUG_PRINT("info", ("Full table scan cost: %g", read_time));
 
+  /*
+    In every tree of imerge remove SEL_ARG trees that do not make ranges.
+    If after this removal some SEL_ARG tree becomes empty discard imerge.  
+  */
+  for (ptree= imerge->trees; ptree != imerge->trees_next; ptree++)
+  {
+    if (remove_nonrange_trees(param, *ptree))
+    {
+      imerge->trees_next= imerge->trees;
+      break;
+    }
+  }
+
+  uint n_child_scans= imerge->trees_next - imerge->trees;
+  
+  if (!n_child_scans)
+    DBUG_RETURN(NULL);
+
   if (!(range_scans= (TRP_RANGE**)alloc_root(param->mem_root,
                                              sizeof(TRP_RANGE*)*
                                              n_child_scans)))
@@ -3979,7 +4666,9 @@ TABLE_READ_PLAN *get_best_disjunct_quick(PARAM *param, SEL_IMERGE *imerge,
   imerge_cost +=
     Unique::get_use_cost(param->imerge_cost_buff, (uint)non_cpk_scan_records,
                          param->table->file->ref_length,
-                         param->thd->variables.sortbuff_size);
+                         param->thd->variables.sortbuff_size,
+                         TIME_FOR_COMPARE_ROWID,
+                         FALSE, NULL);
   DBUG_PRINT("info",("index_merge total cost: %g (wanted: less then %g)",
                      imerge_cost, read_time));
   if (imerge_cost < read_time)
@@ -3994,6 +4683,13 @@ TABLE_READ_PLAN *get_best_disjunct_quick(PARAM *param, SEL_IMERGE *imerge,
       imerge_trp->range_scans_end= range_scans + n_child_scans;
       read_time= imerge_cost;
     }
+    if (imerge_trp)
+    {
+      TABLE_READ_PLAN *trp= merge_same_index_scans(param, imerge, imerge_trp,
+                                                   read_time);
+      if (trp != imerge_trp)
+        DBUG_RETURN(trp);
+    }
   }
 
 build_ror_index_merge:
@@ -4009,6 +4705,7 @@ build_ror_index_merge:
                                         sizeof(TABLE_READ_PLAN*)*
                                         n_child_scans)))
     DBUG_RETURN(imerge_trp);
+
 skip_to_ror_scan:
   roru_index_costs= 0.0;
   roru_total_records= 0;
@@ -4092,30 +4789,990 @@ skip_to_ror_scan:
       DBUG_RETURN(roru);
     }
   }
-  DBUG_RETURN(imerge_trp);
+    DBUG_RETURN(imerge_trp);
 }
 
-typedef struct st_ror_scan_info
+
+/*
+  Merge index scans for the same indexes in an index merge plan
+
+  SYNOPSIS
+    merge_same_index_scans()
+      param           Context info for the operation
+      imerge   IN/OUT SEL_IMERGE from which imerge_trp has been extracted          
+      imerge_trp      The index merge plan where index scans for the same
+                      indexes are to be merged
+      read_time       The upper bound for the cost of the plan to be evaluated
+
+  DESCRIPTION
+    For the given index merge plan imerge_trp extracted from the SEL_IMERGE
+    imerge the function looks for range scans with the same indexes and merges
+    them into SEL_ARG trees. Then for each such SEL_ARG tree r_i the function
+    creates a range tree rt_i that contains only r_i. All rt_i are joined
+    into one index merge that replaces the original index merge imerge.
+    The function calls get_best_disjunct_quick for the new index merge to
+    get a new index merge plan that contains index scans only for different
+    indexes.
+    If there are no index scans for the same index in the original index
+    merge plan the function does not change the original imerge and returns
+    imerge_trp as its result.
+
+  RETURN
+    The original or the improved index merge plan
+*/
+
+static
+TABLE_READ_PLAN *merge_same_index_scans(PARAM *param, SEL_IMERGE *imerge,
+                                        TRP_INDEX_MERGE *imerge_trp,
+                                        double read_time)
 {
-  uint      idx;      /* # of used key in param->keys */
-  uint      keynr;    /* # of used key in table */
-  ha_rows   records;  /* estimate of # records this scan will return */
+  uint16 first_scan_tree_idx[MAX_KEY]; /* per key: 1-based tree idx, 0= none */
+  SEL_TREE **tree;
+  TRP_RANGE **cur_child;
+  uint removed_cnt= 0;                 /* # of trees merged into earlier ones */
 
-  /* Set of intervals over key fields that will be used for row retrieval. */
-  SEL_ARG   *sel_arg;
+  DBUG_ENTER("merge_same_index_scans");
 
-  /* Fields used in the query and covered by this ROR scan. */
-  MY_BITMAP covered_fields;
-  uint      used_fields_covered; /* # of set bits in covered_fields */
-  int       key_rec_length; /* length of key record (including rowid) */
+  bzero(first_scan_tree_idx, sizeof(first_scan_tree_idx[0])*param->keys);
 
-  /*
-    Cost of reading all index records with values in sel_arg intervals set
-    (assuming there is no need to access full table records)
-  */
-  double    index_read_cost;
-  uint      first_uncovered_field; /* first unused bit in covered_fields */
-  uint      key_components; /* # of parts in the key */
+  for (tree= imerge->trees, cur_child= imerge_trp->range_scans;
+       tree != imerge->trees_next;
+       tree++, cur_child++)
+  {
+    DBUG_ASSERT(tree);    /* NOTE(review): always true; was *tree intended? */
+    uint key_idx= (*cur_child)->key_idx;
+    uint16 *tree_idx_ptr= &first_scan_tree_idx[key_idx];
+    if (!*tree_idx_ptr)                /* first scan seen for this index */
+      *tree_idx_ptr= (uint16) (tree-imerge->trees+1);
+    else                               /* OR this scan into the first one */
+    {
+      SEL_TREE **changed_tree= imerge->trees+(*tree_idx_ptr-1);
+      SEL_ARG *key= (*changed_tree)->keys[key_idx];
+      bzero((*changed_tree)->keys,     /* drop ranges for all other keys */
+            sizeof((*changed_tree)->keys[0])*param->keys);
+      (*changed_tree)->keys_map.clear_all();
+      if (((*changed_tree)->keys[key_idx]=
+             key_or(param, key, (*tree)->keys[key_idx])))
+        (*changed_tree)->keys_map.set_bit(key_idx);
+      *tree= NULL;                     /* slot is dropped by compaction below */
+      removed_cnt++;
+    }
+  }
+  if (!removed_cnt)                    /* no two scans use the same index */
+    DBUG_RETURN(imerge_trp);
+
+  TABLE_READ_PLAN *trp= NULL;
+  SEL_TREE **new_trees_next= imerge->trees;
+  for (tree= new_trees_next; tree != imerge->trees_next; tree++)
+  {
+    if (!*tree)                        /* skip slots of merged-away trees */
+      continue;
+    if (tree > new_trees_next)
+      *new_trees_next= *tree;
+    new_trees_next++;
+  }
+  imerge->trees_next= new_trees_next;
+
+  DBUG_ASSERT(imerge->trees_next>imerge->trees);
+
+  if (imerge->trees_next-imerge->trees > 1)
+    trp= get_best_disjunct_quick(param, imerge, read_time); /* re-plan imerge */
+  else
+  {
+    /*
+      This alternative theoretically can be reached when the cost
+      of the index merge for such a formula as
+        (key1 BETWEEN c1_1 AND c1_2) AND key2 > c2 OR
+        (key1 BETWEEN c1_3 AND c1_4) AND key3 > c3
+      is estimated as being cheaper than the cost of index scan for
+      the formula
+        (key1 BETWEEN c1_1 AND c1_2) OR (key1 BETWEEN c1_3 AND c1_4)
+      
+      In the current code this may happen for two reasons:
+      1. for a single index range scan data records are accessed in
+         a random order
+      2. the functions that estimate the cost of a range scan and an
+         index merge retrievals are not well calibrated
+    */
+    trp= get_key_scans_params(param, *imerge->trees, FALSE, TRUE,
+                              read_time);
+  }
+
+  DBUG_RETURN(trp); 
+}
+
+
+/*
+  This structure contains the info common for all steps of a partial
+  index intersection plan. Moreover it contains also the info common
+  for index intersect plans. This info is filled in by the function
+  prepare_search_best_index_intersect just before searching for the best
+  index intersection plan.
+*/
+
+typedef struct st_common_index_intersect_info
+{
+  PARAM *param;           /* context info for range optimizations            */
+  uint key_size;          /* size of a ROWID element stored in Unique object */
+  uint compare_factor;         /* 1/compare - cost to compare two ROWIDs     */
+  ulonglong max_memory_size;   /* maximum space allowed for Unique objects   */
+  ha_rows table_cardinality;   /* estimate of the number of records in table */
+  double cutoff_cost;        /* discard index intersects with greater costs  */
+  INDEX_SCAN_INFO *cpk_scan;  /* clustered primary key scan, NULL if not used */
+
+  bool in_memory;  /* unique object for intersection is completely in memory */
+
+  INDEX_SCAN_INFO **search_scans;  /* NULL-terminated array of candidate scans */
+  uint n_search_scans;               /* number of elements in search_scans   */
+
+  bool best_uses_cpk;   /* current best intersect uses clustered primary key */
+  double best_cost;       /* cost of the current best index intersection     */
+  /* estimate of the number of records in the current best intersection      */
+  ha_rows best_records;
+  uint best_length;    /* number of indexes in the current best intersection */
+  INDEX_SCAN_INFO **best_intersect;  /* the current best index intersection  */
+  /* scans from the best intersect to be filtered by cpk conditions          */
+  key_map filtered_scans; 
+
+  uint *buff_elems;        /* buffer to calculate cost of index intersection */
+  
+} COMMON_INDEX_INTERSECT_INFO;
+
+
+/*
+  This structure contains the info specific for one step of an index
+  intersection plan. The structure is filled in by the function 
+   check_index_intersect_extension.
+*/
+
+typedef struct st_partial_index_intersect_info
+{
+  COMMON_INDEX_INTERSECT_INFO *common_info;    /* shared by index intersects */
+  uint length;         /* number of index scans in the partial intersection  */
+  ha_rows records;     /* estimate of the number of records in intersection  */
+  double cost;         /* cost of the partial index intersection             */
+
+  /* estimate of total number of records of all scans of the partial index
+     intersect sent to the Unique object used for the intersection  */
+  ha_rows records_sent_to_unique;
+
+  /* total cost of the scans of indexes from the partial index intersection  */
+  double index_read_cost; 
+
+  bool use_cpk_filter;       /* cpk filter is to be used for this scan       */
+  bool in_memory;            /* uses unique object in memory                 */
+  double in_memory_cost;     /* cost of using unique object in memory        */
+
+  key_map filtered_scans;    /* scans to be filtered by cpk conditions       */
+         
+  MY_BITMAP *intersect_fields;     /* bitmap of fields used in intersection  */
+} PARTIAL_INDEX_INTERSECT_INFO;
+
+
+/* Check whether two indexes have the same first used_parts components */
+
+static
+bool same_index_prefix(KEY *key1, KEY *key2, uint used_parts)
+{
+  KEY_PART_INFO *part1= key1->key_part;
+  KEY_PART_INFO *part2= key2->key_part;
+  for(uint i= 0; i < used_parts; i++, part1++, part2++)
+  {
+    if (part1->fieldnr != part2->fieldnr)  /* parts compared by fieldnr only */
+      return FALSE;
+  }
+  return TRUE;
+}
+
+
+/* Create a bitmap for all fields of a table; return TRUE on failure */
+
+static
+bool create_fields_bitmap(PARAM *param, MY_BITMAP *fields_bitmap)
+{
+  my_bitmap_map *bitmap_buf;           /* storage taken from param->mem_root */
+
+  if (!(bitmap_buf= (my_bitmap_map *) alloc_root(param->mem_root,
+                                                 param->fields_bitmap_size)))
+    return TRUE;                       /* allocation failed */
+  if (bitmap_init(fields_bitmap, bitmap_buf, param->table->s->fields, FALSE))
+    return TRUE;
+  
+  return FALSE;
+}
+
+/* Compare two index scans by 'records'; used to sort them before the search */
+
+static
+int cmp_intersect_index_scan(INDEX_SCAN_INFO **a, INDEX_SCAN_INFO **b)
+{
+  return (*a)->records < (*b)->records ?              /* ascending order */
+          -1 : (*a)->records == (*b)->records ? 0 : 1;
+}
+
+
+static inline  /* Make field_bitmap hold exactly the fields of an index prefix */
+void set_field_bitmap_for_index_prefix(MY_BITMAP *field_bitmap,
+                                       KEY_PART_INFO *key_part,
+                                       uint used_key_parts)
+{
+  bitmap_clear_all(field_bitmap);      /* discard any previous contents */
+  for (KEY_PART_INFO *key_part_end= key_part+used_key_parts;
+       key_part < key_part_end; key_part++)
+  {
+    bitmap_set_bit(field_bitmap, key_part->fieldnr-1); /* bit no= fieldnr-1 */
+  }
+}
+
+
+/*
+  Round the table cardinality read from statistics provided by engine to one
+  significant decimal digit, unless HA_STATS_RECORDS_IS_EXACT is set.
+  This function should go away when mysql test will allow to handle more
+  or less easily in the test suites deviations of InnoDB statistical data.
+*/
+ 
+static inline
+ha_rows get_table_cardinality_for_index_intersect(TABLE *table)
+{
+  if (table->file->ha_table_flags() & HA_STATS_RECORDS_IS_EXACT)
+    return table->file->stats.records;
+  else
+  {
+    ha_rows d;                         /* power of ten divided out of q */
+    double q;                          /* records scaled down below 10 */
+    for (q= (double)table->file->stats.records, d= 1 ; q >= 10; q/= 10, d*= 10 ) ;
+    return (ha_rows) (floor(q+0.5) * d);
+  } 
+}
+
+  
+static  /* forward declaration: used by prepare_search_best_index_intersect */
+ha_rows records_in_index_intersect_extension(PARTIAL_INDEX_INTERSECT_INFO *curr,
+                                             INDEX_SCAN_INFO *ext_index_scan);
+
+/*
+  Prepare to search for the best index intersection
+
+  SYNOPSIS
+    prepare_search_best_index_intersect()
+      param         common info about index ranges
+      tree          tree of ranges for indexes that can be intersected
+      common    OUT info needed for search to be filled by the function 
+      init      OUT info for an initial pseudo step of the intersection plans
+      cutoff_cost   cut off cost of the interesting index intersection 
+
+  DESCRIPTION
+    The function initializes all fields of the structure 'common' to be used
+    when searching for the best intersection plan. It also allocates
+    memory to store the cheapest index intersection.
+
+  NOTES
+    When selecting candidates for index intersection we always take only
+    one representative out of any set of indexes that share the same range
+    conditions. These indexes always have the same prefixes and the
+    components of these prefixes are exactly those used in these range
+    conditions.
+    Range conditions over clustered primary key (cpk) are always used only
+    as the condition that filters out some rowids retrieved by the scans
+    for secondary indexes. The cpk index will be handled in a special way by
+    the function that searches for the best index intersection.
+
+  RETURN
+    FALSE  in the case of success
+    TRUE   otherwise
+*/
+
+static
+bool prepare_search_best_index_intersect(PARAM *param, 
+                                         SEL_TREE *tree,
+                                         COMMON_INDEX_INTERSECT_INFO *common,
+                                         PARTIAL_INDEX_INTERSECT_INFO *init,
+                                         double cutoff_cost)
+{
+  uint i;
+  uint n_search_scans;
+  double cost;
+  INDEX_SCAN_INFO **index_scan;
+  INDEX_SCAN_INFO **scan_ptr;
+  INDEX_SCAN_INFO *cpk_scan= NULL;
+  TABLE *table= param->table;
+  uint n_index_scans= tree->index_scans_end - tree->index_scans;
+
+  if (!n_index_scans)                  /* nothing to intersect */
+    return 1;
+
+  bzero(init, sizeof(*init));          /* initial pseudo step: length 0 */
+  init->common_info= common;
+  init->cost= cutoff_cost;
+
+  common->param= param;
+  common->key_size= table->file->ref_length;
+  common->compare_factor= TIME_FOR_COMPARE_ROWID;
+  common->max_memory_size= param->thd->variables.sortbuff_size;
+  common->cutoff_cost= cutoff_cost;
+  common->cpk_scan= NULL;
+  common->table_cardinality= 
+    get_table_cardinality_for_index_intersect(table);
+
+  if (n_index_scans <= 1)              /* a single scan cannot be intersected */
+    return TRUE;
+
+  if (table->file->primary_key_is_clustered())
+  {
+    INDEX_SCAN_INFO **index_scan_end;
+    index_scan= tree->index_scans;
+    index_scan_end= index_scan+n_index_scans;
+    for ( ; index_scan < index_scan_end; index_scan++)
+    {  
+      if ((*index_scan)->keynr == table->s->primary_key)
+      {
+        common->cpk_scan= cpk_scan= *index_scan;
+        break;
+      }
+    }
+  }
+
+  i= n_index_scans - test(cpk_scan != NULL) + 1; /* +1 for the NULL terminator */
+
+  if (!(common->search_scans =
+	(INDEX_SCAN_INFO **) alloc_root (param->mem_root,
+                                         sizeof(INDEX_SCAN_INFO *) * i)))
+    return TRUE;
+  bzero(common->search_scans, sizeof(INDEX_SCAN_INFO *) * i);
+
+  INDEX_SCAN_INFO **selected_index_scans= common->search_scans;
+    
+  for (i=0, index_scan= tree->index_scans; i < n_index_scans; i++, index_scan++)
+  {
+    uint used_key_parts= (*index_scan)->used_key_parts;
+    KEY *key_info= (*index_scan)->key_info;
+
+    if (*index_scan == cpk_scan)       /* cpk is never a search candidate */
+      continue;
+    if (cpk_scan && cpk_scan->used_key_parts >= used_key_parts &&
+        same_index_prefix(cpk_scan->key_info, key_info, used_key_parts))
+      continue;                        /* used prefix is a prefix of the cpk */
+
+    cost= table->file->keyread_time((*index_scan)->keynr,
+                                    (*index_scan)->range_count,
+                                    (*index_scan)->records);
+    if (cost >= cutoff_cost)           /* this scan alone is too expensive */
+      continue;
+   
+    for (scan_ptr= selected_index_scans; *scan_ptr ; scan_ptr++)
+    {
+      /*
+        When we have range conditions for two different indexes with the same
+        beginning it does not make sense to consider both of them for index 
+        intersection if the range conditions are covered by common initial
+        components of the indexes. Actually in this case the indexes are
+        guaranteed to have the same range conditions.
+      */
+      if ((*scan_ptr)->used_key_parts == used_key_parts &&
+          same_index_prefix((*scan_ptr)->key_info, key_info, used_key_parts))
+        break;
+    }
+    if (!*scan_ptr || cost < (*scan_ptr)->index_read_cost)
+    {                                  /* new prefix, or a cheaper duplicate */
+      *scan_ptr= *index_scan;
+      (*scan_ptr)->index_read_cost= cost;
+    }
+  } 
+
+  ha_rows records_in_scans= 0;         /* sum of records over selected scans */
+
+  for (scan_ptr=selected_index_scans, i= 0; *scan_ptr; scan_ptr++, i++)
+  {
+    if (create_fields_bitmap(param, &(*scan_ptr)->used_fields))
+      return TRUE;
+    records_in_scans+= (*scan_ptr)->records;
+  }
+  n_search_scans= i;
+
+  if (cpk_scan && create_fields_bitmap(param, &cpk_scan->used_fields))
+    return TRUE;
+  
+  if (!(common->n_search_scans= n_search_scans)) /* no candidate was selected */
+    return TRUE;
+    
+  common->best_uses_cpk= FALSE;
+  common->best_cost= cutoff_cost + COST_EPS;
+  common->best_length= 0;
+
+  if (!(common->best_intersect=
+	(INDEX_SCAN_INFO **) alloc_root (param->mem_root,
+                                         sizeof(INDEX_SCAN_INFO *) *
+                                         (i + test(cpk_scan != NULL)))))
+    return TRUE;
+
+  size_t calc_cost_buff_size=
+         Unique::get_cost_calc_buff_size((size_t)records_in_scans,
+                                         common->key_size,
+				         common->max_memory_size);
+  if (!(common->buff_elems= (uint *) alloc_root(param->mem_root,
+                                                calc_cost_buff_size)))
+    return TRUE;
+
+  my_qsort(selected_index_scans, n_search_scans, sizeof(INDEX_SCAN_INFO *),
+           (qsort_cmp) cmp_intersect_index_scan); /* ascending by records */
+
+  if (cpk_scan)
+  {
+    PARTIAL_INDEX_INTERSECT_INFO curr; /* pseudo plan holding the cpk scan only */
+    set_field_bitmap_for_index_prefix(&cpk_scan->used_fields,
+                                      cpk_scan->key_info->key_part,
+                                      cpk_scan->used_key_parts);
+    curr.common_info= common;
+    curr.intersect_fields= &cpk_scan->used_fields;
+    curr.records= cpk_scan->records;
+    curr.length= 1;
+    for (scan_ptr=selected_index_scans; *scan_ptr; scan_ptr++)
+    {
+      ha_rows scan_records= (*scan_ptr)->records;
+      ha_rows records= records_in_index_intersect_extension(&curr, *scan_ptr);
+      (*scan_ptr)->filtered_out= records >= scan_records ?
+                                   0 : scan_records-records; 
+    }
+  } 
+  else
+  {
+    for (scan_ptr=selected_index_scans; *scan_ptr; scan_ptr++)
+      (*scan_ptr)->filtered_out= 0;    /* no cpk condition to filter with */
+  }
+
+  return FALSE;
+}
+
+
+/*
+  On Estimation of the Number of Records in an Index Intersection 
+  ===============================================================
+
+  Consider query Q over table t. Let C be the WHERE condition of  this query,
+  and, idx1(a1_1,...,a1_k1) and idx2(a2_1,...,a2_k2) be some indexes defined
+  on table t.
+  Let rt1 and rt2 be the range trees extracted by the range optimizer from C
+  for idx1 and idx2 respectively.
+  Let #t be the estimate of the number of records in table t provided for the
+  optimizer. 
+  Let #r1 and #r2 be the estimates of the number of records in the range trees
+  rt1 and rt2, respectively, obtained by the range optimizer.
+
+  We need to get an estimate for the number of records in the index 
+  intersection of rt1 and rt2. In other words, we need to estimate the
+  cardinality of the set of records that are in both trees. Let's designate
+  this number by #r.
+
+  If we do not make any assumptions then we can only state that
+     #r<=min(#r1,#r2).
+  With this estimate we can't say that the index intersection scan will be 
+  cheaper than the cheapest index scan.
+
+  Let Rt1 and Rt2 be AND/OR conditions representing rt1 and rt2 respectively.
+  The probability that a record belongs to rt1 is sel(Rt1)=#r1/#t.
+  The probability that a record belongs to rt2 is sel(Rt2)=#r2/#t.
+
+  If we assume that the values in columns of idx1 and idx2 are independent
+  then #r/#t=sel(Rt1&Rt2)=sel(Rt1)*sel(Rt2)=(#r1/#t)*(#r2/#t).
+  So in this case we have: #r=#r1*#r2/#t.
+
+  The above assumption of independence of the columns in idx1 and idx2 means
+  that:
+  - all columns are different
+  - values from one column do not correlate with values from any other column.
+
+  We can't help with the case when columns correlate with each other.
+  Yet, if they are assumed to be uncorrelated the value of #r theoretically can
+  be evaluated. Unfortunately this evaluation, in general, is rather complex.
+
+  Let's consider two indexes idx1:(dept, manager),  idx2:(dept, building)
+  over table 'employee' and two range conditions over these indexes:
+    Rt1: dept=10 AND manager LIKE 'S%'
+    Rt2: dept=10 AND building LIKE 'L%'.
+  We can state that:
+    sel(Rt1&Rt2)=sel(dept=10)*sel(manager LIKE 'S%')*sel(building LIKE 'L%')
+    =sel(Rt1)*sel(Rt2)/sel(dept=10).
+  sel(Rt1/2_0:dept=10) can be estimated if we know the cardinality #r1_0 of
+  the range for sub-index idx1_0 (dept) of the index idx1 or the cardinality
+  #r2_0 of the same range for sub-index idx2_0(dept) of the index idx2.
+  The current code does not make an estimate either for #r1_0, or for #r2_0,
+  but it can be adjusted to provide those numbers.
+  Alternatively, min(rec_per_key) for (dept) could be used to get an upper 
+  bound for the value of sel(Rt1&Rt2). Yet this statistics is not provided
+  now.  
+ 
+  Let's consider two other indexes idx1:(dept, last_name), 
+  idx2:(first_name, last_name) and two range conditions over these indexes:
+    Rt1: dept=5 AND last_name='Sm%'
+    Rt2: first_name='Robert' AND last_name='Sm%'.
+
+  sel(Rt1&Rt2)=sel(dept=5)*sel(last_name='Sm%')*sel(first_name='Robert')
+  =sel(Rt2)*sel(dept=5)
+  Here max(rec_per_key) for (dept) could be used to get an upper bound for
+  the value of sel(Rt1&Rt2).
+  
+  When the intersected indexes have different major columns, but some
+  minor column are common the picture may be more complicated.
+
+  Let's consider the following range conditions for the same indexes as in
+  the previous example:
+    Rt1: (Rt11: dept=5 AND last_name='So%') 
+         OR 
+         (Rt12: dept=7 AND last_name='Saw%')
+    Rt2: (Rt21: first_name='Robert' AND last_name='Saw%')
+         OR
+         (Rt22: first_name='Bob' AND last_name='So%')
+  Here we have:
+  sel(Rt1&Rt2)= sel(Rt11)*sel(Rt21)+sel(Rt22)*sel(dept=5) +
+                sel(Rt21)*sel(dept=7)+sel(Rt12)*sel(Rt22)
+  Now consider the range condition:
+    Rt1_0: (dept=5 OR dept=7)
+  For this condition we can state that:
+  sel(Rt1_0&Rt2)=(sel(dept=5)+sel(dept=7))*(sel(Rt21)+sel(Rt22))=
+  sel(dept=5)*sel(Rt21)+sel(dept=7)*sel(Rt21)+
+  sel(dept=5)*sel(Rt22)+sel(dept=7)*sel(Rt22)=
+  sel(dept=5)*sel(Rt21)+sel(Rt21)*sel(dept=7)+
+  sel(Rt22)*sel(dept=5)+sel(dept=7)*sel(Rt22) >
+  sel(Rt11)*sel(Rt21)+sel(Rt22)*sel(dept=5)+
+  sel(Rt21)*sel(dept=7)+sel(Rt12)*sel(Rt22) >
+  sel(Rt1 & Rt2) 
+
+  We've just demonstrated for an example what is intuitively almost obvious
+  in general. We can remove the ending parts from range trees, getting less
+  selective range conditions for sub-indexes.
+  So if a component number k of an index idx, other than the most major one,
+  is encountered in the index with which we intersect, we can use the
+  sub-index idx_k-1 that includes the components of idx up to the (k-1)-th
+  component and the range tree for idx_k-1 to make an upper bound estimate
+  for the number of records in the index intersection.
+  The range tree for idx_k-1 we use here is the subtree of the original range
+  tree for idx that contains only parts from the first k-1 components.
+
+  As it was mentioned above the range optimizer currently does not provide
+  an estimate for the number of records in the ranges for sub-indexes.
+  However, some reasonable upper bound estimate can be obtained.
+
+  Let's consider the following range tree:
+    Rt: (first_name='Robert' AND last_name='Saw%')
+        OR
+        (first_name='Bob' AND last_name='So%')
+  Let #r be the number of records in Rt. Let f_1 be the fan-out of column
+  last_name:
+    f_1 = rec_per_key[first_name]/rec_per_key[last_name].
+  Then the number of records in the range tree:
+    Rt_0:  (first_name='Robert' OR first_name='Bob')
+  for the sub-index (first_name) is not greater than min(#r*f_1, #t).
+  Strictly speaking, we can state only that it's not greater than
+  min(#r*max_f_1, #t), where
+    max_f_1= max_rec_per_key[first_name]/min_rec_per_key[last_name].
+  Yet, if #r/#t is big enough (and this is the case of an index intersection,
+  because using this index range with a single index scan is cheaper than
+  the cost of the intersection when #r/#t is small) then almost safely we
+  can use here f_1 instead of max_f_1.
+
+  The above considerations can be used in future development. Now, they are
+  used partly in the function that provides a rough upper bound estimate for
+  the number of records in an index intersection that follows below.
+*/
+
+/*
+  Estimate the number of records in an extension of a partial intersection
+
+  SYNOPSIS
+    records_in_index_intersect_extension()
+     curr            partial intersection plan to be extended
+     ext_index_scan  the evaluated extension of this partial plan
+
+  DESCRIPTION
+    The function provides an estimate for the number of records in the
+    intersection of the partial index intersection curr with the index
+    ext_index_scan. If the intersected indexes do not have common columns
+    then the function returns an exact estimate (assuming there are no
+    correlations between values in the columns). If the intersected indexes
+    have common columns the function returns an upper bound for the number
+    of records in the intersection provided that the intersection of curr
+    with ext_index_scan is expected to have fewer records than the expected
+    number of records in the partial intersection curr. In this case the
+    function also assigns the bitmap of the columns in the extended
+    intersection to ext_index_scan->used_fields.
+    If the function cannot expect that the number of records in the extended
+    intersection is less than the expected number of records #r in curr then
+    the function returns a number bigger than #r.
+
+  NOTES
+   See the comment before the description of the function that explains the
+   reasoning used by this function.
+    
+  RETURN
+    The expected number of rows in the extended index intersection
+*/
+
+static
+ha_rows records_in_index_intersect_extension(PARTIAL_INDEX_INTERSECT_INFO *curr,
+                                             INDEX_SCAN_INFO *ext_index_scan)
+{
+  KEY *key_info= ext_index_scan->key_info;
+  KEY_PART_INFO* key_part= key_info->key_part;
+  uint used_key_parts= ext_index_scan->used_key_parts;
+  MY_BITMAP *used_fields= &ext_index_scan->used_fields;
+  
+  if (!curr->length)
+  {
+    /* 
+      If this is the first index in the intersection just mark the
+      fields in the used_fields bitmap and return the expected
+      number of records in the range scan for the index provided
+      by the range optimizer.
+    */ 
+    set_field_bitmap_for_index_prefix(used_fields, key_part, used_key_parts);
+    return ext_index_scan->records;
+  }
+
+  uint i;                  /* # of leading key parts not yet in intersection */
+  bool better_selectivity= FALSE;
+  ha_rows records= curr->records;
+  MY_BITMAP *curr_intersect_fields= curr->intersect_fields; 
+  for (i= 0; i < used_key_parts; i++, key_part++)
+  {
+    if (bitmap_is_set(curr_intersect_fields, key_part->fieldnr-1))
+      break;               /* first key part whose field is already used */
+  }
+  if (i)                   /* the scan contributes at least one new field */
+  {
+    ha_rows table_cardinality= curr->common_info->table_cardinality;
+    ha_rows ext_records= ext_index_scan->records;
+    if (i < used_key_parts)
+    {
+      ulong *rec_per_key= key_info->rec_per_key+i-1;
+      ulong f1= rec_per_key[0] ? rec_per_key[0] : 1;   /* avoid zero factors */
+      ulong f2= rec_per_key[1] ? rec_per_key[1] : 1;
+      ext_records= (ha_rows) ((double) ext_records / f2 * f1);
+    }
+    if (ext_records < table_cardinality)
+    {
+      better_selectivity= TRUE;
+      records= (ha_rows) ((double) records / table_cardinality *
+			  ext_records);
+      bitmap_copy(used_fields, curr_intersect_fields);
+      key_part= key_info->key_part;
+      for (uint j= 0; j < used_key_parts; j++, key_part++)
+        bitmap_set_bit(used_fields, key_part->fieldnr-1);
+    }
+  }
+  return !better_selectivity ? records+1 :   /* > curr->records: no gain */
+                               !records ? 1 : records; /* never returns 0 */
+}
+
+
+/* 
+  Estimate the cost of a binary search within disjoint cpk range intervals
+
+  Number of comparisons to check whether a cpk value satisfies the cpk range
+  condition = log2(cpk_scan->range_count+1); each costs 1/compare_factor.
+*/ 
+
+static inline
+double get_cpk_filter_cost(ha_rows filtered_records, 
+                           INDEX_SCAN_INFO *cpk_scan,
+                           double compare_factor)
+{
+  return log((double) (cpk_scan->range_count+1)) / (compare_factor * M_LN2) *
+           filtered_records;           /* one search per checked record */
+}
+
+
+/*
+  Check whether a partial index intersection plan can be extended
+
+  SYNOPSIS
+    check_index_intersect_extension()
+     curr            partial intersection plan to be extended
+     ext_index_scan  a possible extension of this plan to be checked
+     next       OUT  the structure to be filled for the extended plan 
+
+  DESCRIPTION
+    The function checks whether it makes sense to extend the index
+    intersection plan adding the index ext_index_scan, and, if this is
+    the case, the function fills in the structure for the extended plan.
+
+  RETURN
+    TRUE      if it makes sense to extend the given plan 
+    FALSE     otherwise
+*/
+
+static
+bool check_index_intersect_extension(PARTIAL_INDEX_INTERSECT_INFO *curr,
+                                     INDEX_SCAN_INFO *ext_index_scan,
+                                     PARTIAL_INDEX_INTERSECT_INFO *next)
+{
+  ha_rows records;
+  ha_rows records_sent_to_unique;
+  double cost;
+  ha_rows ext_index_scan_records= ext_index_scan->records;
+  ha_rows records_filtered_out_by_cpk= ext_index_scan->filtered_out;
+  COMMON_INDEX_INTERSECT_INFO *common_info= curr->common_info;
+  double cutoff_cost= common_info->cutoff_cost;
+  uint idx= curr->length;              /* # of scans already in the plan */
+  next->index_read_cost= curr->index_read_cost+ext_index_scan->index_read_cost;
+  if (next->index_read_cost > cutoff_cost) /* index reads alone cost too much */
+    return FALSE; 
+
+  if ((next->in_memory= curr->in_memory))
+    next->in_memory_cost= curr->in_memory_cost;
+
+  next->intersect_fields= &ext_index_scan->used_fields;
+  next->filtered_scans= curr->filtered_scans;
+
+  records_sent_to_unique= curr->records_sent_to_unique;
+
+  next->use_cpk_filter= FALSE;
+
+  /* Calculate the cost of using a Unique object for index intersection */
+  if (idx && next->in_memory)
+  { 
+    /* 
+      All rowids received from the first scan are expected in one unique tree
+    */
+    ha_rows elems_in_tree= common_info->search_scans[0]->records-
+                           common_info->search_scans[0]->filtered_out ;
+    next->in_memory_cost+= Unique::get_search_cost(elems_in_tree,
+                                                   common_info->compare_factor)* 
+                             ext_index_scan_records;
+    cost= next->in_memory_cost;
+  }
+  else
+  {
+    uint *buff_elems= common_info->buff_elems;
+    uint key_size= common_info->key_size;
+    uint compare_factor= common_info->compare_factor;         
+    ulonglong max_memory_size= common_info->max_memory_size; 
+    
+    records_sent_to_unique+= ext_index_scan_records;
+    cost= Unique::get_use_cost(buff_elems, (size_t) records_sent_to_unique, key_size,
+                               max_memory_size, compare_factor, TRUE,
+                               &next->in_memory);
+    if (records_filtered_out_by_cpk)
+    {
+      /* Check whether using cpk filter for this scan is beneficial */
+
+      double cost2;                    /* cost when the cpk filter is applied */
+      bool in_memory2;
+      ha_rows records2= records_sent_to_unique-records_filtered_out_by_cpk;
+      cost2=  Unique::get_use_cost(buff_elems, (size_t) records2, key_size,
+                                   max_memory_size, compare_factor, TRUE,
+                                   &in_memory2);
+      cost2+= get_cpk_filter_cost(ext_index_scan_records, common_info->cpk_scan,
+                                  compare_factor);
+      if (cost > cost2 + COST_EPS)
+      {
+        cost= cost2;
+        next->in_memory= in_memory2;
+        next->use_cpk_filter= TRUE;
+        records_sent_to_unique= records2;
+      }
+
+    }   
+    if (next->in_memory)
+      next->in_memory_cost= cost;
+  }
+
+  if (next->use_cpk_filter)
+  {
+    next->filtered_scans.set_bit(ext_index_scan->keynr);
+    bitmap_union(&ext_index_scan->used_fields,
+                 &common_info->cpk_scan->used_fields);
+  }
+  next->records_sent_to_unique= records_sent_to_unique;
+       
+  records= records_in_index_intersect_extension(curr, ext_index_scan);
+  if (idx && records > curr->records)  /* extension does not narrow the result */
+    return FALSE;
+  if (next->use_cpk_filter && curr->filtered_scans.is_clear_all())
+    records-= records_filtered_out_by_cpk; /* first cpk-filtered scan in plan */
+  next->records= records;
+
+  cost+= next->index_read_cost;
+  if (cost >= cutoff_cost)
+    return FALSE;
+
+  cost+= get_sweep_read_cost(common_info->param, records);
+
+  next->cost= cost;
+  next->length= curr->length+1;
+
+  return TRUE;
+}
+
+
+/*
+  Search for the cheapest extensions of range scans used to access a table    
+
+  SYNOPSIS
+    find_index_intersect_best_extension()
+      curr        partial intersection to evaluate all possible extensions for
+
+  DESCRIPTION
+    The function tries to extend the partial plan curr in all possible ways
+    to look for the cheapest index intersection whose cost is less than the
+    cut off value set in curr->common_info->cutoff_cost.
+*/
+
+static 
+void find_index_intersect_best_extension(PARTIAL_INDEX_INTERSECT_INFO *curr)
+{
+  PARTIAL_INDEX_INTERSECT_INFO next;
+  COMMON_INDEX_INTERSECT_INFO *common_info= curr->common_info;
+  INDEX_SCAN_INFO **index_scans= common_info->search_scans;
+  uint idx= curr->length;
+  INDEX_SCAN_INFO **rem_first_index_scan_ptr= &index_scans[idx];
+  double cost= curr->cost;
+
+  if (cost + COST_EPS < common_info->best_cost)
+  {
+    common_info->best_cost= cost;
+    common_info->best_length= curr->length;
+    common_info->best_records= curr->records;
+    common_info->filtered_scans= curr->filtered_scans;
+    /* common_info->best_uses_cpk <=> at least one scan uses a cpk filter */
+    common_info->best_uses_cpk= !curr->filtered_scans.is_clear_all();
+    uint sz= sizeof(INDEX_SCAN_INFO *) * curr->length;
+    memcpy(common_info->best_intersect, common_info->search_scans, sz);
+    common_info->cutoff_cost= cost;
+  }   
+
+  if (!(*rem_first_index_scan_ptr))
+    return;  
+
+  next.common_info= common_info;
+ 
+  INDEX_SCAN_INFO *rem_first_index_scan= *rem_first_index_scan_ptr;
+  for (INDEX_SCAN_INFO **index_scan_ptr= rem_first_index_scan_ptr;
+       *index_scan_ptr; index_scan_ptr++)
+  {
+    *rem_first_index_scan_ptr= *index_scan_ptr;
+    *index_scan_ptr= rem_first_index_scan;
+    if (check_index_intersect_extension(curr, *rem_first_index_scan_ptr, &next))
+      find_index_intersect_best_extension(&next);
+    *index_scan_ptr= *rem_first_index_scan_ptr;
+    *rem_first_index_scan_ptr= rem_first_index_scan;
+  }
+}
+
+
+/*
+  Get the plan of the best intersection of range scans used to access a table    
+
+  SYNOPSIS
+    get_best_index_intersect()
+      param         common info about index ranges
+      tree          tree of ranges for indexes that can be intersected
+      read_time     cut off value for the evaluated plans 
+
+  DESCRIPTION
+    The function looks for the cheapest index intersection of the range
+    scans to access a table. The info about the ranges for all indexes
+    is provided by the range optimizer and is passed through the
+    parameters param and tree. Any plan whose cost is greater than read_time
+    is rejected. 
+    After the best index intersection is found the function constructs
+    the structure that manages the execution by the chosen plan.
+
+  RETURN
+    Pointer to the generated execution structure if a success,
+    0 - otherwise.
+*/
+
+static
+TRP_INDEX_INTERSECT *get_best_index_intersect(PARAM *param, SEL_TREE *tree,
+                                              double read_time)
+{
+  uint i;
+  uint count;
+  TRP_RANGE **cur_range;
+  TRP_RANGE **range_scans;
+  INDEX_SCAN_INFO *index_scan;
+  COMMON_INDEX_INTERSECT_INFO common;
+  PARTIAL_INDEX_INTERSECT_INFO init;
+  TRP_INDEX_INTERSECT *intersect_trp= NULL;
+  TABLE *table= param->table;
+  
+  
+  DBUG_ENTER("get_best_index_intersect");
+
+  if (prepare_search_best_index_intersect(param, tree, &common, &init,
+                                          read_time))
+    DBUG_RETURN(NULL);
+
+  find_index_intersect_best_extension(&init);
+
+  if (common.best_length <= 1 && !common.best_uses_cpk)
+    DBUG_RETURN(NULL);
+
+  if (common.best_uses_cpk)
+  {
+    memmove((char *) (common.best_intersect+1), (char *) common.best_intersect,
+            sizeof(INDEX_SCAN_INFO *) * common.best_length);
+    common.best_intersect[0]= common.cpk_scan;
+    common.best_length++;
+  }
+
+  count= common.best_length;
+
+  if (!(range_scans= (TRP_RANGE**)alloc_root(param->mem_root,
+                                            sizeof(TRP_RANGE *)*
+                                            count)))
+    DBUG_RETURN(NULL);
+
+  for (i= 0, cur_range= range_scans; i < count; i++)
+  {
+    index_scan= common.best_intersect[i];
+    if ((*cur_range= new (param->mem_root) TRP_RANGE(index_scan->sel_arg,
+                                                     index_scan->idx, 0)))
+    {  
+      TRP_RANGE *trp= *cur_range;  
+      trp->read_cost= index_scan->index_read_cost;  
+      trp->records= index_scan->records;        
+      trp->is_ror= FALSE;
+      trp->mrr_buf_size= 0;
+      table->intersect_keys.set_bit(index_scan->keynr);
+      cur_range++;
+    }
+  }
+  
+  count= tree->index_scans_end - tree->index_scans;
+  for (i= 0; i < count; i++)
+  {
+    index_scan= tree->index_scans[i]; 
+    if (!table->intersect_keys.is_set(index_scan->keynr))
+    {
+      for (uint j= 0; j < common.best_length; j++)
+      {
+	INDEX_SCAN_INFO *scan= common.best_intersect[j];
+        if (same_index_prefix(index_scan->key_info, scan->key_info,
+                              scan->used_key_parts))
+	{
+          table->intersect_keys.set_bit(index_scan->keynr);
+          break;
+        } 
+      }
+    }
+  }
+      
+  if ((intersect_trp= new (param->mem_root)TRP_INDEX_INTERSECT))
+  {
+    intersect_trp->read_cost= common.best_cost;
+    intersect_trp->records= common.best_records;
+    intersect_trp->range_scans= range_scans;
+    intersect_trp->range_scans_end= cur_range;
+    intersect_trp->filtered_scans= common.filtered_scans;
+  }
+  DBUG_RETURN(intersect_trp);
+}
+
+
+typedef struct st_ror_scan_info : INDEX_SCAN_INFO
+{ 
 } ROR_SCAN_INFO;
 
 
@@ -4151,7 +5808,7 @@ ROR_SCAN_INFO *make_ror_scan(const PARAM *param, int idx, SEL_ARG *sel_arg)
   ror_scan->key_rec_length= (param->table->key_info[keynr].key_length +
                              param->table->file->ref_length);
   ror_scan->sel_arg= sel_arg;
-  ror_scan->records= param->table->quick_rows[keynr];
+  ror_scan->records= param->quick_rows[keynr];
 
   if (!(bitmap_buf= (my_bitmap_map*) alloc_root(param->mem_root,
                                                 param->fields_bitmap_size)))
@@ -4171,8 +5828,7 @@ ROR_SCAN_INFO *make_ror_scan(const PARAM *param, int idx, SEL_ARG *sel_arg)
       bitmap_set_bit(&ror_scan->covered_fields, key_part->fieldnr-1);
   }
   ror_scan->index_read_cost=
-    param->table->file->keyread_time(ror_scan->keynr, 1,
-                                     param->table->quick_rows[ror_scan->keynr]);
+    param->table->file->keyread_time(ror_scan->keynr, 1, ror_scan->records);
   DBUG_RETURN(ror_scan);
 }
 
@@ -4457,7 +6113,7 @@ static double ror_scan_selectivity(const ROR_INTERSECT_INFO *info,
   }
   if (!prev_covered)
   {
-    double tmp= rows2double(info->param->table->quick_rows[scan->keynr]) /
+    double tmp= rows2double(info->param->quick_rows[scan->keynr]) /
                 rows2double(prev_records);
     DBUG_PRINT("info", ("Selectivity multiplier: %g", tmp));
     selectivity_mult *= tmp;
@@ -4536,7 +6192,7 @@ static bool ror_intersect_add(ROR_INTERSECT_INFO *info,
   }
   else
   {
-    info->index_records += info->param->table->quick_rows[ror_scan->keynr];
+    info->index_records += info->param->quick_rows[ror_scan->keynr];
     info->index_scan_costs += ror_scan->index_read_cost;
     bitmap_union(&info->covered_fields, &ror_scan->covered_fields);
     if (!info->is_covering && bitmap_is_subset(&info->param->needed_fields,
@@ -4646,7 +6302,6 @@ TRP_ROR_INTERSECT *get_best_ror_intersect(const PARAM *param, SEL_TREE *tree,
   ROR_SCAN_INFO **cur_ror_scan;
   ROR_SCAN_INFO *cpk_scan= NULL;
   uint cpk_no;
-  bool cpk_scan_used= FALSE;
 
   if (!(tree->ror_scans= (ROR_SCAN_INFO**)alloc_root(param->mem_root,
                                                      sizeof(ROR_SCAN_INFO*)*
@@ -4658,11 +6313,20 @@ TRP_ROR_INTERSECT *get_best_ror_intersect(const PARAM *param, SEL_TREE *tree,
   for (idx= 0, cur_ror_scan= tree->ror_scans; idx < param->keys; idx++)
   {
     ROR_SCAN_INFO *scan;
+    uint key_no;
     if (!tree->ror_scans_map.is_set(idx))
       continue;
+    key_no= param->real_keynr[idx];
+    if (key_no != cpk_no &&
+        param->table->file->index_flags(key_no,0,0) & HA_CLUSTERED_INDEX)
+    {
+      /* Ignore clustering keys */
+      tree->n_ror_scans--;
+      continue;
+    }
     if (!(scan= make_ror_scan(param, idx, tree->keys[idx])))
       return NULL;
-    if (param->real_keynr[idx] == cpk_no)
+    if (key_no == cpk_no)
     {
       cpk_scan= scan;
       tree->n_ror_scans--;
@@ -4748,15 +6412,14 @@ TRP_ROR_INTERSECT *get_best_ror_intersect(const PARAM *param, SEL_TREE *tree,
   {
     if (ror_intersect_add(intersect, cpk_scan, TRUE) && 
         (intersect->total_cost < min_cost))
-    {
-      cpk_scan_used= TRUE;
       intersect_best= intersect; //just set pointer here
-    }
   }
+  else
+    cpk_scan= 0;                                // Don't use cpk_scan
 
   /* Ok, return ROR-intersect plan if we have found one */
   TRP_ROR_INTERSECT *trp= NULL;
-  if (min_cost < read_time && (cpk_scan_used || best_num > 1))
+  if (min_cost < read_time && (cpk_scan || best_num > 1))
   {
     if (!(trp= new (param->mem_root) TRP_ROR_INTERSECT))
       DBUG_RETURN(trp);
@@ -4775,7 +6438,7 @@ TRP_ROR_INTERSECT *get_best_ror_intersect(const PARAM *param, SEL_TREE *tree,
     set_if_smaller(param->table->quick_condition_rows, best_rows);
     trp->records= best_rows;
     trp->index_scan_costs= intersect_best->index_scan_costs;
-    trp->cpk_scan= cpk_scan_used? cpk_scan: NULL;
+    trp->cpk_scan= cpk_scan;
     DBUG_PRINT("info", ("Returning non-covering ROR-intersect plan:"
                         "cost %g, records %lu",
                         trp->read_cost, (ulong) trp->records));
@@ -4787,7 +6450,7 @@ TRP_ROR_INTERSECT *get_best_ror_intersect(const PARAM *param, SEL_TREE *tree,
 /*
   Get best covering ROR-intersection.
   SYNOPSIS
-    get_best_covering_ror_intersect()
+    get_best_covering_ror_intersect()
       param     Parameter from test_quick_select function.
       tree      SEL_TREE with sets of intervals for different keys.
       read_time Don't return table read plans with cost > read_time.
@@ -4975,6 +6638,14 @@ static TRP_RANGE *get_key_scans_params(PARAM *param, SEL_TREE *tree,
                                       "tree scans"););
   tree->ror_scans_map.clear_all();
   tree->n_ror_scans= 0;
+  tree->index_scans= 0;
+  if (!tree->keys_map.is_clear_all())
+  {
+    tree->index_scans=
+      (INDEX_SCAN_INFO **) alloc_root(param->mem_root,
+                                      sizeof(INDEX_SCAN_INFO *) * param->keys);
+  }
+  tree->index_scans_end= tree->index_scans;                                                  
   for (idx= 0,key=tree->keys, end=key+param->keys; key != end; key++,idx++)
   {
     if (*key)
@@ -4983,18 +6654,32 @@ static TRP_RANGE *get_key_scans_params(PARAM *param, SEL_TREE *tree,
       COST_VECT cost;
       double found_read_time;
       uint mrr_flags, buf_size;
+      INDEX_SCAN_INFO *index_scan;
       uint keynr= param->real_keynr[idx];
       if ((*key)->type == SEL_ARG::MAYBE_KEY ||
           (*key)->maybe_flag)
         param->needed_reg->set_bit(keynr);
 
-      bool read_index_only= index_read_must_be_used || 
-                            param->table->covering_keys.is_set(keynr);
+      bool read_index_only= index_read_must_be_used ? TRUE :
+                            (bool) param->table->covering_keys.is_set(keynr);
 
       found_records= check_quick_select(param, idx, read_index_only, *key,
                                         update_tbl_stats, &mrr_flags,
                                         &buf_size, &cost);
 
+      if (found_records != HA_POS_ERROR && tree->index_scans &&
+          (index_scan= (INDEX_SCAN_INFO *)alloc_root(param->mem_root,
+						     sizeof(INDEX_SCAN_INFO))))
+      {
+        index_scan->idx= idx;
+        index_scan->keynr= keynr;
+        index_scan->key_info= &param->table->key_info[keynr];
+        index_scan->used_key_parts= param->max_key_part+1;
+        index_scan->range_count= param->range_count;
+        index_scan->records= found_records;
+        index_scan->sel_arg= *key;
+        *tree->index_scans_end++= index_scan;
+      }        
       if ((found_records != HA_POS_ERROR) && param->is_ror_scan)
       {
         tree->n_ror_scans++;
@@ -5064,6 +6749,36 @@ QUICK_SELECT_I *TRP_INDEX_MERGE::make_quick(PARAM *param,
   return quick_imerge;
 }
 
+
+QUICK_SELECT_I *TRP_INDEX_INTERSECT::make_quick(PARAM *param,
+                                                bool retrieve_full_rows,
+                                                MEM_ROOT *parent_alloc)
+{
+  QUICK_INDEX_INTERSECT_SELECT *quick_intersect;
+  QUICK_RANGE_SELECT *quick;
+  /* index_merge always retrieves full rows, ignore retrieve_full_rows */
+  if (!(quick_intersect= new QUICK_INDEX_INTERSECT_SELECT(param->thd, param->table)))
+    return NULL;
+
+  quick_intersect->records= records;
+  quick_intersect->read_time= read_cost;
+  quick_intersect->filtered_scans= filtered_scans;
+  for (TRP_RANGE **range_scan= range_scans; range_scan != range_scans_end;
+       range_scan++)
+  {
+    if (!(quick= (QUICK_RANGE_SELECT*)
+          ((*range_scan)->make_quick(param, FALSE, &quick_intersect->alloc)))||
+        quick_intersect->push_quick_back(quick))
+    {
+      delete quick;
+      delete quick_intersect;
+      return NULL;
+    }
+  }
+  return quick_intersect;
+}
+
+
 QUICK_SELECT_I *TRP_ROR_INTERSECT::make_quick(PARAM *param,
                                               bool retrieve_full_rows,
                                               MEM_ROOT *parent_alloc)
@@ -5089,7 +6804,7 @@ QUICK_SELECT_I *TRP_ROR_INTERSECT::make_quick(PARAM *param,
                                     (*first_scan)->sel_arg,
                                     HA_MRR_USE_DEFAULT_IMPL | HA_MRR_SORTED,
                                     0, alloc)) ||
-          quick_intrsect->push_quick_back(quick))
+          quick_intrsect->push_quick_back(alloc, quick))
       {
         delete quick_intrsect;
         DBUG_RETURN(NULL);
@@ -5513,11 +7228,10 @@ static SEL_TREE *get_full_func_mm_tree(RANGE_OPT_PARAM *param,
   Item_equal *item_equal= field_item->item_equal;
   if (item_equal)
   {
-    Item_equal_iterator it(*item_equal);
-    Item_field *item;
-    while ((item= it++))
+    Item_equal_fields_iterator it(*item_equal);
+    while (it++)
     {
-      Field *f= item->field;
+      Field *f= it.get_curr_field();
       if (field->eq(f))
         continue;
       if (!((ref_tables | f->table->map) & param_comp))
@@ -5666,13 +7380,13 @@ static SEL_TREE *get_mm_tree(RANGE_OPT_PARAM *param,COND *cond)
   case Item_func::MULT_EQUAL_FUNC:
   {
     Item_equal *item_equal= (Item_equal *) cond;    
-    if (!(value= item_equal->get_const()))
+    if (!(value= item_equal->get_const()) || value->is_expensive())
       DBUG_RETURN(0);
-    Item_equal_iterator it(*item_equal);
+    Item_equal_fields_iterator it(*item_equal);
     ref_tables= value->used_tables();
-    while ((field_item= it++))
+    while (it++)
     {
-      Field *field= field_item->field;
+      Field *field= it.get_curr_field();
       Item_result cmp_type= field->cmp_type();
       if (!((ref_tables | field->table->map) & param_comp))
       {
@@ -5699,6 +7413,9 @@ static SEL_TREE *get_mm_tree(RANGE_OPT_PARAM *param,COND *cond)
     }
     else
       DBUG_RETURN(0);
+    if (value && value->is_expensive())
+      DBUG_RETURN(0);
+
     ftree= get_full_func_mm_tree(param, cond_func, field_item, value, inv);
   }
 
@@ -5747,6 +7464,7 @@ get_mm_parts(RANGE_OPT_PARAM *param, COND *cond_func, Field *field,
 	  DBUG_RETURN(0);			// OOM
       }
       sel_arg->part=(uchar) key_part->part;
+      sel_arg->max_part_no= sel_arg->part+1;
       tree->keys[key_part->key]=sel_add(tree->keys[key_part->key],sel_arg);
       tree->keys_map.set_bit(key_part->key);
     }
@@ -5767,7 +7485,6 @@ get_mm_leaf(RANGE_OPT_PARAM *param, COND *conf_func, Field *field,
   SEL_ARG *tree= 0;
   MEM_ROOT *alloc= param->mem_root;
   uchar *str;
-  ulonglong orig_sql_mode;
   int err;
   DBUG_ENTER("get_mm_leaf");
 
@@ -5937,16 +7654,8 @@ get_mm_leaf(RANGE_OPT_PARAM *param, COND *conf_func, Field *field,
     We can't always use indexes when comparing a string index to a number
     cmp_type() is checked to allow compare of dates to numbers
   */
-  if (field->result_type() == STRING_RESULT &&
-      value->result_type() != STRING_RESULT &&
-      field->cmp_type() != value->result_type())
+  if (field->cmp_type() == STRING_RESULT && value->cmp_type() != STRING_RESULT)
     goto end;
-  /* For comparison purposes allow invalid dates like 2000-01-32 */
-  orig_sql_mode= field->table->in_use->variables.sql_mode;
-  if (value->real_item()->type() == Item::STRING_ITEM &&
-      (field->type() == MYSQL_TYPE_DATE ||
-       field->type() == MYSQL_TYPE_DATETIME))
-    field->table->in_use->variables.sql_mode|= MODE_INVALID_DATES;
   err= value->save_in_field_no_warnings(field, 1);
   if (err > 0)
   {
@@ -5958,7 +7667,6 @@ get_mm_leaf(RANGE_OPT_PARAM *param, COND *conf_func, Field *field,
       {
         tree= new (alloc) SEL_ARG(field, 0, 0);
         tree->type= SEL_ARG::IMPOSSIBLE;
-        field->table->in_use->variables.sql_mode= orig_sql_mode;
         goto end;
       }
       else
@@ -5992,10 +7700,7 @@ get_mm_leaf(RANGE_OPT_PARAM *param, COND *conf_func, Field *field,
           */
         }
         else
-        {
-          field->table->in_use->variables.sql_mode= orig_sql_mode;
           goto end;
-        }
       }
     }
 
@@ -6018,12 +7723,10 @@ get_mm_leaf(RANGE_OPT_PARAM *param, COND *conf_func, Field *field,
   }
   else if (err < 0)
   {
-    field->table->in_use->variables.sql_mode= orig_sql_mode;
     /* This happens when we try to insert a NULL field in a not null column */
     tree= &null_element;                        // cmp with NULL is never TRUE
     goto end;
   }
-  field->table->in_use->variables.sql_mode= orig_sql_mode;
 
   /*
     Any sargable predicate except "<=>" involving NULL as a constant is always
@@ -6198,13 +7901,138 @@ sel_add(SEL_ARG *key1,SEL_ARG *key2)
   return root;
 }
 
-#define CLONE_KEY1_MAYBE 1
-#define CLONE_KEY2_MAYBE 2
-#define swap_clone_flag(A) ((A & 1) << 1) | ((A & 2) >> 1)
 
+/* 
+  Build a range tree for the conjunction of the range parts of two trees
+
+  SYNOPSIS
+    and_range_trees()
+      param           Context info for the operation
+      tree1           SEL_TREE for the first conjunct          
+      tree2           SEL_TREE for the second conjunct
+      result          SEL_TREE for the result
 
-static SEL_TREE *
-tree_and(RANGE_OPT_PARAM *param,SEL_TREE *tree1,SEL_TREE *tree2)
+  DESCRIPTION
+    This function takes range parts of two trees tree1 and tree2 and builds
+    a range tree for the conjunction of the formulas that these two range parts
+    represent.
+    More exactly: 
+    if the range part of tree1 represents the normalized formula 
+      R1_1 AND ... AND R1_k,
+    and the range part of tree2 represents the normalized formula
+      R2_1 AND ... AND R2_k,
+    then the range part of the result represents the formula:
+     RT = R_1 AND ... AND R_k, where R_i=(R1_i AND R2_i) for each i from [1..k]
+
+    The function assumes that tree1 is never equal to tree2. At the same
+    time the tree result can be the same as tree1 (but never as tree2).
+    If result==tree1 then RT replaces the range part of tree1 leaving
+    imerges as they are.
+    If result!=tree1 then it is assumed that the SEL_ARG trees in tree1 and
+    tree2 should be preserved. Otherwise they can be destroyed.
+
+  RETURN 
+    1    if the type of the result tree is SEL_TREE::IMPOSSIBLE
+    0    otherwise    
+*/
+
+static
+int and_range_trees(RANGE_OPT_PARAM *param, SEL_TREE *tree1, SEL_TREE *tree2,
+                    SEL_TREE *result)
+{
+  DBUG_ENTER("and_ranges");
+  key_map  result_keys;
+  result_keys.clear_all();
+  key_map anded_keys= tree1->keys_map;
+  anded_keys.merge(tree2->keys_map);
+  int key_no;
+  key_map::Iterator it(anded_keys);
+  while ((key_no= it++) != key_map::Iterator::BITMAP_END)
+  {
+    uint flag=0;
+    SEL_ARG *key1= tree1->keys[key_no];
+    SEL_ARG *key2= tree2->keys[key_no];
+    if (key1 && !key1->simple_key())
+      flag|= CLONE_KEY1_MAYBE;
+    if (key2 && !key2->simple_key())
+      flag|=CLONE_KEY2_MAYBE;
+    if (result != tree1)
+    { 
+      if (key1)
+        key1->incr_refs();
+      if (key2)
+        key2->incr_refs();
+    }
+    SEL_ARG *key;
+    if ((result->keys[key_no]= key =key_and(param, key1, key2, flag)))
+    {
+      if (key && key->type == SEL_ARG::IMPOSSIBLE)
+      {
+	result->type= SEL_TREE::IMPOSSIBLE;
+        DBUG_RETURN(1);
+      }
+      result_keys.set_bit(key_no);
+#ifdef EXTRA_DEBUG
+      if (param->alloced_sel_args < SEL_ARG::MAX_SEL_ARGS) 
+        key->test_use_count(key);
+#endif
+    }
+  }
+  result->keys_map= result_keys;
+  DBUG_RETURN(0);
+}
+  
+
+/*
+  Build a SEL_TREE for a conjunction out of such trees for the conjuncts
+
+  SYNOPSIS
+    tree_and()
+      param           Context info for the operation
+      tree1           SEL_TREE for the first conjunct          
+      tree2           SEL_TREE for the second conjunct
+
+  DESCRIPTION
+    This function builds a tree for the formula (A AND B) out of the trees
+    tree1 and tree2 that has been built for the formulas A and B respectively.
+
+    In a general case
+      tree1 represents the formula RT1 AND MT1,
+        where RT1 = R1_1 AND ... AND R1_k1, MT1=M1_1 AND ... AND M1_l1;
+      tree2 represents the formula RT2 AND MT2 
+        where RT2 = R2_1 AND ... AND R2_k2, MT2=M2_1 and ... and M2_l2.
+
+    The result tree will represent the formula of the following structure:
+      RT AND MT1 AND MT2 AND RT1MT2 AND RT2MT1, such that
+        RT is a tree obtained by range intersection of trees tree1 and tree2,
+        RT1MT2 = RT1M2_1 AND ... AND RT1M2_l2,
+        RT2MT1 = RT2M1_1 AND ... AND RT2M1_l1,
+        where RT1M2_i (i=1,...,l2) is the result of the pushdown operation
+        of range tree RT1 into imerge M2_i, while RT2M1_j (j=1,...,l1) is the
+        result of the pushdown operation of range tree RT2 into imerge M1_j.
+
+    RT1MT2/RT2MT1 is empty if MT2/MT1 is empty.
+ 
+    The range intersection of two range trees is produced by the function
+    and_range_trees. The pushdown of a range tree to an imerge is performed
+    by the function imerge_list_and_tree. This function may produce imerges
+    containing only one range tree. Such trees are intersected with RT and
+    the result of intersection is returned as the range part of the result
+    tree, while the corresponding imerges are removed altogether from its
+    imerge part.
+    
+  NOTE.
+    The pushdown operation of range trees into imerges is needed to be able
+    to construct valid imerges for conditions like this:
+      key1_p1=c1 AND (key1_p2 BETWEEN c21 AND c22 OR key2 < c2)
+
+  RETURN
+    The result tree, if a success
+    0 - otherwise.        
+*/
+
+static 
+SEL_TREE *tree_and(RANGE_OPT_PARAM *param, SEL_TREE *tree1, SEL_TREE *tree2)
 {
   DBUG_ENTER("tree_and");
   if (!tree1)
@@ -6226,87 +8054,216 @@ tree_and(RANGE_OPT_PARAM *param,SEL_TREE *tree1,SEL_TREE *tree2)
     tree1->type=SEL_TREE::KEY_SMALLER;
     DBUG_RETURN(tree1);
   }
-  key_map  result_keys;
-  result_keys.clear_all();
-  
-  /* Join the trees key per key */
-  SEL_ARG **key1,**key2,**end;
-  for (key1= tree1->keys,key2= tree2->keys,end=key1+param->keys ;
-       key1 != end ; key1++,key2++)
+
+  if (!tree1->merges.is_empty())
+    imerge_list_and_tree(param, &tree1->merges, tree2);
+  if (!tree2->merges.is_empty())
+    imerge_list_and_tree(param, &tree2->merges, tree1);
+  if (and_range_trees(param, tree1, tree2, tree1))
+    DBUG_RETURN(tree1);
+  imerge_list_and_list(&tree1->merges, &tree2->merges);
+  eliminate_single_tree_imerges(param, tree1);
+  DBUG_RETURN(tree1);
+}
+
+
+/*
+  Eliminate single tree imerges in a SEL_TREE object
+
+  SYNOPSIS
+    eliminate_single_tree_imerges()
+      param      Context info for the function
+      tree       SEL_TREE where single tree imerges are to be eliminated 
+
+  DESCRIPTION
+    For each imerge in 'tree' that contains only one disjunct tree, i.e.
+    for any imerge of the form m=rt, the function ANDs rt with the range
+    part of 'tree', replaces that range part with the result of the ANDing
+    and removes imerge m from the merge part of 'tree'.
+
+  RETURN VALUE
+    none          
+*/
+
+static
+void eliminate_single_tree_imerges(RANGE_OPT_PARAM *param, SEL_TREE *tree)
+{
+  SEL_IMERGE *imerge;
+  List<SEL_IMERGE> merges= tree->merges;
+  List_iterator<SEL_IMERGE> it(merges);
+  tree->merges.empty();
+  while ((imerge= it++))
   {
-    uint flag=0;
-    if (*key1 || *key2)
-    {
-      if (*key1 && !(*key1)->simple_key())
-	flag|=CLONE_KEY1_MAYBE;
-      if (*key2 && !(*key2)->simple_key())
-	flag|=CLONE_KEY2_MAYBE;
-      *key1=key_and(param, *key1, *key2, flag);
-      if (*key1 && (*key1)->type == SEL_ARG::IMPOSSIBLE)
-      {
-	tree1->type= SEL_TREE::IMPOSSIBLE;
-        DBUG_RETURN(tree1);
-      }
-      result_keys.set_bit(key1 - tree1->keys);
-#ifdef EXTRA_DEBUG
-        if (*key1 && param->alloced_sel_args < SEL_ARG::MAX_SEL_ARGS) 
-          (*key1)->test_use_count(*key1);
-#endif
+    if (imerge->trees+1 == imerge->trees_next)
+    {
+      tree= tree_and(param, tree, *imerge->trees);
+      it.remove();
     }
   }
-  tree1->keys_map= result_keys;
-  /* dispose index_merge if there is a "range" option */
-  if (!result_keys.is_clear_all())
-  {
-    tree1->merges.empty();
-    DBUG_RETURN(tree1);
-  }
+  tree->merges= merges;
+} 
 
-  /* ok, both trees are index_merge trees */
-  imerge_list_and_list(&tree1->merges, &tree2->merges);
-  DBUG_RETURN(tree1);
+
+/*
+  For two trees check that there are indexes with ranges in both of them  
+ 
+  SYNOPSIS
+    sel_trees_have_common_keys()
+      tree1           SEL_TREE for the first tree
+      tree2           SEL_TREE for the second tree
+      common_keys OUT bitmap of all indexes with ranges in both trees
+
+  DESCRIPTION
+    For two trees tree1 and tree2 the function checks if there are indexes
+    in their range parts such that SEL_ARG trees are defined for them in the
+    range parts of both trees. The function returns the bitmap of such 
+    indexes in the parameter common_keys.
+
+  RETURN 
+    TRUE    if there are such indexes (common_keys is not empty)
+    FALSE   otherwise
+*/
+
+static
+bool sel_trees_have_common_keys(SEL_TREE *tree1, SEL_TREE *tree2, 
+                                key_map *common_keys)
+{
+  *common_keys= tree1->keys_map;
+  common_keys->intersect(tree2->keys_map);
+  return !common_keys->is_clear_all();
 }
 
 
 /*
-  Check if two SEL_TREES can be combined into one (i.e. a single key range
-  read can be constructed for "cond_of_tree1 OR cond_of_tree2" ) without
-  using index_merge.
+  Check whether range parts of two trees can be ored for some indexes
+
+  SYNOPSIS
+    sel_trees_can_be_ored()
+      param              Context info for the function
+      tree1              SEL_TREE for the first tree
+      tree2              SEL_TREE for the second tree
+      common_keys IN/OUT IN: bitmap of all indexes with SEL_ARG in both trees
+                        OUT: bitmap of all indexes that can be ored
+
+  DESCRIPTION
+    For two trees tree1 and tree2 and the bitmap common_keys containing
+    bits for indexes that have SEL_ARG trees in range parts of both trees
+    the function checks if there are indexes for which SEL_ARG trees can
+    be ored. Two SEL_ARG trees for the same index can be ored if the most
+    major components of the index used in these trees coincide. If the 
+    SEL_ARG trees for an index cannot be ored the function clears the bit
+    for this index in the bitmap common_keys.
+
+    The function does not verify that indexes marked in common_keys really
+    have SEL_ARG trees in both tree1 and tree2. It assumes that this is true.
+
+  NOTE
+    The function sel_trees_can_be_ored is usually used in pair with the
+    function sel_trees_have_common_keys.
+
+  RETURN
+    TRUE    if there are indexes for which SEL_ARG trees can be ored 
+    FALSE   otherwise
 */
 
-bool sel_trees_can_be_ored(SEL_TREE *tree1, SEL_TREE *tree2, 
-                           RANGE_OPT_PARAM* param)
+static
+bool sel_trees_can_be_ored(RANGE_OPT_PARAM* param,
+                           SEL_TREE *tree1, SEL_TREE *tree2, 
+                           key_map *common_keys)
 {
-  key_map common_keys= tree1->keys_map;
   DBUG_ENTER("sel_trees_can_be_ored");
-  common_keys.intersect(tree2->keys_map);
+  if (!sel_trees_have_common_keys(tree1, tree2, common_keys))
+    DBUG_RETURN(FALSE);
+  int key_no;
+  key_map::Iterator it(*common_keys);
+  while ((key_no= it++) != key_map::Iterator::BITMAP_END)
+  {
+    DBUG_ASSERT(tree1->keys[key_no] && tree2->keys[key_no]);
+    /* Trees have a common key, check if they refer to the same key part */
+    if (tree1->keys[key_no]->part != tree2->keys[key_no]->part)
+      common_keys->clear_bit(key_no);
+  }
+  DBUG_RETURN(!common_keys->is_clear_all());
+}
+
+/*
+  Check whether range parts of two trees must be ored for some indexes
+
+  SYNOPSIS
+    sel_trees_must_be_ored()
+      param              Context info for the function
+      tree1              SEL_TREE for the first tree
+      tree2              SEL_TREE for the second tree
+      oredable_keys      bitmap of SEL_ARG trees that can be ored
+
+  DESCRIPTION
+    For two trees tree1 and tree2 the function checks whether they must be
+    ored. The function assumes that the bitmap oredable_keys contains bits for
+    those corresponding pairs of SEL_ARG trees from tree1 and tree2 that can
+    be ored.
+    We believe that tree1 and tree2 must be ored if any pair of SEL_ARG trees
+    r1 and r2, such that r1 is from tree1 and r2 is from tree2 and both
+    of them are marked in oredable_keys, can be merged.
+    
+  NOTE
+    The function sel_trees_must_be_ored as a rule is used in pair with the
+    function sel_trees_can_be_ored.
+
+  RETURN
+    TRUE    if there are indexes for which SEL_ARG trees must be ored 
+    FALSE   otherwise
+*/
+
+static
+bool sel_trees_must_be_ored(RANGE_OPT_PARAM* param,
+                            SEL_TREE *tree1, SEL_TREE *tree2,
+                            key_map oredable_keys)
+{
+  key_map tmp;
+  DBUG_ENTER("sel_trees_must_be_ored");
 
-  if (common_keys.is_clear_all())
+  tmp= tree1->keys_map;
+  tmp.merge(tree2->keys_map);
+  tmp.subtract(oredable_keys);
+  if (!tmp.is_clear_all())
     DBUG_RETURN(FALSE);
 
-  /* trees have a common key, check if they refer to same key part */
-  SEL_ARG **key1,**key2;
-  for (uint key_no=0; key_no < param->keys; key_no++)
+  int idx1, idx2;
+  key_map::Iterator it1(oredable_keys);
+  while ((idx1= it1++) != key_map::Iterator::BITMAP_END)
   {
-    if (common_keys.is_set(key_no))
+    KEY_PART *key1_init= param->key[idx1]+tree1->keys[idx1]->part;
+    KEY_PART *key1_end= param->key[idx1]+tree1->keys[idx1]->max_part_no;
+    key_map::Iterator it2(oredable_keys);
+    while ((idx2= it2++) != key_map::Iterator::BITMAP_END)
     {
-      key1= tree1->keys + key_no;
-      key2= tree2->keys + key_no;
-      if ((*key1)->part == (*key2)->part)
-      {
-        DBUG_RETURN(TRUE);
+      if (idx2 <= idx1)
+        continue;
+      
+      KEY_PART *key2_init= param->key[idx2]+tree2->keys[idx2]->part;
+      KEY_PART *key2_end= param->key[idx2]+tree2->keys[idx2]->max_part_no;
+      KEY_PART *part1, *part2;
+      for (part1= key1_init, part2= key2_init;
+           part1 < key1_end && part2 < key2_end;
+           part1++, part2++)
+      { 
+        if (!part1->field->eq(part2->field))
+          DBUG_RETURN(FALSE);
       }
     }
   }
-  DBUG_RETURN(FALSE);
-}
+      
+  DBUG_RETURN(TRUE);
+}  
 
 
 /*
-  Remove the trees that are not suitable for record retrieval.
+  Remove the trees that are not suitable for record retrieval
+
   SYNOPSIS
-    param  Range analysis parameter
-    tree   Tree to be processed, tree->type is KEY or KEY_SMALLER
+    remove_nonrange_trees()
+      param  Context info for the function
+      tree   Tree to be processed, tree->type is KEY or KEY_SMALLER
  
   DESCRIPTION
     This function walks through tree->keys[] and removes the SEL_ARG* trees
@@ -6317,41 +8274,36 @@ bool sel_trees_can_be_ored(SEL_TREE *tree1, SEL_TREE *tree2,
 
     A SEL_ARG* tree cannot be used to construct quick select if it has
     tree->part != 0. (e.g. it could represent "keypart2 < const").
-
-    WHY THIS FUNCTION IS NEEDED
     
     Normally we allow construction of SEL_TREE objects that have SEL_ARG
-    trees that do not allow quick range select construction. For example for
-    " keypart1=1 AND keypart2=2 " the execution will proceed as follows:
+    trees that do not allow quick range select construction.
+    For example:
+    for " keypart1=1 AND keypart2=2 " the execution will proceed as follows:
     tree1= SEL_TREE { SEL_ARG{keypart1=1} }
     tree2= SEL_TREE { SEL_ARG{keypart2=2} } -- can't make quick range select
                                                from this
     call tree_and(tree1, tree2) -- this joins SEL_ARGs into a usable SEL_ARG
                                    tree.
-    
-    There is an exception though: when we construct index_merge SEL_TREE,
-    any SEL_ARG* tree that cannot be used to construct quick range select can
-    be removed, because current range analysis code doesn't provide any way
-    that tree could be later combined with another tree.
-    Consider an example: we should not construct
-    st1 = SEL_TREE { 
-      merges = SEL_IMERGE { 
-                            SEL_TREE(t.key1part1 = 1), 
-                            SEL_TREE(t.key2part2 = 2)   -- (*)
-                          } 
-                   };
-    because 
-     - (*) cannot be used to construct quick range select, 
-     - There is no execution path that would cause (*) to be converted to 
-       a tree that could be used.
-
-    The latter is easy to verify: first, notice that the only way to convert
-    (*) into a usable tree is to call tree_and(something, (*)).
-
-    Second look at what tree_and/tree_or function would do when passed a
-    SEL_TREE that has the structure like st1 tree has, and conlcude that 
-    tree_and(something, (*)) will not be called.
 
+    Another example:
+    tree3= SEL_TREE { SEL_ARG{key1part1 = 1} }
+    tree4= SEL_TREE { SEL_ARG{key2part2 = 2} }  -- can't make quick range select
+                                               from this
+    call tree_or(tree3, tree4) -- creates a SEL_IMERGE out of which no index
+    merge can be constructed, but it is potentially useful, as anding it with
+    tree5= SEL_TREE { SEL_ARG{key2part1 = 3} } creates an index merge that
+    represents the formula
+      key1part1=1 AND key2part1=3 OR key2part1=3 AND key2part2=2 
+    for which an index merge can be built. 
+
+    Any final SEL_TREE may contain SEL_ARG trees for which no quick select
+    can be built. Such SEL_ARG trees should be removed from the range part
+    before different range scans are evaluated. Such SEL_ARG trees also should
+    be removed from all range trees of each index merge before different
+    possible index merge plans are evaluated. If after this removal one
+    of the range trees in the index merge becomes empty the whole index merge
+    must be discarded.
+       
   RETURN
     0  Ok, some suitable trees left
     1  No tree->keys[] left.
@@ -6377,6 +8329,74 @@ static bool remove_nonrange_trees(RANGE_OPT_PARAM *param, SEL_TREE *tree)
 }
 
 
+/*
+  Build a SEL_TREE for a disjunction out of such trees for the disjuncts
+
+  SYNOPSIS
+    tree_or()
+      param           Context info for the operation
+      tree1           SEL_TREE for the first disjunct          
+      tree2           SEL_TREE for the second disjunct
+
+  DESCRIPTION
+    This function builds a tree for the formula (A OR B) out of the trees
+    tree1 and tree2 that have been built for the formulas A and B respectively.
+
+    In a general case
+      tree1 represents the formula RT1 AND MT1,
+        where RT1=R1_1 AND ... AND R1_k1, MT1=M1_1 AND ... AND M1_l1;
+      tree2 represents the formula RT2 AND MT2,
+        where RT2=R2_1 AND ... AND R2_k2, MT2=M2_1 AND ... AND M2_l2.
+
+    The function constructs the result tree according to the formula
+      (RT1 OR RT2) AND (MT1 OR RT2) AND (MT2 OR RT1) AND (MT1 OR MT2)
+    that is equivalent to the formula (RT1 AND MT1) OR (RT2 AND MT2).
+
+    To limit the number of produced imerges the function considers
+    a weaker formula than the original one:
+      (RT1 AND M1_1) OR (RT2 AND M2_1) 
+    that is equivalent to:
+      (RT1 OR RT2)                  (1)
+        AND 
+      (M1_1 OR M2_1)                (2)
+        AND
+      (M1_1 OR RT2)                 (3)
+        AND
+      (M2_1 OR RT1)                 (4)
+
+    For the first conjunct (1) the function builds a tree with a range part
+    and, possibly, one imerge. For the other conjuncts (2-4) the function
+    produces sets of imerges. All constructed imerges are included into the
+    result tree.
+    
+    For the formula (1) the function produces the tree representing a formula  
+    of the structure RT [AND M], such that:
+     - the range tree rt contains the result of oring SEL_ARG trees from rt1
+       and rt2
+     - the imerge m consists of two range trees rt1 and rt2.
+    The imerge m is added if it's not true that rt1 and rt2 must be ored.
+    If rt1 and rt2 can't be ored, rt is empty and only m is produced for (1).
+
+    To produce imerges for the formula (2) the function calls the function
+    imerge_list_or_list passing it the merge parts of tree1 and tree2 as
+    parameters.
+
+    To produce imerges for the formula (3) the function calls the function
+    imerge_list_or_tree passing it the imerge m1_1 and the range tree rt2 as
+    parameters. Similarly, to produce imerges for the formula (4) the function
+    calls the function imerge_list_or_tree passing it the imerge m2_1 and the
+    range tree rt1.
+
+    If rt1 is empty then the trees for (1) and (4) are empty.
+    If rt2 is empty then the trees for (1) and (3) are empty.
+    If mt1 is empty then the trees for (2) and (3) are empty.
+    If mt2 is empty then the trees for (2) and (4) are empty.
+
+  RETURN
+    The result tree for the operation if a success
+    0 - otherwise
+*/
+
 static SEL_TREE *
 tree_or(RANGE_OPT_PARAM *param,SEL_TREE *tree1,SEL_TREE *tree2)
 {
@@ -6392,74 +8412,100 @@ tree_or(RANGE_OPT_PARAM *param,SEL_TREE *tree1,SEL_TREE *tree2)
   if (tree2->type == SEL_TREE::MAYBE)
     DBUG_RETURN(tree2);
 
-  SEL_TREE *result= 0;
-  key_map  result_keys;
-  result_keys.clear_all();
-  if (sel_trees_can_be_ored(tree1, tree2, param))
+  SEL_TREE *result= NULL;
+  key_map result_keys;
+  key_map ored_keys;
+  SEL_TREE *rtree[2]= {NULL,NULL};
+  SEL_IMERGE *imerge[2]= {NULL, NULL};
+  bool no_ranges1= tree1->without_ranges();
+  bool no_ranges2= tree2->without_ranges();
+  bool no_merges1= tree1->without_imerges();
+  bool no_merges2= tree2->without_imerges();
+  if (!no_ranges1 && !no_merges2)
   {
-    /* Join the trees key per key */
-    SEL_ARG **key1,**key2,**end;
-    for (key1= tree1->keys,key2= tree2->keys,end= key1+param->keys ;
-         key1 != end ; key1++,key2++)
-    {
-      *key1=key_or(param, *key1, *key2);
-      if (*key1)
-      {
-        result=tree1;				// Added to tree1
-        result_keys.set_bit(key1 - tree1->keys);
-#ifdef EXTRA_DEBUG
-        if (param->alloced_sel_args < SEL_ARG::MAX_SEL_ARGS) 
-          (*key1)->test_use_count(*key1);
-#endif
-      }
-    }
-    if (result)
-      result->keys_map= result_keys;
+    rtree[0]= new SEL_TREE(tree1, TRUE, param);
+    imerge[1]= new SEL_IMERGE(tree2->merges.head(), 0, param);
   }
-  else
+  if (!no_ranges2 && !no_merges1)
   {
-    /* ok, two trees have KEY type but cannot be used without index merge */
-    if (tree1->merges.is_empty() && tree2->merges.is_empty())
+    rtree[1]= new SEL_TREE(tree2, TRUE, param);
+    imerge[0]= new SEL_IMERGE(tree1->merges.head(), 0, param);
+  }
+  bool no_imerge_from_ranges= FALSE;
+  if (!(result= new SEL_TREE()))
+    DBUG_RETURN(result);
+
+  /* Build the range part of the tree for the formula (1) */ 
+  if (sel_trees_can_be_ored(param, tree1, tree2, &ored_keys))
+  {
+    bool must_be_ored= sel_trees_must_be_ored(param, tree1, tree2, ored_keys);
+    no_imerge_from_ranges= must_be_ored;
+    key_map::Iterator it(ored_keys);
+    int key_no;
+    while ((key_no= it++) != key_map::Iterator::BITMAP_END)
     {
-      if (param->remove_jump_scans)
+      SEL_ARG *key1= tree1->keys[key_no];
+      SEL_ARG *key2= tree2->keys[key_no];
+      if (!must_be_ored)
       {
-        bool no_trees= remove_nonrange_trees(param, tree1);
-        no_trees= no_trees || remove_nonrange_trees(param, tree2);
-        if (no_trees)
-          DBUG_RETURN(new SEL_TREE(SEL_TREE::ALWAYS));
+        key1->incr_refs();
+        key2->incr_refs();
       }
-      SEL_IMERGE *merge;
-      /* both trees are "range" trees, produce new index merge structure */
-      if (!(result= new SEL_TREE()) || !(merge= new SEL_IMERGE()) ||
-          (result->merges.push_back(merge)) ||
-          (merge->or_sel_tree(param, tree1)) ||
-          (merge->or_sel_tree(param, tree2)))
-        result= NULL;
-      else
-        result->type= tree1->type;
+      if ((result->keys[key_no]= key_or(param, key1, key2)))
+        result->keys_map.set_bit(key_no);
     }
-    else if (!tree1->merges.is_empty() && !tree2->merges.is_empty())
-    {
-      if (imerge_list_or_list(param, &tree1->merges, &tree2->merges))
-        result= new SEL_TREE(SEL_TREE::ALWAYS);
-      else
-        result= tree1;
-    }
-    else
-    {
-      /* one tree is index merge tree and another is range tree */
-      if (tree1->merges.is_empty())
-        swap_variables(SEL_TREE*, tree1, tree2);
+    result->type= tree1->type;
+  }
       
-      if (param->remove_jump_scans && remove_nonrange_trees(param, tree2))
-         DBUG_RETURN(new SEL_TREE(SEL_TREE::ALWAYS));
-      /* add tree2 to tree1->merges, checking if it collapses to ALWAYS */
-      if (imerge_list_or_tree(param, &tree1->merges, tree2))
-        result= new SEL_TREE(SEL_TREE::ALWAYS);
-      else
-        result= tree1;
-    }
+  if (no_imerge_from_ranges && no_merges1 && no_merges2)
+  {
+    if (result->keys_map.is_clear_all())
+      result->type= SEL_TREE::ALWAYS;
+    DBUG_RETURN(result);
   }
+
+  SEL_IMERGE *imerge_from_ranges;
+  if (!(imerge_from_ranges= new SEL_IMERGE()))
+    result= NULL;
+  else if (!no_ranges1 && !no_ranges2 && !no_imerge_from_ranges)
+  {
+    /* Build the imerge part of the tree for the formula (1) */
+    SEL_TREE *rt1= tree1;
+    SEL_TREE *rt2= tree2;
+    if (!no_merges1)
+      rt1= new SEL_TREE(tree1, TRUE, param);
+    if (!no_merges2)
+      rt2= new SEL_TREE(tree2, TRUE, param);
+    if (!rt1 || !rt2 ||
+        result->merges.push_back(imerge_from_ranges) ||
+        imerge_from_ranges->or_sel_tree(param, rt1) ||
+        imerge_from_ranges->or_sel_tree(param, rt2))
+      result= NULL;
+  }
+  if (!result)
+    DBUG_RETURN(result);
+
+  result->type= tree1->type;
+
+  if (!no_merges1 && !no_merges2 && 
+      !imerge_list_or_list(param, &tree1->merges, &tree2->merges))
+  {
+    /* Build the imerges for the formula (2) */
+    imerge_list_and_list(&result->merges, &tree1->merges);
+  }
+
+  /* Build the imerges for the formulas (3) and (4) */
+  for (uint i=0; i < 2; i++)
+  {
+    List<SEL_IMERGE> merges;
+    SEL_TREE *rt= rtree[i];
+    SEL_IMERGE *im= imerge[1-i];
+    
+    if (rt && im && !merges.push_back(im) && 
+        !imerge_list_or_tree(param, &merges, rt))
+      imerge_list_and_list(&result->merges, &merges);
+  }
+ 
   DBUG_RETURN(result);
 }
 
@@ -6505,6 +8551,7 @@ and_all_keys(RANGE_OPT_PARAM *param, SEL_ARG *key1, SEL_ARG *key2,
   if (!key1)
     return &null_element;			// Impossible ranges
   key1->use_count++;
+  key1->max_part_no= max(key2->max_part_no, key2->part+1);
   return key1;
 }
 
@@ -6597,6 +8644,7 @@ key_and(RANGE_OPT_PARAM *param, SEL_ARG *key1, SEL_ARG *key2, uint clone_flag)
   key1->use_count--;
   key2->use_count--;
   SEL_ARG *e1=key1->first(), *e2=key2->first(), *new_tree=0;
+  uint max_part_no= max(key1->max_part_no, key2->max_part_no);
 
   while (e1 && e2)
   {
@@ -6634,6 +8682,7 @@ key_and(RANGE_OPT_PARAM *param, SEL_ARG *key1, SEL_ARG *key2, uint clone_flag)
   key2->free_tree();
   if (!new_tree)
     return &null_element;			// Impossible range
+  new_tree->max_part_no= max_part_no;
   return new_tree;
 }
 
@@ -6739,7 +8788,7 @@ key_or(RANGE_OPT_PARAM *param, SEL_ARG *key1,SEL_ARG *key2)
   {
     key1->free_tree();
     key2->free_tree();
-    return 0;					// Can't optimize this
+    return 0;                                   // Can't optimize this
   }
 
   // If one of the key is MAYBE_KEY then the found region may be bigger
@@ -6762,247 +8811,548 @@ key_or(RANGE_OPT_PARAM *param, SEL_ARG *key1,SEL_ARG *key2)
     {
       swap_variables(SEL_ARG *,key1,key2);
     }
-    if (key1->use_count > 0 || !(key1=key1->clone_tree(param)))
-      return 0;					// OOM
+    if (key1->use_count > 0 && !(key1=key1->clone_tree(param)))
+      return 0;                                 // OOM
   }
 
   // Add tree at key2 to tree at key1
   bool key2_shared=key2->use_count != 0;
   key1->maybe_flag|=key2->maybe_flag;
 
+  /*
+    Notation for illustrations used in the rest of this function: 
+
+      Range: [--------]
+             ^        ^
+             start    stop
+
+      Two overlapping ranges:
+        [-----]               [----]            [--]
+            [---]     or    [---]       or   [-------]
+
+      Ambiguity: *** 
+        The range starts or stops somewhere in the "***" range.
+        Example: a starts before b and may end before/the same place/after b
+        a: [----***]
+        b:   [---]
+
+      Adjacent ranges:
+        Ranges that meet but do not overlap. Example: a = "x < 3", b = "x >= 3"
+        a: ----]
+        b:      [----
+   */
+
+  uint max_part_no= max(key1->max_part_no, key2->max_part_no);
+
   for (key2=key2->first(); key2; )
   {
-    SEL_ARG *tmp=key1->find_range(key2);	// Find key1.min <= key2.min
-    int cmp;
+    /*
+      key1 consists of one or more ranges. tmp is the range currently
+      being handled.
+
+      initialize tmp to the latest range in key1 that starts the same
+      place or before the range in key2 starts
+
+      key2:           [------]
+      key1: [---] [-----] [----]
+                  ^
+                  tmp
+    */
+    SEL_ARG *tmp=key1->find_range(key2);
+
+    /*
+      Used to describe how two key values are positioned compared to
+      each other. Consider key_value_a.<cmp_func>(key_value_b):
+
+        -2: key_value_a is smaller than key_value_b, and they are adjacent
+        -1: key_value_a is smaller than key_value_b (not adjacent)
+         0: the key values are equal
+         1: key_value_a is bigger than key_value_b (not adjacent)
+         2: key_value_a is bigger than key_value_b, and they are adjacent
+
+      Example: "cmp= tmp->cmp_max_to_min(key2)"
+
+      key2:         [--------            (10 <= x ...)
+      tmp:    -----]                      (... x <  10) => cmp==-2
+      tmp:    ----]                       (... x <=  9) => cmp==-1
+      tmp:    ------]                     (... x  = 10) => cmp== 0
+      tmp:    --------]                   (... x <= 12) => cmp== 1
+      (cmp == 2 does not make sense for cmp_max_to_min())
+     */
+    int cmp= 0;
 
     if (!tmp)
     {
-      tmp=key1->first();			// tmp.min > key2.min
+      /*
+        The range in key2 starts before the first range in key1. Use
+        the first range in key1 as tmp.
+
+        key2:     [--------]
+        key1:            [****--] [----]   [-------]
+                         ^
+                         tmp
+      */
+      tmp=key1->first();
       cmp= -1;
     }
-    else if ((cmp=tmp->cmp_max_to_min(key2)) < 0)
-    {						// Found tmp.max < key2.min
+    else if ((cmp= tmp->cmp_max_to_min(key2)) < 0)
+    {
+      /*
+        This is the case:
+        key2:          [-------]
+        tmp:   [----**]
+       */
       SEL_ARG *next=tmp->next;
-      /* key1 on the left of key2 non-overlapping */
       if (cmp == -2 && eq_tree(tmp->next_key_part,key2->next_key_part))
       {
-	// Join near ranges like tmp.max < 0 and key2.min >= 0
-	SEL_ARG *key2_next=key2->next;
-	if (key2_shared)
-	{
-	  if (!(key2=new SEL_ARG(*key2)))
-	    return 0;		// out of memory
-	  key2->increment_use_count(key1->use_count+1);
-	  key2->next=key2_next;			// New copy of key2
-	}
-	key2->copy_min(tmp);
-	if (!(key1=key1->tree_delete(tmp)))
-	{					// Only one key in tree
-	  key1=key2;
-	  key1->make_root();
-	  key2=key2_next;
-	  break;
-	}
+        /*
+          Adjacent (cmp==-2) and equal next_key_parts => ranges can be merged
+
+          This is the case:
+          key2:          [-------]
+          tmp:     [----]
+
+          Result:
+          key2:    [-------------]     => inserted into key1 below
+          tmp:                         => deleted
+        */
+        SEL_ARG *key2_next=key2->next;
+        if (key2_shared)
+        {
+          if (!(key2=new SEL_ARG(*key2)))
+            return 0;           // out of memory
+          key2->increment_use_count(key1->use_count+1);
+          key2->next=key2_next;                 // New copy of key2
+        }
+
+        key2->copy_min(tmp);
+        if (!(key1=key1->tree_delete(tmp)))
+        {                                       // Only one key in tree
+          key1=key2;
+          key1->make_root();
+          key2=key2_next;
+          break;
+        }
       }
-      if (!(tmp=next))				// tmp.min > key2.min
-	break;					// Copy rest of key2
+      if (!(tmp=next)) // Move to next range in key1. Now tmp.min > key2.min
+        break;         // No more ranges in key1. Copy rest of key2
     }
+
     if (cmp < 0)
-    {						// tmp.min > key2.min
+    {
+      /*
+        This is the case:
+        key2:  [--***]
+        tmp:       [----]
+      */
       int tmp_cmp;
-      if ((tmp_cmp=tmp->cmp_min_to_max(key2)) > 0) // if tmp.min > key2.max
+      if ((tmp_cmp=tmp->cmp_min_to_max(key2)) > 0)
       {
-        /* tmp is on the right of key2 non-overlapping */
-	if (tmp_cmp == 2 && eq_tree(tmp->next_key_part,key2->next_key_part))
-	{					// ranges are connected
-	  tmp->copy_min_to_min(key2);
-	  key1->merge_flags(key2);
-	  if (tmp->min_flag & NO_MIN_RANGE &&
-	      tmp->max_flag & NO_MAX_RANGE)
-	  {
-	    if (key1->maybe_flag)
-	      return new SEL_ARG(SEL_ARG::MAYBE_KEY);
-	    return 0;
-	  }
-	  key2->increment_use_count(-1);	// Free not used tree
-	  key2=key2->next;
-	  continue;
-	}
-	else
-	{
-	  SEL_ARG *next=key2->next;		// Keys are not overlapping
-	  if (key2_shared)
-	  {
-	    SEL_ARG *cpy= new SEL_ARG(*key2);	// Must make copy
-	    if (!cpy)
-	      return 0;				// OOM
-	    key1=key1->insert(cpy);
-	    key2->increment_use_count(key1->use_count+1);
-	  }
-	  else
-	    key1=key1->insert(key2);		// Will destroy key2_root
-	  key2=next;
-	  continue;
-	}
+        /*
+          This is the case:
+          key2:  [------**]
+          tmp:             [----]
+        */
+        if (tmp_cmp == 2 && eq_tree(tmp->next_key_part,key2->next_key_part))
+        {
+          /*
+            Adjacent ranges with equal next_key_part. Merge like this:
+
+            This is the case:
+            key2:    [------]
+            tmp:             [-----]
+
+            Result:
+            key2:    [------]
+            tmp:     [-------------]
+
+            Then move on to next key2 range.
+          */
+          tmp->copy_min_to_min(key2);
+          key1->merge_flags(key2);
+          if (tmp->min_flag & NO_MIN_RANGE &&
+              tmp->max_flag & NO_MAX_RANGE)
+          {
+            if (key1->maybe_flag)
+              return new SEL_ARG(SEL_ARG::MAYBE_KEY);
+            return 0;
+          }
+          key2->increment_use_count(-1);        // Free not used tree
+          key2=key2->next;
+          continue;
+        }
+        else
+        {
+          /*
+            key2 not adjacent to tmp or has different next_key_part.
+            Insert into key1 and move to next range in key2
+            
+            This is the case:
+            key2:  [------**]
+            tmp:             [----]
+
+            Result:
+            key1:  [------**][----]
+                   ^         ^
+                   insert    tmp
+          */
+          SEL_ARG *next=key2->next;
+          if (key2_shared)
+          {
+            SEL_ARG *cpy= new SEL_ARG(*key2);   // Must make copy
+            if (!cpy)
+              return 0;                         // OOM
+            key1=key1->insert(cpy);
+            key2->increment_use_count(key1->use_count+1);
+          }
+          else
+            key1=key1->insert(key2);            // Will destroy key2_root
+          key2=next;
+          continue;
+        }
       }
     }
 
-    /* 
-      tmp.min >= key2.min && tmp.min <= key.max  (overlapping ranges)
-      key2.min <= tmp.min <= key2.max 
-    */  
+    /*
+      The ranges in tmp and key2 are overlapping:
+
+      key2:          [----------] 
+      tmp:        [*****-----*****]
+
+      Corollary: tmp.min <= key2.max
+    */
     if (eq_tree(tmp->next_key_part,key2->next_key_part))
     {
+      // Merge overlapping ranges with equal next_key_part
       if (tmp->is_same(key2))
       {
-        /* 
-          Found exact match of key2 inside key1. 
+        /*
+          Found exact match of key2 inside key1.
           Use the relevant range in key1.
         */
-	tmp->merge_flags(key2);			// Copy maybe flags
-	key2->increment_use_count(-1);		// Free not used tree
+        tmp->merge_flags(key2);                 // Copy maybe flags
+        key2->increment_use_count(-1);          // Free not used tree
       }
       else
       {
-	SEL_ARG *last=tmp;
-        SEL_ARG *first=tmp;
-        /* 
-          Find the last range in tmp that overlaps key2 and has the same 
-          condition on the rest of the keyparts.
+        SEL_ARG *last= tmp;
+        SEL_ARG *first= tmp;
+
+        /*
+          Find the last range in key1 that overlaps key2 and
+          where all ranges first...last have the same next_key_part as
+          key2.
+
+          key2:  [****----------------------*******]
+          key1:     [--]  [----] [---]  [-----] [xxxx]
+                    ^                   ^       ^
+                    first               last    different next_key_part
+
+          Since key2 covers them, the ranges between first and last
+          are merged into one range by deleting first...last-1 from
+          the key1 tree. In the figure, this applies to first and the
+          two consecutive ranges. The range of last is then extended:
+            * last.min: Set to min(key2.min, first.min)
+            * last.max: If there is a last->next that overlaps key2 (i.e.,
+                        last->next has a different next_key_part):
+                                        Set adjacent to last->next.min
+                        Otherwise:      Set to max(key2.max, last.max)
+
+          Result:
+          key2:  [****----------------------*******]
+                    [--]  [----] [---]                   => deleted from key1
+          key1:  [**------------------------***][xxxx]
+                 ^                              ^
+                 tmp=last                       different next_key_part
         */
-	while (last->next && last->next->cmp_min_to_max(key2) <= 0 &&
-	       eq_tree(last->next->next_key_part,key2->next_key_part))
-	{
+        while (last->next && last->next->cmp_min_to_max(key2) <= 0 &&
+               eq_tree(last->next->next_key_part,key2->next_key_part))
+        {
           /*
-            We've found the last overlapping key1 range in last.
-            This means that the ranges between (and including) the 
-            first overlapping range (tmp) and the last overlapping range
-            (last) are fully nested into the current range of key2 
-            and can safely be discarded. We just need the minimum endpoint
-            of the first overlapping range (tmp) so we can compare it with
-            the minimum endpoint of the enclosing key2 range.
+            last->next is covered by key2 and has same next_key_part.
+            last can be deleted
           */
-	  SEL_ARG *save=last;
-	  last=last->next;
-	  key1=key1->tree_delete(save);
-	}
+          SEL_ARG *save=last;
+          last=last->next;
+          key1=key1->tree_delete(save);
+        }
+        // Redirect tmp to last which will cover the entire range
+        tmp= last;
+
         /*
-          The tmp range (the first overlapping range) could have been discarded
-          by the previous loop. We should re-direct tmp to the new united range 
-          that's taking its place.
+          We need the minimum endpoint of first so we can compare it
+          with the minimum endpoint of the enclosing key2 range.
         */
-        tmp= last;
         last->copy_min(first);
         bool full_range= last->copy_min(key2);
         if (!full_range)
         {
           if (last->next && key2->cmp_max_to_min(last->next) >= 0)
           {
-            last->max_value= last->next->min_value;
-            if (last->next->min_flag & NEAR_MIN)
-              last->max_flag&= ~NEAR_MAX;
-            else
-              last->max_flag|= NEAR_MAX;
+            /*
+              This is the case:
+              key2:    [-------------]
+              key1:  [***------]  [xxxx]
+                     ^            ^
+                     last         different next_key_part
+
+              Extend range of last up to last->next:
+              key2:    [-------------]
+              key1:  [***--------][xxxx]
+            */
+            last->copy_min_to_max(last->next);
           }
           else
+            /*
+              This is the case:
+              key2:    [--------*****]
+              key1:  [***---------]    [xxxx]
+                     ^                 ^
+                     last              different next_key_part
+
+              Extend range of last up to max(last.max, key2.max):
+              key2:    [--------*****]
+              key1:  [***----------**] [xxxx]
+             */
             full_range= last->copy_max(key2);
         }
-	if (full_range)
-	{					// Full range
-	  key1->free_tree();
-	  for (; key2 ; key2=key2->next)
-	    key2->increment_use_count(-1);	// Free not used tree
-	  if (key1->maybe_flag)
-	    return new SEL_ARG(SEL_ARG::MAYBE_KEY);
-	  return 0;
-	}
+        if (full_range)
+        {                                       // Full range
+          key1->free_tree();
+          for (; key2 ; key2=key2->next)
+            key2->increment_use_count(-1);      // Free not used tree
+          if (key1->maybe_flag)
+            return new SEL_ARG(SEL_ARG::MAYBE_KEY);
+          return 0;
+        }
       }
     }
 
     if (cmp >= 0 && tmp->cmp_min_to_min(key2) < 0)
-    {						// tmp.min <= x < key2.min
+    {
+      /*
+        This is the case ("cmp>=0" means that tmp.max >= key2.min):
+        key2:              [----]
+        tmp:     [------------*****]
+      */
+
+      if (!tmp->next_key_part)
+      {
+        /*
+          tmp->next_key_part is empty: cut the range that is covered
+          by tmp from key2. 
+          Reason: (key2->next_key_part OR tmp->next_key_part) will be
+          empty and therefore equal to tmp->next_key_part. Thus, this
+          part of the key2 range is completely covered by tmp.
+        */
+        if (tmp->cmp_max_to_max(key2) >= 0)
+        {
+          /*
+            tmp covers the entire range in key2. 
+            key2:              [----]
+            tmp:     [-----------------]
+
+            Move on to next range in key2
+          */
+          key2->increment_use_count(-1); // Free not used tree
+          key2=key2->next;
+          continue;
+        }
+        else
+        {
+          /*
+            This is the case:
+            key2:           [-------]
+            tmp:     [---------]
+
+            Result:
+            key2:               [---]
+            tmp:     [---------]
+          */
+          key2->copy_max_to_min(tmp);
+          continue;
+        }
+      }
+
+      /*
+        The ranges are overlapping but have not been merged because
+        next_key_part of tmp and key2 differ. 
+        key2:              [----]
+        tmp:     [------------*****]
+
+        Split tmp in two where key2 starts:
+        key2:              [----]
+        key1:    [--------][--*****]
+                 ^         ^
+                 insert    tmp
+      */
       SEL_ARG *new_arg=tmp->clone_first(key2);
       if (!new_arg)
-	return 0;				// OOM
-      if ((new_arg->next_key_part= key1->next_key_part))
-	new_arg->increment_use_count(key1->use_count+1);
+        return 0;                               // OOM
+      if ((new_arg->next_key_part= tmp->next_key_part))
+        new_arg->increment_use_count(key1->use_count+1);
       tmp->copy_min_to_min(key2);
       key1=key1->insert(new_arg);
-    }
+    } // tmp.min >= key2.min due to this if()
 
-    // tmp.min >= key2.min && tmp.min <= key2.max
-    SEL_ARG key(*key2);				// Get copy we can modify
+    /*
+      Now key2.min <= tmp.min <= key2.max:
+      key2:   [---------]
+      tmp:    [****---*****]
+     */
+    SEL_ARG key2_cpy(*key2); // Get copy we can modify
     for (;;)
     {
-      if (tmp->cmp_min_to_min(&key) > 0)
-      {						// key.min <= x < tmp.min
-	SEL_ARG *new_arg=key.clone_first(tmp);
-	if (!new_arg)
-	  return 0;				// OOM
-	if ((new_arg->next_key_part=key.next_key_part))
-	  new_arg->increment_use_count(key1->use_count+1);
-	key1=key1->insert(new_arg);
-      }
-      if ((cmp=tmp->cmp_max_to_max(&key)) <= 0)
-      {						// tmp.min. <= x <= tmp.max
-	tmp->maybe_flag|= key.maybe_flag;
-	key.increment_use_count(key1->use_count+1);
-	tmp->next_key_part= key_or(param, tmp->next_key_part, key.next_key_part);
-	if (!cmp)				// Key2 is ready
-	  break;
-	key.copy_max_to_min(tmp);
-	if (!(tmp=tmp->next))
-	{
-	  SEL_ARG *tmp2= new SEL_ARG(key);
-	  if (!tmp2)
-	    return 0;				// OOM
-	  key1=key1->insert(tmp2);
-	  key2=key2->next;
-	  goto end;
-	}
-	if (tmp->cmp_min_to_max(&key) > 0)
-	{
-	  SEL_ARG *tmp2= new SEL_ARG(key);
-	  if (!tmp2)
-	    return 0;				// OOM
-	  key1=key1->insert(tmp2);
-	  break;
-	}
+      if (tmp->cmp_min_to_min(&key2_cpy) > 0)
+      {
+        /*
+          This is the case:
+          key2_cpy:    [------------]
+          key1:                 [-*****]
+                                ^
+                                tmp
+                             
+          Result:
+          key2_cpy:             [---]
+          key1:        [-------][-*****]
+                       ^        ^
+                       insert   tmp
+         */
+        SEL_ARG *new_arg=key2_cpy.clone_first(tmp);
+        if (!new_arg)
+          return 0; // OOM
+        if ((new_arg->next_key_part=key2_cpy.next_key_part))
+          new_arg->increment_use_count(key1->use_count+1);
+        key1=key1->insert(new_arg);
+        key2_cpy.copy_min_to_min(tmp);
+      } 
+      // Now key2_cpy.min == tmp.min
+
+      if ((cmp= tmp->cmp_max_to_max(&key2_cpy)) <= 0)
+      {
+        /*
+          tmp.max <= key2_cpy.max:
+          key2_cpy:   a)  [-------]    or b)     [----]
+          tmp:            [----]                 [----]
+
+          Steps:
+           1) Update next_key_part of tmp: OR it with key2_cpy->next_key_part.
+           2) If case a: Insert range [tmp.max, key2_cpy.max] into key1 using
+                         next_key_part of key2_cpy
+
+           Result:
+           key1:      a)  [----][-]    or b)     [----]
+         */
+        tmp->maybe_flag|= key2_cpy.maybe_flag;
+        key2_cpy.increment_use_count(key1->use_count+1);
+        tmp->next_key_part= key_or(param, tmp->next_key_part,
+                                   key2_cpy.next_key_part);
+
+        if (!cmp)
+          break;                     // case b: done with this key2 range
+
+        // Make key2_cpy the range [tmp.max, key2_cpy.max]
+        key2_cpy.copy_max_to_min(tmp);
+        if (!(tmp=tmp->next))
+        {
+          /*
+            No more ranges in key1. Insert key2_cpy and go to "end"
+            label to insert remaining ranges in key2 if any.
+          */
+          SEL_ARG *tmp2= new SEL_ARG(key2_cpy);
+          if (!tmp2)
+            return 0; // OOM
+          key1=key1->insert(tmp2);
+          key2=key2->next;
+          goto end;
+        }
+        if (tmp->cmp_min_to_max(&key2_cpy) > 0)
+        {
+          /*
+            The next range in key1 does not overlap with key2_cpy.
+            Insert this range into key1 and move on to the next range
+            in key2.
+          */
+          SEL_ARG *tmp2= new SEL_ARG(key2_cpy);
+          if (!tmp2)
+            return 0;                           // OOM
+          key1=key1->insert(tmp2);
+          break;
+        }
+        /*
+          key2_cpy overlaps with the next range in key1 and the case
+          is now "key2.min <= tmp.min <= key2.max". Go back to for(;;)
+          to handle this situation.
+        */
+        continue;
       }
       else
       {
-	SEL_ARG *new_arg=tmp->clone_last(&key); // tmp.min <= x <= key.max
-	if (!new_arg)
-	  return 0;				// OOM
-	tmp->copy_max_to_min(&key);
-	tmp->increment_use_count(key1->use_count+1);
-	/* Increment key count as it may be used for next loop */
-	key.increment_use_count(1);
-	new_arg->next_key_part= key_or(param, tmp->next_key_part, key.next_key_part);
-	key1=key1->insert(new_arg);
-	break;
+        /*
+          This is the case:
+          key2_cpy:   [-------]
+          tmp:        [------------]
+
+          Result:
+          key1:       [-------][---]
+                      ^        ^
+                      new_arg  tmp
+          Steps:
+           0) If tmp->next_key_part is empty: do nothing. Reason:
+              (key2_cpy->next_key_part OR tmp->next_key_part) will be
+              empty and therefore equal to tmp->next_key_part. Thus,
+              the range in key2_cpy is completely covered by tmp
+           1) Make new_arg with range [tmp.min, key2_cpy.max].
+              new_arg->next_key_part is OR between next_key_part
+              of tmp and key2_cpy
+           2) Make tmp the range [key2_cpy.max, tmp.max]
+           3) Insert new_arg into key1
+        */
+        if (!tmp->next_key_part) // Step 0
+        {
+          key2_cpy.increment_use_count(-1);     // Free not used tree
+          break;
+        }
+        SEL_ARG *new_arg=tmp->clone_last(&key2_cpy);
+        if (!new_arg)
+          return 0; // OOM
+        tmp->copy_max_to_min(&key2_cpy);
+        tmp->increment_use_count(key1->use_count+1);
+        /* Increment key count as it may be used for next loop */
+        key2_cpy.increment_use_count(1);
+        new_arg->next_key_part= key_or(param, tmp->next_key_part,
+                                       key2_cpy.next_key_part);
+        key1=key1->insert(new_arg);
+        break;
       }
     }
-    key2=key2->next;
+    // Move on to next range in key2
+    key2=key2->next;                            
   }
 
 end:
+  /*
+    Add key2 ranges that are non-overlapping with and higher than the
+    highest range in key1.
+  */
   while (key2)
   {
     SEL_ARG *next=key2->next;
     if (key2_shared)
     {
-      SEL_ARG *tmp=new SEL_ARG(*key2);		// Must make copy
+      SEL_ARG *tmp=new SEL_ARG(*key2);          // Must make copy
       if (!tmp)
-	return 0;
+        return 0;
       key2->increment_use_count(key1->use_count+1);
       key1=key1->insert(tmp);
     }
     else
-      key1=key1->insert(key2);			// Will destroy key2_root
+      key1=key1->insert(key2);                  // Will destroy key2_root
     key2=next;
   }
   key1->use_count++;
+
+  key1->max_part_no= max_part_no;
   return key1;
 }
 
@@ -7478,11 +9828,7 @@ static ulong count_key_part_usage(SEL_ARG *root, SEL_ARG *key)
 void SEL_ARG::test_use_count(SEL_ARG *root)
 {
   uint e_count=0;
-  if (this == root && use_count != 1)
-  {
-    sql_print_information("Use_count: Wrong count %lu for root",use_count);
-    return;
-  }
+
   if (this->type != SEL_ARG::KEY_RANGE)
     return;
   for (SEL_ARG *pos=first(); pos ; pos=pos->next)
@@ -7540,9 +9886,9 @@ ha_rows check_quick_select(PARAM *param, uint idx, bool index_only,
                            uint *mrr_flags, uint *bufsize, COST_VECT *cost)
 {
   SEL_ARG_RANGE_SEQ seq;
-  RANGE_SEQ_IF seq_if = {sel_arg_range_seq_init, sel_arg_range_seq_next, 0, 0};
+  RANGE_SEQ_IF seq_if = {NULL, sel_arg_range_seq_init, sel_arg_range_seq_next, 0, 0};
   handler *file= param->table->file;
-  ha_rows rows;
+  ha_rows rows= HA_POS_ERROR;
   uint keynr= param->real_keynr[idx];
   DBUG_ENTER("check_quick_select");
   
@@ -7575,25 +9921,31 @@ ha_rows check_quick_select(PARAM *param, uint idx, bool index_only,
   bool pk_is_clustered= file->primary_key_is_clustered();
   if (index_only && 
       (file->index_flags(keynr, param->max_key_part, 1) & HA_KEYREAD_ONLY) &&
-      !(pk_is_clustered && keynr == param->table->s->primary_key))
+      !(file->index_flags(keynr, param->max_key_part, 1) & HA_CLUSTERED_INDEX))
      *mrr_flags |= HA_MRR_INDEX_ONLY;
   
-  if (current_thd->lex->sql_command != SQLCOM_SELECT)
+  if (param->thd->lex->sql_command != SQLCOM_SELECT)
     *mrr_flags |= HA_MRR_USE_DEFAULT_IMPL;
 
   *bufsize= param->thd->variables.mrr_buff_size;
-  rows= file->multi_range_read_info_const(keynr, &seq_if, (void*)&seq, 0,
-                                          bufsize, mrr_flags, cost);
+  /*
+    Skip materialized derived table/view result tables in the MRR check, as
+    they do not contain any data yet.
+  */
+  if (param->table->pos_in_table_list->is_non_derived())
+    rows= file->multi_range_read_info_const(keynr, &seq_if, (void*)&seq, 0,
+                                            bufsize, mrr_flags, cost);
   if (rows != HA_POS_ERROR)
   {
-    param->table->quick_rows[keynr]=rows;
+    param->quick_rows[keynr]= rows;
     if (update_tbl_stats)
     {
       param->table->quick_keys.set_bit(keynr);
-      param->table->quick_key_parts[keynr]=param->max_key_part+1;
+      param->table->quick_key_parts[keynr]= param->max_key_part+1;
       param->table->quick_n_ranges[keynr]= param->range_count;
       param->table->quick_condition_rows=
         min(param->table->quick_condition_rows, rows);
+      param->table->quick_rows[keynr]= rows;
     }
   }
   /* Figure out if the key scan is ROR (returns rows in ROWID order) or not */
@@ -7940,7 +10292,7 @@ bool QUICK_SELECT_I::is_keys_used(const MY_BITMAP *fields)
   return is_key_used(head, index, fields);
 }
 
-bool QUICK_INDEX_MERGE_SELECT::is_keys_used(const MY_BITMAP *fields)
+bool QUICK_INDEX_SORT_SELECT::is_keys_used(const MY_BITMAP *fields)
 {
   QUICK_RANGE_SELECT *quick;
   List_iterator_fast<QUICK_RANGE_SELECT> it(quick_selects);
@@ -7954,11 +10306,11 @@ bool QUICK_INDEX_MERGE_SELECT::is_keys_used(const MY_BITMAP *fields)
 
 bool QUICK_ROR_INTERSECT_SELECT::is_keys_used(const MY_BITMAP *fields)
 {
-  QUICK_RANGE_SELECT *quick;
-  List_iterator_fast<QUICK_RANGE_SELECT> it(quick_selects);
-  while ((quick= it++))
+  QUICK_SELECT_WITH_RECORD *qr;
+  List_iterator_fast<QUICK_SELECT_WITH_RECORD> it(quick_selects);
+  while ((qr= it++))
   {
-    if (is_key_used(head, quick->index, fields))
+    if (is_key_used(head, qr->quick->index, fields))
       return 1;
   }
   return 0;
@@ -8094,6 +10446,7 @@ QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table,
 
   quick->mrr_buf_size= thd->variables.mrr_buff_size;
   if (table->file->multi_range_read_info(quick->index, 1, (uint)records,
+                                         ~0, 
                                          &quick->mrr_buf_size,
                                          &quick->mrr_flags, &cost))
     goto err;
@@ -8122,13 +10475,23 @@ err:
     other error
 */
 
-int QUICK_INDEX_MERGE_SELECT::read_keys_and_merge()
+int read_keys_and_merge_scans(THD *thd,
+                              TABLE *head,
+                              List<QUICK_RANGE_SELECT> quick_selects,
+                              QUICK_RANGE_SELECT *pk_quick_select,
+                              READ_RECORD *read_record,
+                              bool intersection,
+                              key_map *filtered_scans,
+                              Unique **unique_ptr)
 {
   List_iterator_fast<QUICK_RANGE_SELECT> cur_quick_it(quick_selects);
   QUICK_RANGE_SELECT* cur_quick;
   int result;
+  Unique *unique= *unique_ptr;
   handler *file= head->file;
-  DBUG_ENTER("QUICK_INDEX_MERGE_SELECT::read_keys_and_merge");
+  bool with_cpk_filter= pk_quick_select != NULL;
+
+  DBUG_ENTER("read_keys_and_merge");
 
   /* We're going to just read rowids. */
   if (!head->key_read)
@@ -8139,6 +10502,7 @@ int QUICK_INDEX_MERGE_SELECT::read_keys_and_merge()
 
   cur_quick_it.rewind();
   cur_quick= cur_quick_it++;
+  bool first_quick= TRUE;
   DBUG_ASSERT(cur_quick != 0);
   
   /*
@@ -8156,9 +10520,11 @@ int QUICK_INDEX_MERGE_SELECT::read_keys_and_merge()
 
     unique= new Unique(refpos_order_cmp, (void *)file,
                        file->ref_length,
-                       thd->variables.sortbuff_size);
+                       thd->variables.sortbuff_size,
+		       intersection ? quick_selects.elements : 0);                     
     if (!unique)
       goto err;
+    *unique_ptr= unique;
   }
   else
     unique->reset();
@@ -8170,6 +10536,14 @@ int QUICK_INDEX_MERGE_SELECT::read_keys_and_merge()
   {
     while ((result= cur_quick->get_next()) == HA_ERR_END_OF_FILE)
     {
+      if (intersection)
+        with_cpk_filter= filtered_scans->is_set(cur_quick->index);
+      if (first_quick)
+      {
+        first_quick= FALSE;
+        if (intersection && unique->is_in_memory())
+          unique->close_for_expansion();
+      }
       cur_quick->range_end();
       cur_quick= cur_quick_it++;
       if (!cur_quick)
@@ -8194,8 +10568,8 @@ int QUICK_INDEX_MERGE_SELECT::read_keys_and_merge()
     if (thd->killed)
       goto err;
 
-    /* skip row if it will be retrieved by clustered PK scan */
-    if (pk_quick_select && pk_quick_select->row_in_ranges())
+    if (with_cpk_filter &&
+        pk_quick_select->row_in_ranges() != intersection )
       continue;
 
     cur_quick->file->position(cur_quick->record);
@@ -8209,14 +10583,13 @@ int QUICK_INDEX_MERGE_SELECT::read_keys_and_merge()
     sequence.
   */
   result= unique->get(head);
-  doing_pk_scan= FALSE;
   /*
-    index_merge currently doesn't support "using index" at all
+    index merge currently doesn't support "using index" at all
   */
   head->disable_keyread();
-  if (init_read_record(&read_record, thd, head, (SQL_SELECT*) 0, 1 , 1, TRUE))
+  if (init_read_record(read_record, thd, head, (SQL_SELECT*) 0, 1 , 1, TRUE))
     result= 1;
-  DBUG_RETURN(result);
+ DBUG_RETURN(result);
 
 err:
   head->disable_keyread();
@@ -8224,6 +10597,17 @@ err:
 }
 
 
+int QUICK_INDEX_MERGE_SELECT::read_keys_and_merge()
+
+{
+  int result;
+  DBUG_ENTER("QUICK_INDEX_MERGE_SELECT::read_keys_and_merge");
+  result= read_keys_and_merge_scans(thd, head, quick_selects, pk_quick_select,
+                                    &read_record, FALSE, NULL, &unique);
+  doing_pk_scan= FALSE;
+  DBUG_RETURN(result);
+}
+
 /*
   Get next row for index_merge.
   NOTES
@@ -8260,6 +10644,32 @@ int QUICK_INDEX_MERGE_SELECT::get_next()
   DBUG_RETURN(result);
 }
 
+int QUICK_INDEX_INTERSECT_SELECT::read_keys_and_merge()
+
+{
+  int result;
+  DBUG_ENTER("QUICK_INDEX_INTERSECT_SELECT::read_keys_and_merge");
+  result= read_keys_and_merge_scans(thd, head, quick_selects, pk_quick_select,
+                                    &read_record, TRUE, &filtered_scans,
+                                    &unique);
+  DBUG_RETURN(result);
+}
+
+int QUICK_INDEX_INTERSECT_SELECT::get_next()
+{
+  int result;
+  DBUG_ENTER("QUICK_INDEX_INTERSECT_SELECT::get_next");
+
+  if ((result= read_record.read_record(&read_record)) == -1)
+  {
+    result= HA_ERR_END_OF_FILE;
+    end_read_record(&read_record);
+    free_io_cache(head);
+  }
+
+  DBUG_RETURN(result);
+}
+
 
 /*
   Retrieve next record.
@@ -8283,7 +10693,8 @@ int QUICK_INDEX_MERGE_SELECT::get_next()
 
 int QUICK_ROR_INTERSECT_SELECT::get_next()
 {
-  List_iterator_fast<QUICK_RANGE_SELECT> quick_it(quick_selects);
+  List_iterator_fast<QUICK_SELECT_WITH_RECORD> quick_it(quick_selects);
+  QUICK_SELECT_WITH_RECORD *qr;
   QUICK_RANGE_SELECT* quick;
   int error, cmp;
   uint last_rowid_count=0;
@@ -8292,7 +10703,8 @@ int QUICK_ROR_INTERSECT_SELECT::get_next()
   do
   {
     /* Get a rowid for first quick and save it as a 'candidate' */
-    quick= quick_it++;
+    qr= quick_it++;
+    quick= qr->quick;
     error= quick->get_next();
     if (cpk_quick)
     {
@@ -8302,17 +10714,22 @@ int QUICK_ROR_INTERSECT_SELECT::get_next()
     if (error)
       DBUG_RETURN(error);
 
+    /* Save the read key tuple */
+    key_copy(qr->key_tuple, record, head->key_info + quick->index,
+             quick->max_used_key_length);
+
     quick->file->position(quick->record);
     memcpy(last_rowid, quick->file->ref, head->file->ref_length);
     last_rowid_count= 1;
 
     while (last_rowid_count < quick_selects.elements)
     {
-      if (!(quick= quick_it++))
+      if (!(qr= quick_it++))
       {
         quick_it.rewind();
-        quick= quick_it++;
+        qr= quick_it++;
       }
+      quick= qr->quick;
 
       do
       {
@@ -8322,6 +10739,9 @@ int QUICK_ROR_INTERSECT_SELECT::get_next()
         cmp= head->file->cmp_ref(quick->file->ref, last_rowid);
       } while (cmp < 0);
 
+      key_copy(qr->key_tuple, record, head->key_info + quick->index,
+               quick->max_used_key_length);
+
       /* Ok, current select 'caught up' and returned ref >= cur_ref */
       if (cmp > 0)
       {
@@ -8337,6 +10757,10 @@ int QUICK_ROR_INTERSECT_SELECT::get_next()
         }
         memcpy(last_rowid, quick->file->ref, head->file->ref_length);
         last_rowid_count= 1;
+
+        //save the fields here
+        key_copy(qr->key_tuple, record, head->key_info + quick->index,
+                 quick->max_used_key_length);
       }
       else
       {
@@ -8349,6 +10773,21 @@ int QUICK_ROR_INTERSECT_SELECT::get_next()
     if (need_to_fetch_row)
       error= head->file->ha_rnd_pos(head->record[0], last_rowid);
   } while (error == HA_ERR_RECORD_DELETED);
+
+  if (!need_to_fetch_row)
+  {
+    /* Restore the columns we've read/saved with other quick selects */
+    quick_it.rewind();
+    while ((qr= quick_it++))
+    {
+      if (qr->quick != quick)
+      {
+        key_restore(record, qr->key_tuple, head->key_info + qr->quick->index,
+                    qr->quick->max_used_key_length);
+      }
+    }
+  }
+
   DBUG_RETURN(error);
 }
 
@@ -8469,7 +10908,7 @@ int QUICK_RANGE_SELECT::reset()
   if (!mrr_buf_desc)
     empty_buf.buffer= empty_buf.buffer_end= empty_buf.end_of_used_area= NULL;
  
-  RANGE_SEQ_IF seq_funcs= {quick_range_seq_init, quick_range_seq_next, 0, 0};
+  RANGE_SEQ_IF seq_funcs= {NULL, quick_range_seq_init, quick_range_seq_next, 0, 0};
   error= file->multi_range_read_init(&seq_funcs, (void*)this, ranges.elements,
                                      mrr_flags, mrr_buf_desc? mrr_buf_desc: 
                                                               &empty_buf);
@@ -8494,7 +10933,7 @@ int QUICK_RANGE_SELECT::reset()
 
 int QUICK_RANGE_SELECT::get_next()
 {
-  char *dummy;
+  range_id_t dummy;
   DBUG_ENTER("QUICK_RANGE_SELECT::get_next");
   if (in_ror_merged_scan)
   {
@@ -8893,30 +11332,53 @@ bool QUICK_SELECT_DESC::range_reads_after_key(QUICK_RANGE *range_arg)
 }
 
 
-void QUICK_RANGE_SELECT::add_info_string(String *str)
+void QUICK_SELECT_I::add_key_name(String *str, bool *first)
 {
   KEY *key_info= head->key_info + index;
+
+  if (*first)
+    *first= FALSE;
+  else
+    str->append(',');
   str->append(key_info->name);
 }
+ 
+
+void QUICK_RANGE_SELECT::add_info_string(String *str)
+{
+  bool first= TRUE;
+  
+  add_key_name(str, &first);
+}
 
 void QUICK_INDEX_MERGE_SELECT::add_info_string(String *str)
 {
   QUICK_RANGE_SELECT *quick;
   bool first= TRUE;
   List_iterator_fast<QUICK_RANGE_SELECT> it(quick_selects);
+
   str->append(STRING_WITH_LEN("sort_union("));
   while ((quick= it++))
   {
-    if (!first)
-      str->append(',');
-    else
-      first= FALSE;
-    quick->add_info_string(str);
+    quick->add_key_name(str, &first);
   }
   if (pk_quick_select)
+    pk_quick_select->add_key_name(str, &first);
+  str->append(')');
+}
+
+void QUICK_INDEX_INTERSECT_SELECT::add_info_string(String *str)
+{
+  QUICK_RANGE_SELECT *quick;
+  bool first= TRUE;
+  List_iterator_fast<QUICK_RANGE_SELECT> it(quick_selects);
+
+  str->append(STRING_WITH_LEN("sort_intersect("));
+  if (pk_quick_select)
+    pk_quick_select->add_key_name(str, &first);
+  while ((quick= it++))
   {
-    str->append(',');
-    pk_quick_select->add_info_string(str);
+    quick->add_key_name(str, &first);
   }
   str->append(')');
 }
@@ -8924,132 +11386,127 @@ void QUICK_INDEX_MERGE_SELECT::add_info_string(String *str)
 void QUICK_ROR_INTERSECT_SELECT::add_info_string(String *str)
 {
   bool first= TRUE;
-  QUICK_RANGE_SELECT *quick;
-  List_iterator_fast<QUICK_RANGE_SELECT> it(quick_selects);
+  QUICK_SELECT_WITH_RECORD *qr;
+  List_iterator_fast<QUICK_SELECT_WITH_RECORD> it(quick_selects);
+
   str->append(STRING_WITH_LEN("intersect("));
-  while ((quick= it++))
+  while ((qr= it++))
   {
-    KEY *key_info= head->key_info + quick->index;
-    if (!first)
-      str->append(',');
-    else
-      first= FALSE;
-    str->append(key_info->name);
+    qr->quick->add_key_name(str, &first);
   }
   if (cpk_quick)
-  {
-    KEY *key_info= head->key_info + cpk_quick->index;
-    str->append(',');
-    str->append(key_info->name);
-  }
+    cpk_quick->add_key_name(str, &first);
   str->append(')');
 }
 
+
 void QUICK_ROR_UNION_SELECT::add_info_string(String *str)
 {
-  bool first= TRUE;
   QUICK_SELECT_I *quick;
+  bool first= TRUE;
   List_iterator_fast<QUICK_SELECT_I> it(quick_selects);
+
   str->append(STRING_WITH_LEN("union("));
   while ((quick= it++))
   {
-    if (!first)
-      str->append(',');
-    else
+    if (first)
       first= FALSE;
+    else
+      str->append(',');
     quick->add_info_string(str);
   }
   str->append(')');
 }
 
 
-void QUICK_RANGE_SELECT::add_keys_and_lengths(String *key_names,
-                                              String *used_lengths)
+void QUICK_SELECT_I::add_key_and_length(String *key_names,
+                                        String *used_lengths,
+                                        bool *first)
 {
   char buf[64];
   uint length;
   KEY *key_info= head->key_info + index;
+
+  if (*first)
+    *first= FALSE;
+  else
+  {
+    key_names->append(',');
+    used_lengths->append(',');
+  }
   key_names->append(key_info->name);
   length= longlong10_to_str(max_used_key_length, buf, 10) - buf;
   used_lengths->append(buf, length);
 }
 
+
+void QUICK_RANGE_SELECT::add_keys_and_lengths(String *key_names,
+                                              String *used_lengths)
+{
+  bool first= TRUE;
+
+  add_key_and_length(key_names, used_lengths, &first);
+}
+
 void QUICK_INDEX_MERGE_SELECT::add_keys_and_lengths(String *key_names,
                                                     String *used_lengths)
 {
-  char buf[64];
-  uint length;
-  bool first= TRUE;
   QUICK_RANGE_SELECT *quick;
+  bool first= TRUE;
 
   List_iterator_fast<QUICK_RANGE_SELECT> it(quick_selects);
+
   while ((quick= it++))
   {
-    if (first)
-      first= FALSE;
-    else
-    {
-      key_names->append(',');
-      used_lengths->append(',');
-    }
-
-    KEY *key_info= head->key_info + quick->index;
-    key_names->append(key_info->name);
-    length= longlong10_to_str(quick->max_used_key_length, buf, 10) - buf;
-    used_lengths->append(buf, length);
+    quick->add_key_and_length(key_names, used_lengths, &first);
   }
+
   if (pk_quick_select)
-  {
-    KEY *key_info= head->key_info + pk_quick_select->index;
-    key_names->append(',');
-    key_names->append(key_info->name);
-    length= (longlong10_to_str(pk_quick_select->max_used_key_length, buf, 10)
-             - buf);
-    used_lengths->append(',');
-    used_lengths->append(buf, length);
-  }
+    pk_quick_select->add_key_and_length(key_names, used_lengths, &first);
 }
 
-void QUICK_ROR_INTERSECT_SELECT::add_keys_and_lengths(String *key_names,
-                                                      String *used_lengths)
+
+void QUICK_INDEX_INTERSECT_SELECT::add_keys_and_lengths(String *key_names,
+                                                        String *used_lengths)
 {
-  char buf[64];
-  uint length;
-  bool first= TRUE;
   QUICK_RANGE_SELECT *quick;
+  bool first= TRUE;
+
   List_iterator_fast<QUICK_RANGE_SELECT> it(quick_selects);
+
+  if (pk_quick_select)
+    pk_quick_select->add_key_and_length(key_names, used_lengths, &first);
+
   while ((quick= it++))
   {
-    KEY *key_info= head->key_info + quick->index;
-    if (first)
-      first= FALSE;
-    else
-    {
-      key_names->append(',');
-      used_lengths->append(',');
-    }
-    key_names->append(key_info->name);
-    length= longlong10_to_str(quick->max_used_key_length, buf, 10) - buf;
-    used_lengths->append(buf, length);
+    quick->add_key_and_length(key_names, used_lengths, &first);
   }
+}
 
-  if (cpk_quick)
+void QUICK_ROR_INTERSECT_SELECT::add_keys_and_lengths(String *key_names,
+                                                      String *used_lengths)
+{
+  QUICK_SELECT_WITH_RECORD *qr;
+  bool first= TRUE;
+
+  List_iterator_fast<QUICK_SELECT_WITH_RECORD> it(quick_selects);
+
+  while ((qr= it++))
   {
-    KEY *key_info= head->key_info + cpk_quick->index;
-    key_names->append(',');
-    key_names->append(key_info->name);
-    length= longlong10_to_str(cpk_quick->max_used_key_length, buf, 10) - buf;
-    used_lengths->append(',');
-    used_lengths->append(buf, length);
+    qr->quick->add_key_and_length(key_names, used_lengths, &first);
   }
+  if (cpk_quick)
+    cpk_quick->add_key_and_length(key_names, used_lengths, &first);
 }
 
 void QUICK_ROR_UNION_SELECT::add_keys_and_lengths(String *key_names,
                                                   String *used_lengths)
 {
-  bool first= TRUE;
   QUICK_SELECT_I *quick;
+  bool first= TRUE;
+
   List_iterator_fast<QUICK_SELECT_I> it(quick_selects);
+
   while ((quick= it++))
   {
     if (first)
@@ -9078,7 +11535,7 @@ static bool get_constant_key_infix(KEY *index_info, SEL_ARG *index_range_tree,
                        uchar *key_infix, uint *key_infix_len,
                        KEY_PART_INFO **first_non_infix_part);
 static bool
-check_group_min_max_predicates(COND *cond, Item_field *min_max_arg_item,
+check_group_min_max_predicates(Item *cond, Item_field *min_max_arg_item,
                                Field::imagetype image_type);
 
 static void
@@ -9241,7 +11698,7 @@ get_best_group_min_max(PARAM *param, SEL_TREE *tree, double read_time)
   /* Perform few 'cheap' tests whether this access method is applicable. */
   if (!join)
     DBUG_RETURN(NULL);        /* This is not a select statement. */
-  if ((join->tables != 1) ||  /* The query must reference one table. */
+  if ((join->table_count != 1) ||  /* The query must reference one table. */
       (join->select_lex->olap == ROLLUP_TYPE)) /* Check (B3) for ROLLUP */
     DBUG_RETURN(NULL);
   if (table->s->keys == 0)        /* There are no indexes to use. */
@@ -9677,14 +12134,14 @@ get_best_group_min_max(PARAM *param, SEL_TREE *tree, double read_time)
 */
 
 static bool
-check_group_min_max_predicates(COND *cond, Item_field *min_max_arg_item,
+check_group_min_max_predicates(Item *cond, Item_field *min_max_arg_item,
                                Field::imagetype image_type)
 {
   DBUG_ENTER("check_group_min_max_predicates");
   DBUG_ASSERT(cond && min_max_arg_item);
 
   cond= cond->real_item();
-  Item::Type cond_type= cond->type();
+  Item::Type cond_type= cond->real_type();
   if (cond_type == Item::COND_ITEM) /* 'AND' or 'OR' */
   {
     DBUG_PRINT("info", ("Analyzing: %s", ((Item_func*) cond)->func_name()));
@@ -9700,18 +12157,27 @@ check_group_min_max_predicates(COND *cond, Item_field *min_max_arg_item,
   }
 
   /*
-    TODO:
-    This is a very crude fix to handle sub-selects in the WHERE clause
-    (Item_subselect objects). With the test below we rule out from the
-    optimization all queries with subselects in the WHERE clause. What has to
-    be done, is that here we should analyze whether the subselect references
-    the MIN/MAX argument field, and disallow the optimization only if this is
-    so.
+    Disallow loose index scan if the MIN/MAX argument field is referenced by
+    a subquery in the WHERE clause.
   */
-  if (cond_type == Item::SUBSELECT_ITEM ||
-      (cond->get_cached_item() &&
-       cond->get_cached_item()->type() == Item::SUBSELECT_ITEM))
-    DBUG_RETURN(FALSE);
+
+  if (cond_type == Item::SUBSELECT_ITEM)
+  {
+    Item_subselect *subs_cond= (Item_subselect*) cond;
+    if (subs_cond->is_correlated)
+    {
+      DBUG_ASSERT(subs_cond->upper_refs.elements > 0);
+      List_iterator_fast<Item_subselect::Ref_to_outside>
+        li(subs_cond->upper_refs);
+      Item_subselect::Ref_to_outside *dep;
+      while ((dep= li++))
+      {
+        if (dep->item->eq(min_max_arg_item, FALSE))
+          DBUG_RETURN(FALSE);
+      }
+    }
+    DBUG_RETURN(TRUE);
+  }
 
   /*
     Condition of the form 'field' is equivalent to 'field <> 0' and thus
@@ -9858,13 +12324,14 @@ get_constant_key_infix(KEY *index_info, SEL_ARG *index_range_tree,
       Find the range tree for the current keypart. We assume that
       index_range_tree points to the leftmost keypart in the index.
     */
-    for (cur_range= index_range_tree; cur_range;
+    for (cur_range= index_range_tree; 
+         cur_range && cur_range->type == SEL_ARG::KEY_RANGE;
          cur_range= cur_range->next_key_part)
     {
       if (cur_range->field->eq(cur_part->field))
         break;
     }
-    if (!cur_range)
+    if (!cur_range || cur_range->type != SEL_ARG::KEY_RANGE)
     {
       if (min_max_arg_part)
         return FALSE; /* The current keypart has no range predicates at all. */
@@ -11053,6 +13520,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::next_min_in_range()
       /* Compare the found key with max_key. */
       int cmp_res= key_cmp(index_info->key_part, max_key,
                            real_prefix_len + min_max_arg_len);
+      my_afree(max_key);
       /*
         The key is outside of the range if: 
         the interval is open and the key is equal to the maximum boundry
@@ -11178,6 +13646,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::next_max_in_range()
       /* Compare the found key with min_key. */
       int cmp_res= key_cmp(index_info->key_part, min_key,
                            real_prefix_len + min_max_arg_len);
+      my_afree(min_key);
       /*
         The key is outside of the range if: 
         the interval is open and the key is equal to the minimum boundry
@@ -11278,11 +13747,9 @@ void QUICK_GROUP_MIN_MAX_SELECT::update_max_result()
 void QUICK_GROUP_MIN_MAX_SELECT::add_keys_and_lengths(String *key_names,
                                                       String *used_lengths)
 {
-  char buf[64];
-  uint length;
-  key_names->append(index_info->name);
-  length= longlong10_to_str(max_used_key_length, buf, 10) - buf;
-  used_lengths->append(buf, length);
+  bool first= TRUE;
+
+  add_key_and_length(key_names, used_lengths, &first);
 }
 
 
@@ -11313,7 +13780,8 @@ static void print_sel_tree(PARAM *param, SEL_TREE *tree, key_map *tree_map,
   if (!tmp.length())
     tmp.append(STRING_WITH_LEN("(empty)"));
 
-  DBUG_PRINT("info", ("SEL_TREE: 0x%lx (%s)  scans: %s", (long) tree, msg, tmp.ptr()));
+  DBUG_PRINT("info", ("SEL_TREE: 0x%lx (%s)  scans: %s", (long) tree, msg,
+                      tmp.c_ptr_safe()));
 
   DBUG_VOID_RETURN;
 }
@@ -11336,7 +13804,7 @@ static void print_ror_scans_arr(TABLE *table, const char *msg,
   }
   if (!tmp.length())
     tmp.append(STRING_WITH_LEN("(empty)"));
-  DBUG_PRINT("info", ("ROR key scans (%s): %s", msg, tmp.ptr()));
+  DBUG_PRINT("info", ("ROR key scans (%s): %s", msg, tmp.c_ptr()));
   DBUG_VOID_RETURN;
 }
 
@@ -11353,7 +13821,6 @@ print_key(KEY_PART *key_part, const uchar *key, uint used_length)
 {
   char buff[1024];
   const uchar *key_end= key+used_length;
-  String tmp(buff,sizeof(buff),&my_charset_bin);
   uint store_length;
   TABLE *table= key_part->field->table;
   my_bitmap_map *old_sets[2];
@@ -11362,6 +13829,7 @@ print_key(KEY_PART *key_part, const uchar *key, uint used_length)
 
   for (; key < key_end; key+=store_length, key_part++)
   {
+    String tmp(buff,sizeof(buff),&my_charset_bin);
     Field *field=      key_part->field;
     store_length= key_part->store_length;
 
@@ -11449,8 +13917,7 @@ void QUICK_RANGE_SELECT::dbug_dump(int indent, bool verbose)
   /* purecov: end */    
 }
 
-
-void QUICK_INDEX_MERGE_SELECT::dbug_dump(int indent, bool verbose)
+void QUICK_INDEX_SORT_SELECT::dbug_dump(int indent, bool verbose)
 {
   List_iterator_fast<QUICK_RANGE_SELECT> it(quick_selects);
   QUICK_RANGE_SELECT *quick;
@@ -11468,13 +13935,13 @@ void QUICK_INDEX_MERGE_SELECT::dbug_dump(int indent, bool verbose)
 
 void QUICK_ROR_INTERSECT_SELECT::dbug_dump(int indent, bool verbose)
 {
-  List_iterator_fast<QUICK_RANGE_SELECT> it(quick_selects);
-  QUICK_RANGE_SELECT *quick;
+  List_iterator_fast<QUICK_SELECT_WITH_RECORD> it(quick_selects);
+  QUICK_SELECT_WITH_RECORD *qr;
   fprintf(DBUG_FILE, "%*squick ROR-intersect select, %scovering\n",
           indent, "", need_to_fetch_row? "":"non-");
   fprintf(DBUG_FILE, "%*smerged scans {\n", indent, "");
-  while ((quick= it++))
-    quick->dbug_dump(indent+2, verbose);
+  while ((qr= it++))
+    qr->quick->dbug_dump(indent+2, verbose);
   if (cpk_quick)
   {
     fprintf(DBUG_FILE, "%*sclustered PK quick:\n", indent, "");
diff --git a/sql/opt_range.h b/sql/opt_range.h
index 1340f0a5525..38a47dffc30 100644
--- a/sql/opt_range.h
+++ b/sql/opt_range.h
@@ -288,7 +288,6 @@ public:
 
   virtual bool reverse_sorted() = 0;
   virtual bool unique_key_range() { return false; }
-  virtual bool clustered_pk_range() { return false; }
 
   /*
     Request that this quick select produces sorted output. Not all quick
@@ -298,12 +297,13 @@ public:
   virtual void need_sorted_output() = 0;
   enum {
     QS_TYPE_RANGE = 0,
-    QS_TYPE_INDEX_MERGE = 1,
-    QS_TYPE_RANGE_DESC = 2,
-    QS_TYPE_FULLTEXT   = 3,
-    QS_TYPE_ROR_INTERSECT = 4,
-    QS_TYPE_ROR_UNION = 5,
-    QS_TYPE_GROUP_MIN_MAX = 6
+    QS_TYPE_INDEX_INTERSECT = 1,
+    QS_TYPE_INDEX_MERGE = 2,
+    QS_TYPE_RANGE_DESC = 3,
+    QS_TYPE_FULLTEXT   = 4,
+    QS_TYPE_ROR_INTERSECT = 5,
+    QS_TYPE_ROR_UNION = 6,
+    QS_TYPE_GROUP_MIN_MAX = 7
   };
 
   /* Get type of this quick select - one of the QS_TYPE_* values */
@@ -329,6 +329,10 @@ public:
     Save ROWID of last retrieved row in file->ref. This used in ROR-merging.
   */
   virtual void save_last_pos(){};
+  
+  void add_key_and_length(String *key_names,
+                          String *used_lengths,
+                          bool *first);
 
   /*
     Append comma-separated list of keys this quick select uses to key_names;
@@ -338,13 +342,16 @@ public:
   virtual void add_keys_and_lengths(String *key_names,
                                     String *used_lengths)=0;
 
+  void add_key_name(String *str, bool *first);
+
   /*
     Append text representation of quick select structure (what and how is
     merged) to str. The result is added to "Extra" field in EXPLAIN output.
     This function is implemented only by quick selects that merge other quick
     selects output and/or can produce output suitable for merging.
   */
-  virtual void add_info_string(String *str) {};
+  virtual void add_info_string(String *str) {}
+
   /*
     Return 1 if any index used by this quick select
     uses field which is marked in passed bitmap.
@@ -400,7 +407,7 @@ typedef struct st_quick_range_seq_ctx
 } QUICK_RANGE_SEQ_CTX;
 
 range_seq_t quick_range_seq_init(void *init_param, uint n_ranges, uint flags);
-uint quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range);
+bool quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range);
 
 
 /*
@@ -486,12 +493,23 @@ private:
                                               uint mrr_buf_size,
                                               MEM_ROOT *alloc);
   friend class QUICK_SELECT_DESC;
+  friend class QUICK_INDEX_SORT_SELECT;
   friend class QUICK_INDEX_MERGE_SELECT;
   friend class QUICK_ROR_INTERSECT_SELECT;
+  friend class QUICK_INDEX_INTERSECT_SELECT;
   friend class QUICK_GROUP_MIN_MAX_SELECT;
-  friend uint quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range);
+  friend bool quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range);
   friend range_seq_t quick_range_seq_init(void *init_param,
                                           uint n_ranges, uint flags);
+  friend 
+  int read_keys_and_merge_scans(THD *thd, TABLE *head,
+                                List<QUICK_RANGE_SELECT> quick_selects,
+                                QUICK_RANGE_SELECT *pk_quick_select,
+                                READ_RECORD *read_record,
+                                bool intersection,
+                                key_map *filtered_scans,
+                                Unique **unique_ptr);
+
 };
 
 
@@ -509,40 +527,43 @@ public:
 
 
 /*
-  QUICK_INDEX_MERGE_SELECT - index_merge access method quick select.
+  QUICK_INDEX_SORT_SELECT is the base class for the common functionality of:
+  - QUICK_INDEX_MERGE_SELECT, access based on multi-index merge/union 
+  - QUICK_INDEX_INTERSECT_SELECT, access based on  multi-index intersection 
+    
 
-    QUICK_INDEX_MERGE_SELECT uses
+    QUICK_INDEX_SORT_SELECT uses
      * QUICK_RANGE_SELECTs to get rows
-     * Unique class to remove duplicate rows
+     * Unique class
+       - to remove duplicate rows for QUICK_INDEX_MERGE_SELECT
+       - to intersect rows for QUICK_INDEX_INTERSECT_SELECT
 
   INDEX MERGE OPTIMIZER
-    Current implementation doesn't detect all cases where index_merge could
+    Current implementation doesn't detect all cases where index merge could
     be used, in particular:
-     * index_merge will never be used if range scan is possible (even if
-       range scan is more expensive)
 
-     * index_merge+'using index' is not supported (this the consequence of
-       the above restriction)
+     * index_merge+'using index' is not supported
 
      * If WHERE part contains complex nested AND and OR conditions, some ways
-       to retrieve rows using index_merge will not be considered. The choice
+       to retrieve rows using index merge will not be considered. The choice
        of read plan may depend on the order of conjuncts/disjuncts in WHERE
        part of the query, see comments near imerge_list_or_list and
        SEL_IMERGE::or_sel_tree_with_checks functions for details.
 
-     * There is no "index_merge_ref" method (but index_merge on non-first
+     * There is no "index_merge_ref" method (but index merge on non-first
        table in join is possible with 'range checked for each record').
 
-    See comments around SEL_IMERGE class and test_quick_select for more
-    details.
 
   ROW RETRIEVAL ALGORITHM
 
-    index_merge uses Unique class for duplicates removal.  index_merge takes
-    advantage of Clustered Primary Key (CPK) if the table has one.
-    The index_merge algorithm consists of two phases:
+    index merge/intersection uses Unique class for duplicates removal. 
+    index merge/intersection takes advantage of Clustered Primary Key (CPK)
+    if the table has one.
+    The index merge/intersection algorithm consists of two phases:
+
+    Phase 1 
+    (implemented by a QUICK_INDEX_MERGE_SELECT::read_keys_and_merge call):
 
-    Phase 1 (implemented in QUICK_INDEX_MERGE_SELECT::prepare_unique):
     prepare()
     {
       activate 'index only';
@@ -556,33 +577,32 @@ public:
       deactivate 'index only';
     }
 
-    Phase 2 (implemented as sequence of QUICK_INDEX_MERGE_SELECT::get_next
-    calls):
+    Phase 2 
+    (implemented as sequence of QUICK_INDEX_MERGE_SELECT::get_next calls):
 
     fetch()
     {
-      retrieve all rows from row pointers stored in Unique;
+      retrieve all rows from row pointers stored in Unique
+      (merging/intersecting them);
       free Unique;
-      retrieve all rows for CPK scan;
+      if (! intersection) 
+        retrieve all rows for CPK scan;
     }
 */
 
-class QUICK_INDEX_MERGE_SELECT : public QUICK_SELECT_I
+class QUICK_INDEX_SORT_SELECT : public QUICK_SELECT_I
 {
+protected:
   Unique *unique;
 public:
-  QUICK_INDEX_MERGE_SELECT(THD *thd, TABLE *table);
-  ~QUICK_INDEX_MERGE_SELECT();
+  QUICK_INDEX_SORT_SELECT(THD *thd, TABLE *table);
+  ~QUICK_INDEX_SORT_SELECT();
 
   int  init();
   void need_sorted_output() { DBUG_ASSERT(0); /* Can't do it */ }
   int  reset(void);
-  int  get_next();
   bool reverse_sorted() { return false; }
   bool unique_key_range() { return false; }
-  int get_type() { return QS_TYPE_INDEX_MERGE; }
-  void add_keys_and_lengths(String *key_names, String *used_lengths);
-  void add_info_string(String *str);
   bool is_keys_used(const MY_BITMAP *fields);
 #ifndef DBUG_OFF
   void dbug_dump(int indent, bool verbose);
@@ -590,21 +610,14 @@ public:
 
   bool push_quick_back(QUICK_RANGE_SELECT *quick_sel_range);
 
-  /* range quick selects this index_merge read consists of */
+  /* range quick selects this index merge/intersect consists of */
   List<QUICK_RANGE_SELECT> quick_selects;
 
   /* quick select that uses clustered primary key (NULL if none) */
   QUICK_RANGE_SELECT* pk_quick_select;
 
-  /* true if this select is currently doing a clustered PK scan */
-  bool  doing_pk_scan;
-
   MEM_ROOT alloc;
   THD *thd;
-  int read_keys_and_merge();
-
-  bool clustered_pk_range() { return test(pk_quick_select); }
-
   virtual bool is_valid()
   {
     List_iterator_fast<QUICK_RANGE_SELECT> it(quick_selects);
@@ -620,12 +633,48 @@ public:
     }
     return valid;
   }
-
+  virtual int read_keys_and_merge()= 0;
   /* used to get rows collected in Unique */
   READ_RECORD read_record;
 };
 
 
+
+class QUICK_INDEX_MERGE_SELECT : public QUICK_INDEX_SORT_SELECT
+{
+private:
+  /* true if this select is currently doing a clustered PK scan */
+  bool  doing_pk_scan;
+protected:
+  int read_keys_and_merge(); /* overrides the pure virtual declared in the base class */
+
+public:
+  QUICK_INDEX_MERGE_SELECT(THD *thd, TABLE *table)
+    :QUICK_INDEX_SORT_SELECT(thd, table) {} /* NOTE(review): doing_pk_scan is not initialized here; confirm it is set before use */
+
+  int get_next();
+  int get_type() { return QS_TYPE_INDEX_MERGE; }
+  void add_keys_and_lengths(String *key_names, String *used_lengths);
+  void add_info_string(String *str);
+};
+
+class QUICK_INDEX_INTERSECT_SELECT : public QUICK_INDEX_SORT_SELECT
+{
+protected:
+  int read_keys_and_merge(); /* overrides the pure virtual declared in the base class */
+
+public:
+  QUICK_INDEX_INTERSECT_SELECT(THD *thd, TABLE *table)
+    :QUICK_INDEX_SORT_SELECT(thd, table) {}
+
+  key_map filtered_scans; /* NOTE(review): presumably the scans filtered out during intersection; confirm */
+  int get_next();
+  int get_type() { return QS_TYPE_INDEX_INTERSECT; }
+  void add_keys_and_lengths(String *key_names, String *used_lengths);
+  void add_info_string(String *str);
+};
+
+
 /*
   Rowid-Ordered Retrieval (ROR) index intersection quick select.
   This quick select produces intersection of row sequences returned
@@ -666,22 +715,30 @@ public:
   void dbug_dump(int indent, bool verbose);
 #endif
   int init_ror_merged_scan(bool reuse_handler);
-  bool push_quick_back(QUICK_RANGE_SELECT *quick_sel_range);
+  bool push_quick_back(MEM_ROOT *alloc, QUICK_RANGE_SELECT *quick_sel_range);
+
+  class QUICK_SELECT_WITH_RECORD : public Sql_alloc /* element type of the quick_selects list */
+  {
+  public:
+    QUICK_RANGE_SELECT *quick; /* the wrapped range scan; deleted by the destructor */
+    uchar *key_tuple; /* NOTE(review): presumably a buffer for this scan's key value; confirm */
+    ~QUICK_SELECT_WITH_RECORD() { delete quick; }
+  };
 
   /*
     Range quick selects this intersection consists of, not including
     cpk_quick.
   */
-  List<QUICK_RANGE_SELECT> quick_selects;
+  List<QUICK_SELECT_WITH_RECORD> quick_selects;
 
   virtual bool is_valid()
   {
-    List_iterator_fast<QUICK_RANGE_SELECT> it(quick_selects);
-    QUICK_RANGE_SELECT *quick;
+    List_iterator_fast<QUICK_SELECT_WITH_RECORD> it(quick_selects);
+    QUICK_SELECT_WITH_RECORD *quick;
     bool valid= true;
     while ((quick= it++))
     {
-      if (!quick->is_valid())
+      if (!quick->quick->is_valid())
       {
         valid= false;
         break;
@@ -911,6 +968,13 @@ class SQL_SELECT :public Sql_alloc {
  public:
   QUICK_SELECT_I *quick;	// If quick-select used
   COND		*cond;		// where condition
+
+  /*
+    When using Index Condition Pushdown: condition that we've had before
+    extracting and pushing index condition.
+    In other cases, NULL.
+  */
+  Item *pre_idx_push_select_cond;
   TABLE	*head;
   IO_CACHE file;		// Positions to used records
   ha_rows records;		// Records in use if read from file
diff --git a/sql/opt_range_mrr.cc b/sql/opt_range_mrr.cc
index acf22bd7a49..160b783715c 100644
--- a/sql/opt_range_mrr.cc
+++ b/sql/opt_range_mrr.cc
@@ -116,11 +116,19 @@ static void step_down_to(SEL_ARG_RANGE_SEQ *arg, SEL_ARG *key_tree)
       - max_key_part
 
   RETURN
-    0  Ok
-    1  No more ranges in the sequence
+    FALSE  Ok
+    TRUE   No more ranges in the sequence
 */
 
-uint sel_arg_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
+#if (_MSC_FULL_VER == 160030319)
+/*
+   Work around a Visual Studio 2010 RTM compiler backend bug: the function
+   enters an infinite loop.
+ */
+#pragma optimize("g", off)
+#endif
+
+bool sel_arg_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
 {
   SEL_ARG *key_tree;
   SEL_ARG_RANGE_SEQ *seq= (SEL_ARG_RANGE_SEQ*)rseq;
@@ -217,7 +225,7 @@ walk_up_n_right:
   RANGE_SEQ_ENTRY *cur= &seq->stack[seq->i];
   uint min_key_length= cur->min_key - seq->param->min_key;
   
-  range->ptr= (char*)(int)(key_tree->part);
+  range->ptr= (char*)(intptr)(key_tree->part);
   if (cur->min_key_flag & GEOM_FLAG)
   {
     range->range_flag= cur->min_key_flag;
@@ -273,6 +281,12 @@ walk_up_n_right:
   return 0;
 }
 
+#if (_MSC_FULL_VER == 160030319)
+/* VS2010 compiler bug workaround */
+#pragma optimize("g", on)
+#endif
+
+
 /****************************************************************************
   MRR Range Sequence Interface implementation that walks array<QUICK_RANGE>
  ****************************************************************************/
@@ -314,7 +328,7 @@ range_seq_t quick_range_seq_init(void *init_param, uint n_ranges, uint flags)
     1  No more ranges in the sequence
 */
 
-uint quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
+bool quick_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
 {
   QUICK_RANGE_SEQ_CTX *ctx= (QUICK_RANGE_SEQ_CTX*)rseq;
 
diff --git a/sql/opt_subselect.cc b/sql/opt_subselect.cc
index 38fc52c830d..c1fe8de51a4 100644
--- a/sql/opt_subselect.cc
+++ b/sql/opt_subselect.cc
@@ -2,7 +2,7 @@
   @file
 
   @brief
-    Subquery optimization code here.
+    Semi-join subquery optimizations code
 
 */
 
@@ -15,19 +15,178 @@
 #include "sql_test.h"
 #include <my_bit.h>
 
-// Our own:
+/*
+  This file contains optimizations for semi-join subqueries.
+  
+  Contents
+  --------
+  1. What is a semi-join subquery
+  2. General idea about semi-join execution
+  2.1 Correlated vs uncorrelated semi-joins
+  2.2 Mergeable vs non-mergeable semi-joins
+  3. Code-level view of semi-join processing
+  3.1 Conversion and pre-optimization data structures
+  3.1.1 Merged semi-join TABLE_LIST object
+  3.1.2 Non-merged semi-join data structure
+  3.2 Semi-joins and join optimization
+  3.2.1 Non-merged semi-joins and join optimization
+  3.2.2 Merged semi-joins and join optimization
+  3.3 Semi-joins and query execution
+
+  1. What is a semi-join subquery
+  -------------------------------
+  We use this definition of semi-join:
+
+    outer_tbl SEMI JOIN inner_tbl ON cond = {set of outer_tbl.row such that
+                                             exist inner_tbl.row, for which 
+                                             cond(outer_tbl.row,inner_tbl.row)
+                                             is satisfied}
+  
+  That is, semi-join operation is similar to inner join operation, with
+  exception that we don't care how many matches a row from outer_tbl has in
+  inner_tbl.
+
+  In SQL terms: a semi-join subquery is an IN subquery that is an AND-part of
+  the WHERE/ON clause.
+
+  2. General idea about semi-join execution
+  -----------------------------------------
+  We can execute semi-join in a way similar to inner join, with exception that
+  we need to somehow ensure that we do not generate record combinations that
+  differ only in rows of inner tables.
+  There are a number of different ways to achieve this property, implemented
+  by a number of semi-join execution strategies.
+  Some strategies can handle any semi-join, others can be applied only to
+  semi-joins that have certain properties that are described below:
+
+  2.1 Correlated vs uncorrelated semi-joins
+  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  Uncorrelated semi-joins are special in the respect that they allow to
+   - execute the subquery (possible as it's uncorrelated)
+   - somehow make sure that generated set does not have duplicates
+   - perform an inner join with outer tables.
+  
+  or, rephrasing in SQL form:
+
+  SELECT ... FROM ot WHERE ot.col IN (SELECT it.col FROM it WHERE uncorr_cond)
+    ->
+  SELECT ... FROM ot JOIN (SELECT DISTINCT it.col FROM it WHERE uncorr_cond)
+
+  2.2 Mergeable vs non-mergeable semi-joins
+  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  Semi-join operation has some degree of commutability with inner join
+  operation: we can join subquery's tables with outside table(s) and eliminate
+  duplicate record combinations after that:
+
+    ot1 JOIN ot2 SEMI_JOIN{it1,it2} (it1 JOIN it2) ON sjcond(ot2,it*) ->
+              |
+              +-------------------------------+
+                                              v
+    ot1 SEMI_JOIN{it1,it2} (it1 JOIN it2 JOIN ot2) ON sjcond(ot2,it*)
+ 
+  In order for this to work, subquery's top-level operation must be a join,
+  and not grouping or ordering with limit (grouping or ordering with limit
+  are not commutative with duplicate removal). In other words, the
+  conversion is possible when the subquery doesn't have a GROUP BY clause,
+  any aggregate functions*, or an ORDER BY ... LIMIT clause.
+
+  Definitions:
+  - Subquery whose top-level operation is a join is called *mergeable semi-join*
+  - All other kinds of semi-join subqueries are considered non-mergeable.
+
+  *- this requirement is actually too strong, but its exceptions are too
+  complicated to be considered here.
+
+  3. Code-level view of semi-join processing
+  ------------------------------------------
+  
+  3.1 Conversion and pre-optimization data structures
+  ---------------------------------------------------
+  * When doing JOIN::prepare for the subquery, we detect that it can be
+    converted into a semi-join and register it in parent_join->sj_subselects
+
+  * At the start of parent_join->optimize(), the predicate is converted into 
+    a semi-join node. A semi-join node is a TABLE_LIST object that is linked
+    somewhere in parent_join->join_list (either it is just present there, or
+    it is a descendant of some of its members).
+  
+  There are two kinds of semi-joins:
+  - Merged semi-joins
+  - Non-merged semi-joins
+   
+  3.1.1 Merged semi-join TABLE_LIST object
+  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  Merged semi-join object is a TABLE_LIST that contains a sub-join of 
+  subquery tables and the semi-join ON expression (in this respect it is 
+  very similar to nested outer join representation)
+  Merged semi-join represents this SQL:
+
+    ... SEMI JOIN (inner_tbl1 JOIN ... JOIN inner_tbl_n) ON sj_on_expr
+  
+  Semi-join objects of this kind have TABLE_LIST::sj_subq_pred set.
+ 
+  3.1.2 Non-merged semi-join data structure
+  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  Non-merged semi-join object is a leaf TABLE_LIST object that has a subquery
+  that produces rows. It is similar to a base table and represents this SQL:
+    
+    ... SEMI_JOIN (SELECT non_mergeable_select) ON sj_on_expr
+  
+  Subquery items that were converted into semi-joins are removed from the WHERE
+  clause. (They do remain in PS-saved WHERE clause, and they replace themselves
+  with Item_int(1) on subsequent re-executions).
+
+  3.2 Semi-joins and join optimization
+  ------------------------------------
+  
+  3.2.1 Non-merged semi-joins and join optimization
+  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  For join optimization purposes, non-merged semi-join nests are similar to
+  base tables - they've got one JOIN_TAB, which can be accessed with one of
+  two methods:
+   - full table scan (representing SJ-Materialization-Scan strategy)
+   - eq_ref-like table lookup (representing SJ-Materialization-Lookup)
+
+  Unlike regular base tables, non-merged semi-joins have:
+   - non-zero JOIN_TAB::startup_cost, and
+   - join_tab->table->is_filled_at_execution()==TRUE, which means one
+     cannot do const table detection or range analysis or other table data-
+     dependent inferences
+  // instead, get_delayed_table_estimates() runs optimization on the nest so that 
+  // we get an idea about temptable size
+  
+  3.2.2 Merged semi-joins and join optimization
+  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+   - optimize_semijoin_nests() does pre-optimization 
+   - during join optimization, the join has one JOIN_TAB (or is it POSITION?) 
+     array, and suffix-based detection is used, see advance_sj_state()
+   - after join optimization is done, get_best_combination() switches 
+     the data-structure to prefix-based, multiple JOIN_TAB ranges format.
+
+  3.3 Semi-joins and query execution
+  ----------------------------------
+  * Join executor has hooks for all semi-join strategies.
+    TODO elaborate.
+
+*/
+
+
 static
 bool subquery_types_allow_materialization(Item_in_subselect *in_subs);
 static bool replace_where_subcondition(JOIN *join, Item **expr, 
                                        Item *old_cond, Item *new_cond,
                                        bool do_fix_fields);
-static int subq_sj_candidate_cmp(Item_in_subselect* const *el1, 
-                                 Item_in_subselect* const *el2);
+static int subq_sj_candidate_cmp(Item_in_subselect* el1, Item_in_subselect* el2,
+                                 void *arg);
 static bool convert_subq_to_sj(JOIN *parent_join, Item_in_subselect *subq_pred);
+static bool convert_subq_to_jtbm(JOIN *parent_join, 
+                                 Item_in_subselect *subq_pred, bool *remove);
 static TABLE_LIST *alloc_join_nest(THD *thd);
-static 
-void fix_list_after_tbl_changes(SELECT_LEX *new_parent, List<TABLE_LIST> *tlist);
-static uint get_tmp_table_rec_length(List<Item> &items);
+static uint get_tmp_table_rec_length(Item **p_list, uint elements);
+static double get_tmp_table_lookup_cost(THD *thd, double row_count,
+                                        uint row_size);
+static double get_tmp_table_write_cost(THD *thd, double row_count,
+                                       uint row_size);
 bool find_eq_ref_candidate(TABLE *table, table_map sj_inner_tables);
 static SJ_MATERIALIZATION_INFO *
 at_sjmat_pos(const JOIN *join, table_map remaining_tables, const JOIN_TAB *tab,
@@ -45,27 +204,36 @@ static bool sj_table_is_included(JOIN *join, JOIN_TAB *join_tab);
 static Item *remove_additional_cond(Item* conds);
 static void remove_subq_pushed_predicates(JOIN *join, Item **where);
 
+enum_nested_loop_state 
+end_sj_materialize(JOIN *join, JOIN_TAB *join_tab, bool end_of_records);
+
 
 /*
   Check if we need JOIN::prepare()-phase subquery rewrites and if yes, do them
 
+  SYNOPSIS
+     check_and_do_in_subquery_rewrites()
+       join  Subquery's join
+
   DESCRIPTION
     Check if we need to do
-     - subquery->semi-join rewrite
+     - subquery -> mergeable semi-join rewrite
      - if the subquery can be handled with materialization
      - 'substitution' rewrite for table-less subqueries like "(select 1)"
-
-    and mark appropriately
+     - IN->EXISTS rewrite
+    and, depending on the rewrite, either do it, or record it to be done at a
+    later phase.
 
   RETURN
-     0  - OK
-    -1  - Some sort of query error
+    0      - OK
+    Other  - Some sort of query error
 */
 
 int check_and_do_in_subquery_rewrites(JOIN *join)
 {
   THD *thd=join->thd;
   st_select_lex *select_lex= join->select_lex;
+  st_select_lex_unit* parent_unit= select_lex->master_unit();
   DBUG_ENTER("check_and_do_in_subquery_rewrites");
   /*
     If 
@@ -84,11 +252,22 @@ int check_and_do_in_subquery_rewrites(JOIN *join)
   */
   Item_subselect *subselect;
   if (!(thd->lex->context_analysis_only & CONTEXT_ANALYSIS_ONLY_VIEW) && // (1)
-    (subselect= select_lex->master_unit()->item))      // (2)
+      (subselect= parent_unit->item))                                    // (2)
   {
     Item_in_subselect *in_subs= NULL;
-    if (subselect->substype() == Item_subselect::IN_SUBS)
-      in_subs= (Item_in_subselect*)subselect;
+    Item_allany_subselect *allany_subs= NULL;
+    switch (subselect->substype()) {
+    case Item_subselect::IN_SUBS:
+      in_subs= (Item_in_subselect *)subselect;
+      break;
+    case Item_subselect::ALL_SUBS:
+    case Item_subselect::ANY_SUBS:
+      allany_subs= (Item_allany_subselect *)subselect;
+      break;
+    default:
+      break;
+    }
+
 
     /* Resolve expressions and perform semantic analysis for IN query */
     if (in_subs != NULL)
@@ -128,6 +307,15 @@ int check_and_do_in_subquery_rewrites(JOIN *join)
       if (failure)
         DBUG_RETURN(-1); /* purecov: deadcode */
     }
+    if (select_lex == parent_unit->fake_select_lex)
+    {
+      /*
+        The join and its select_lex object represent the 'fake' select used
+        to compute the result of a UNION.
+      */
+      DBUG_RETURN(0);
+    }
+
     DBUG_PRINT("info", ("Checking if subq can be converted to semi-join"));
     /*
       Check if we're in subquery that is a candidate for flattening into a
@@ -153,9 +341,9 @@ int check_and_do_in_subquery_rewrites(JOIN *join)
         !join->having && !select_lex->with_sum_func &&                // 4
         thd->thd_marker.emb_on_expr_nest &&                           // 5
         select_lex->outer_select()->join &&                           // 6
-        select_lex->master_unit()->first_select()->leaf_tables &&     // 7
-        in_subs->exec_method == Item_in_subselect::NOT_TRANSFORMED && // 8
-        select_lex->outer_select()->leaf_tables &&                    // 9
+        parent_unit->first_select()->leaf_tables.elements &&          // 7
+        !in_subs->in_strategy &&                                      // 8
+        select_lex->outer_select()->leaf_tables.elements &&           // 9
         !((join->select_options |                                     // 10
            select_lex->outer_select()->join->select_options)          // 10
           & SELECT_STRAIGHT_JOIN))                                    // 10
@@ -165,72 +353,127 @@ int check_and_do_in_subquery_rewrites(JOIN *join)
       (void)subquery_types_allow_materialization(in_subs);
 
       in_subs->emb_on_expr_nest= thd->thd_marker.emb_on_expr_nest;
+      in_subs->is_flattenable_semijoin= TRUE;
 
       /* Register the subquery for further processing in flatten_subqueries() */
-      select_lex->
-        outer_select()->join->sj_subselects.append(thd->mem_root, in_subs);
-      in_subs->expr_join_nest= thd->thd_marker.emb_on_expr_nest;
+      if (!in_subs->is_registered_semijoin)
+      {
+        Query_arena *arena, backup;
+        arena= thd->activate_stmt_arena_if_needed(&backup);
+        select_lex->outer_select()->sj_subselects.push_back(in_subs);
+        if (arena)
+          thd->restore_active_arena(arena, &backup);
+        in_subs->is_registered_semijoin= TRUE;
+      }
     }
     else
     {
-      DBUG_PRINT("info", ("Subquery can't be converted to semi-join"));
+      DBUG_PRINT("info", ("Subquery can't be converted to merged semi-join"));
+      /* Test if the user has set a legal combination of optimizer switches. */
+      if (!optimizer_flag(thd, OPTIMIZER_SWITCH_IN_TO_EXISTS) &&
+          !optimizer_flag(thd, OPTIMIZER_SWITCH_MATERIALIZATION))
+        my_error(ER_ILLEGAL_SUBQUERY_OPTIMIZER_SWITCHES, MYF(0));
+
       /*
-        Check if the subquery predicate can be executed via materialization.
-        The required conditions are:
-        1. Subquery predicate is an IN/=ANY subq predicate
-        2. Subquery is a single SELECT (not a UNION)
-        3. Subquery is not a table-less query. In this case there is no
-           point in materializing.
-          3A The upper query is not a table-less SELECT ... FROM DUAL. We
+        If the subquery predicate is IN/=ANY, analyse and set all possible
+        subquery execution strategies based on optimizer switches and syntactic
+        properties.
+      */
+      if (in_subs)
+      {
+        /*
+          Check if the subquery predicate can be executed via materialization.
+          The required conditions are:
+          0. The materialization optimizer switch was set.
+          1. Subquery is a single SELECT (not a UNION).
+             TODO: this is a limitation that can be fixed
+          2. Subquery is not a table-less query. In this case there is no
+             point in materializing.
+          2A The upper query is not a table-less SELECT ... FROM DUAL. We
              can't do materialization for SELECT .. FROM DUAL because it
              does not call setup_subquery_materialization(). We could make 
              SELECT ... FROM DUAL call that function but that doesn't seem
              to be the case that is worth handling.
-        4. Either the subquery predicate is a top-level predicate, or at
-           least one partial match strategy is enabled. If no partial match
-           strategy is enabled, then materialization cannot be used for
-           non-top-level queries because it cannot handle NULLs correctly.
-        5. Subquery is non-correlated
-           TODO:
-           This is an overly restrictive condition. It can be extended to:
-           (Subquery is non-correlated ||
-            Subquery is correlated to any query outer to IN predicate ||
-            (Subquery is correlated to the immediate outer query &&
-             Subquery !contains {GROUP BY, ORDER BY [LIMIT],
-             aggregate functions}) && subquery predicate is not under "NOT IN"))
-        6. No execution method was already chosen (by a prepared statement).
-
-        (*) The subquery must be part of a SELECT statement. The current
-             condition also excludes multi-table update statements.
-
-        Determine whether we will perform subquery materialization before
-        calling the IN=>EXISTS transformation, so that we know whether to
-        perform the whole transformation or only that part of it which wraps
-        Item_in_subselect in an Item_in_optimizer.
-      */
-      if (optimizer_flag(thd, OPTIMIZER_SWITCH_MATERIALIZATION)  && 
-          in_subs  &&                                                   // 1
-          !select_lex->is_part_of_union() &&                            // 2
-          select_lex->master_unit()->first_select()->leaf_tables &&     // 3
-          thd->lex->sql_command == SQLCOM_SELECT &&                     // *
-          select_lex->outer_select()->leaf_tables &&                    // 3A
-          subquery_types_allow_materialization(in_subs) &&
-          // psergey-todo: duplicated_subselect_card_check: where it's done?
-          (in_subs->is_top_level_item() ||
-           optimizer_flag(thd, OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE) ||
-           optimizer_flag(thd, OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN)) &&//4
-          !in_subs->is_correlated &&                                  // 5
-          in_subs->exec_method == Item_in_subselect::NOT_TRANSFORMED) // 6
-      {
-          in_subs->exec_method= Item_in_subselect::MATERIALIZATION;
-      }
+          3. Either the subquery predicate is a top-level predicate, or at
+             least one partial match strategy is enabled. If no partial match
+             strategy is enabled, then materialization cannot be used for
+             non-top-level queries because it cannot handle NULLs correctly.
+          4. Subquery is non-correlated
+             TODO:
+             This condition is too restrictive (limitation). It can be extended to:
+             (Subquery is non-correlated ||
+              Subquery is correlated to any query outer to IN predicate ||
+              (Subquery is correlated to the immediate outer query &&
+               Subquery !contains {GROUP BY, ORDER BY [LIMIT],
+               aggregate functions}) && subquery predicate is not under "NOT IN"))
+
+          (*) The subquery must be part of a SELECT statement. The current
+               condition also excludes multi-table update statements.
+        A note about prepared statements: we want the if-branch to be taken on
+        PREPARE and each EXECUTE. The rewrites are only done once, but we need 
+        select_lex->sj_subselects list to be populated for every EXECUTE. 
 
-      Item_subselect::trans_res trans_res;
-      if ((trans_res= subselect->select_transformer(join)) !=
-          Item_subselect::RES_OK)
-      {
-        DBUG_RETURN((trans_res == Item_subselect::RES_ERROR));
+        */
+        if (optimizer_flag(thd, OPTIMIZER_SWITCH_MATERIALIZATION) &&      // 0
+            !select_lex->is_part_of_union() &&                            // 1
+            parent_unit->first_select()->leaf_tables.elements &&          // 2
+            thd->lex->sql_command == SQLCOM_SELECT &&                     // *
+            select_lex->outer_select()->leaf_tables.elements &&           // 2A
+            subquery_types_allow_materialization(in_subs) &&
+            (in_subs->is_top_level_item() ||                               //3
+             optimizer_flag(thd,
+                            OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE) || //3
+             optimizer_flag(thd,
+                            OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN)) && //3
+            !in_subs->is_correlated)                                       //4
+       {
+          in_subs->in_strategy|= SUBS_MATERIALIZATION;
+
+          /*
+            If the subquery is an AND-part of WHERE, register it to be
+            processed with the jtbm strategy
+          */
+          if (thd->thd_marker.emb_on_expr_nest == NO_JOIN_NEST &&
+              optimizer_flag(thd, OPTIMIZER_SWITCH_SEMIJOIN))
+          {
+            in_subs->emb_on_expr_nest= thd->thd_marker.emb_on_expr_nest;
+            in_subs->is_flattenable_semijoin= FALSE;
+            if (!in_subs->is_registered_semijoin)
+	    {
+              Query_arena *arena, backup;
+              arena= thd->activate_stmt_arena_if_needed(&backup);
+              select_lex->outer_select()->sj_subselects.push_back(in_subs);
+              if (arena)
+                thd->restore_active_arena(arena, &backup);
+              in_subs->is_registered_semijoin= TRUE;
+            }
+          }
+        }
+
+        /*
+          IN-TO-EXISTS is the only universal strategy. Choose it if the user
+          allowed it via an optimizer switch, or if materialization is not
+          possible.
+        */
+        if (optimizer_flag(thd, OPTIMIZER_SWITCH_IN_TO_EXISTS) ||
+            !in_subs->in_strategy)
+        {
+          in_subs->in_strategy|= SUBS_IN_TO_EXISTS;
+        }
       }
+
+      /* Check if max/min optimization is applicable */
+      if (allany_subs)
+        allany_subs->in_strategy|= (allany_subs->is_maxmin_applicable(join) ?
+                                    (SUBS_MAXMIN_INJECTED | SUBS_MAXMIN_ENGINE) :
+                                    SUBS_IN_TO_EXISTS);
+
+      /*
+        Transform each subquery predicate according to its overloaded
+        transformer.
+      */
+      if (subselect->select_transformer(join))
+        DBUG_RETURN(-1);
     }
   }
   DBUG_RETURN(0);
@@ -307,29 +550,27 @@ bool subquery_types_allow_materialization(Item_in_subselect *in_subs)
     Item *inner= it++;
     all_are_fields &= (outer->real_item()->type() == Item::FIELD_ITEM && 
                        inner->real_item()->type() == Item::FIELD_ITEM);
-    if (outer->result_type() != inner->result_type())
+    if (outer->cmp_type() != inner->cmp_type())
       DBUG_RETURN(FALSE);
-    switch (outer->result_type()) {
+    switch (outer->cmp_type()) {
     case STRING_RESULT:
-      if (outer->is_datetime() != inner->is_datetime())
+      if (!(outer->collation.collation == inner->collation.collation))
         DBUG_RETURN(FALSE);
-
-      if (!(outer->collation.collation == inner->collation.collation 
-          /*&& outer->max_length <= inner->max_length */))
+      // Materialization does not work with BLOB columns
+      if (inner->field_type() == MYSQL_TYPE_BLOB || 
+          inner->field_type() == MYSQL_TYPE_GEOMETRY)
+        DBUG_RETURN(FALSE);
+      break;
+    case TIME_RESULT:            /* outer and inner must have the same time type */
+      if (mysql_type_to_time_type(outer->field_type()) !=
+          mysql_type_to_time_type(inner->field_type()))
         DBUG_RETURN(FALSE);
-    /*case INT_RESULT:
-      if (!(outer->unsigned_flag ^ inner->unsigned_flag))
-        DBUG_RETURN(FALSE); */
     default:
-      ;/* suitable for materialization */
+      /* suitable for materialization */
+      break;
     }
-
-    // Materialization does not work with BLOB columns
-    if (inner->field_type() == MYSQL_TYPE_BLOB || 
-	inner->field_type() == MYSQL_TYPE_GEOMETRY)
-        DBUG_RETURN(FALSE);
   }
-    
+
   in_subs->types_allow_materialization= TRUE;
   in_subs->sjm_scan_allowed= all_are_fields;
   DBUG_PRINT("info",("subquery_types_allow_materialization: ok, allowed"));
@@ -337,6 +578,122 @@ bool subquery_types_allow_materialization(Item_in_subselect *in_subs)
 }
 
 
+/**
+  Run the MAX/MIN rewrite when this JOIN's unit belongs to an ALL/ANY subquery.
+  @return result of transform_into_max_min(), or 0 if there is no such subquery
+*/
+
+bool JOIN::transform_max_min_subquery()
+{
+  DBUG_ENTER("JOIN::transform_max_min_subquery");
+  Item_subselect *subq= unit->item;
+  if (subq && (subq->substype() == Item_subselect::ALL_SUBS ||
+               subq->substype() == Item_subselect::ANY_SUBS))
+    DBUG_RETURN(((Item_allany_subselect *) subq)->transform_into_max_min(this));
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Finalize IN->EXISTS conversion in case we couldn't use materialization.
+
+  DESCRIPTION
+    Invoke the IN->EXISTS converter, then put its result (item->substitution)
+    in place of item->optimizer in the WHERE or in the nest's ON expression.
+  RETURN
+    FALSE - Ok
+    TRUE  - Fatal error
+*/
+
+bool make_in_exists_conversion(THD *thd, JOIN *join, Item_in_subselect *item)
+{
+  DBUG_ENTER("make_in_exists_conversion");
+  JOIN *child_join= item->unit->first_select()->join;
+  bool res;
+
+  /*
+    We're going to finalize IN->EXISTS conversion.
+    Normally, IN->EXISTS conversion takes place inside the
+    Item_subselect::fix_fields() call, where item_subselect->fixed==FALSE (as
+    fix_fields() hasn't finished yet) and item_subselect->changed==FALSE (as
+    the conversion hasn't been finalized).
+
+    At the end of Item_subselect::fix_fields() we had to set fixed=TRUE,
+    changed=TRUE (the only other option would have been to return error).
+
+    So, now we have to set these back for the duration of select_transformer()
+    call.
+  */
+  item->changed= 0;
+  item->fixed= 0;
+  /* Run the transformer with the subquery's first select as current_select */
+  SELECT_LEX *save_select_lex= thd->lex->current_select;
+  thd->lex->current_select= item->unit->first_select();
+
+  res= item->select_transformer(child_join);
+
+  thd->lex->current_select= save_select_lex;
+
+  if (res)
+    DBUG_RETURN(TRUE);
+
+  item->changed= 1;
+  item->fixed= 1;
+
+  Item *substitute= item->substitution;
+  bool do_fix_fields= !item->substitution->fixed;
+  /*
+    The Item_subselect has already been wrapped with Item_in_optimizer, so we
+    should search for item->optimizer, not 'item'.
+  */
+  Item *replace_me= item->optimizer;
+  DBUG_ASSERT(replace_me==substitute);
+
+  Item **tree= (item->emb_on_expr_nest == NO_JOIN_NEST)?
+                 &join->conds : &(item->emb_on_expr_nest->on_expr);
+  if (replace_where_subcondition(join, tree, replace_me, substitute, 
+                                 do_fix_fields))
+    DBUG_RETURN(TRUE);
+  item->substitution= NULL;
+
+  /*
+    If this is a prepared statement, repeat the above operation for
+    prep_where (or prep_on_expr).
+  */
+  if (!thd->stmt_arena->is_conventional())
+  {
+    tree= (item->emb_on_expr_nest == (TABLE_LIST*)NO_JOIN_NEST)?
+           &join->select_lex->prep_where : 
+           &(item->emb_on_expr_nest->prep_on_expr);
+
+    if (replace_where_subcondition(join, tree, replace_me, substitute, 
+                                   FALSE))
+      DBUG_RETURN(TRUE);
+  }
+  DBUG_RETURN(FALSE);
+}
+
+
+/*
+  Return TRUE if any element of join_list, at any nesting depth, has its
+  TABLE_LIST::outer_join member set; FALSE otherwise.
+*/
+bool check_for_outer_joins(List<TABLE_LIST> *join_list)
+{
+  List_iterator<TABLE_LIST> it(*join_list);
+  TABLE_LIST *tbl;
+  while ((tbl= it++))
+  {
+    NESTED_JOIN *nest= tbl->nested_join;
+    if (nest && check_for_outer_joins(&nest->join_list))
+      return TRUE;
+    if (tbl->outer_join)
+      return TRUE;
+  }
+  return FALSE;
+}
+
+
 /*
   Convert semi-join subquery predicates into semi-join join nests
 
@@ -387,23 +744,33 @@ bool subquery_types_allow_materialization(Item_in_subselect *in_subs)
 bool convert_join_subqueries_to_semijoins(JOIN *join)
 {
   Query_arena *arena, backup;
-  Item_in_subselect **in_subq;
-  Item_in_subselect **in_subq_end;
+  Item_in_subselect *in_subq;
   THD *thd= join->thd;
+  List_iterator<TABLE_LIST> ti(join->select_lex->leaf_tables);
   DBUG_ENTER("convert_join_subqueries_to_semijoins");
 
-  if (join->sj_subselects.elements() == 0)
+  if (join->select_lex->sj_subselects.is_empty())
     DBUG_RETURN(FALSE);
 
+  List_iterator_fast<Item_in_subselect> li(join->select_lex->sj_subselects);
+
+  while ((in_subq= li++))
+  {
+    SELECT_LEX *subq_sel= in_subq->get_select_lex();
+    if (subq_sel->handle_derived(thd->lex, DT_OPTIMIZE))
+      DBUG_RETURN(1);
+    if (subq_sel->handle_derived(thd->lex, DT_MERGE))
+      DBUG_RETURN(TRUE);
+    subq_sel->update_used_tables();
+  }
+
+  li.rewind();
   /* First, convert child join's subqueries. We proceed bottom-up here */
-  for (in_subq= join->sj_subselects.front(), 
-       in_subq_end= join->sj_subselects.back(); 
-       in_subq != in_subq_end; 
-       in_subq++)
+  while ((in_subq= li++)) 
   {
-    st_select_lex *child_select= (*in_subq)->get_select_lex();
+    st_select_lex *child_select= in_subq->get_select_lex();
     JOIN *child_join= child_select->join;
-    child_join->outer_tables = child_join->tables;
+    child_join->outer_tables = child_join->table_count;
 
     /*
       child_select->where contains only the WHERE predicate of the
@@ -415,24 +782,21 @@ bool convert_join_subqueries_to_semijoins(JOIN *join)
 
     if (convert_join_subqueries_to_semijoins(child_join))
       DBUG_RETURN(TRUE);
-    (*in_subq)->sj_convert_priority= 
-      (*in_subq)->is_correlated * MAX_TABLES + child_join->outer_tables;
+    in_subq->sj_convert_priority= 
+      test(in_subq->emb_on_expr_nest != NO_JOIN_NEST) * MAX_TABLES * 2 +
+      in_subq->is_correlated * MAX_TABLES + child_join->outer_tables;
   }
   
   // Temporary measure: disable semi-joins when they are together with outer
   // joins.
-  for (TABLE_LIST *tbl= join->select_lex->leaf_tables; tbl; tbl=tbl->next_leaf)
+#if 0  
+  if (check_for_outer_joins(join->join_list))
   {
-    TABLE_LIST *embedding= tbl->embedding;
-    if (tbl->on_expr || (tbl->embedding && !(embedding->sj_on_expr && 
-                                            !embedding->embedding)))
-    {
-      in_subq= join->sj_subselects.front();
-      arena= thd->activate_stmt_arena_if_needed(&backup);
-      goto skip_conversion;
-    }
+    in_subq= join->select_lex->sj_subselects.head();
+    arena= thd->activate_stmt_arena_if_needed(&backup);
+    goto skip_conversion;
   }
-
+#endif
   //dump_TABLE_LIST_struct(select_lex, select_lex->leaf_tables);
   /* 
     2. Pick which subqueries to convert:
@@ -440,82 +804,165 @@ bool convert_join_subqueries_to_semijoins(JOIN *join)
       - prefer correlated subqueries over uncorrelated;
       - prefer subqueries that have greater number of outer tables;
   */
-  join->sj_subselects.sort(subq_sj_candidate_cmp);
+  bubble_sort<Item_in_subselect>(&join->select_lex->sj_subselects,
+				 subq_sj_candidate_cmp, NULL);
   // #tables-in-parent-query + #tables-in-subquery < MAX_TABLES
   /* Replace all subqueries to be flattened with Item_int(1) */
   arena= thd->activate_stmt_arena_if_needed(&backup);
-  for (in_subq= join->sj_subselects.front(); 
-       in_subq != in_subq_end && 
-       join->tables + (*in_subq)->unit->first_select()->join->tables < MAX_TABLES;
-       in_subq++)
-  {
-    Item **tree= ((*in_subq)->emb_on_expr_nest == (TABLE_LIST*)1)?
-                   &join->conds : &((*in_subq)->emb_on_expr_nest->on_expr);
-    if (replace_where_subcondition(join, tree, *in_subq, new Item_int(1),
-                                   FALSE))
-      DBUG_RETURN(TRUE); /* purecov: inspected */
-  }
  
-  for (in_subq= join->sj_subselects.front(); 
-       in_subq != in_subq_end && 
-       join->tables + (*in_subq)->unit->first_select()->join->tables < MAX_TABLES;
-       in_subq++)
+  li.rewind();
+  while ((in_subq= li++))
   {
-    if (convert_subq_to_sj(join, *in_subq))
-      DBUG_RETURN(TRUE);
+    bool remove_item= TRUE;
+
+    /* Stop processing if we've reached a subquery that's attached to the ON clause */
+    if (in_subq->emb_on_expr_nest != NO_JOIN_NEST)
+      break;
+
+    if (in_subq->is_flattenable_semijoin) 
+    {
+      if (join->table_count + 
+          in_subq->unit->first_select()->join->table_count >= MAX_TABLES)
+        break;
+      if (convert_subq_to_sj(join, in_subq))
+        DBUG_RETURN(TRUE);
+    }
+    else
+    {
+      if (join->table_count + 1 >= MAX_TABLES)
+        break;
+      if (convert_subq_to_jtbm(join, in_subq, &remove_item))
+        DBUG_RETURN(TRUE);
+    }
+    if (remove_item)
+    {
+      Item **tree= (in_subq->emb_on_expr_nest == NO_JOIN_NEST)?
+                     &join->conds : &(in_subq->emb_on_expr_nest->on_expr);
+      Item *replace_me= in_subq->original_item();
+      if (replace_where_subcondition(join, tree, replace_me, new Item_int(1),
+                                     FALSE))
+        DBUG_RETURN(TRUE); /* purecov: inspected */
+    }
   }
-skip_conversion:
+//skip_conversion:
   /* 
     3. Finalize (perform IN->EXISTS rewrite) the subqueries that we didn't
     convert:
   */
-  for (; in_subq!= in_subq_end; in_subq++)
+  while (in_subq)
   {
-    JOIN *child_join= (*in_subq)->unit->first_select()->join;
-    Item_subselect::trans_res res;
-    (*in_subq)->changed= 0;
-    (*in_subq)->fixed= 0;
+    JOIN *child_join= in_subq->unit->first_select()->join;
+    in_subq->changed= 0;
+    in_subq->fixed= 0;
 
     SELECT_LEX *save_select_lex= thd->lex->current_select;
-    thd->lex->current_select= (*in_subq)->unit->first_select();
+    thd->lex->current_select= in_subq->unit->first_select();
 
-    res= (*in_subq)->select_transformer(child_join);
+    bool res= in_subq->select_transformer(child_join);
 
     thd->lex->current_select= save_select_lex;
 
-    if (res == Item_subselect::RES_ERROR)
+    if (res)
       DBUG_RETURN(TRUE);
 
-    (*in_subq)->changed= 1;
-    (*in_subq)->fixed= 1;
+    in_subq->changed= 1;
+    in_subq->fixed= 1;
 
-    Item *substitute= (*in_subq)->substitution;
-    bool do_fix_fields= !(*in_subq)->substitution->fixed;
-    Item **tree= ((*in_subq)->emb_on_expr_nest == (TABLE_LIST*)1)?
-                   &join->conds : &((*in_subq)->emb_on_expr_nest->on_expr);
-    if (replace_where_subcondition(join, tree, *in_subq, substitute, 
+    Item *substitute= in_subq->substitution;
+    bool do_fix_fields= !in_subq->substitution->fixed;
+    Item **tree= (in_subq->emb_on_expr_nest == NO_JOIN_NEST)?
+                   &join->conds : &(in_subq->emb_on_expr_nest->on_expr);
+    Item *replace_me= in_subq->original_item();
+    if (replace_where_subcondition(join, tree, replace_me, substitute, 
                                    do_fix_fields))
       DBUG_RETURN(TRUE);
-    (*in_subq)->substitution= NULL;
-     
+    in_subq->substitution= NULL;
+#if 0
+    /* 
+      Don't do the following, because the simplify_join() call is after this
+      call, and that call will save to prep_where/prep_on_expr.
+    */
+
+    /*
+      If this is a prepared statement, repeat the above operation for
+      prep_where (or prep_on_expr). Subquery-to-semijoin conversion is 
+      done once for prepared statement.
+    */
     if (!thd->stmt_arena->is_conventional())
     {
-      tree= ((*in_subq)->emb_on_expr_nest == (TABLE_LIST*)1)?
+      tree= (in_subq->emb_on_expr_nest == NO_JOIN_NEST)?
              &join->select_lex->prep_where : 
-             &((*in_subq)->emb_on_expr_nest->prep_on_expr);
+             &(in_subq->emb_on_expr_nest->prep_on_expr);
 
-      if (replace_where_subcondition(join, tree, *in_subq, substitute, 
+      if (replace_where_subcondition(join, tree, replace_me, substitute, 
                                      FALSE))
         DBUG_RETURN(TRUE);
     }
+#endif
+    /*
+      Revert to the IN->EXISTS strategy in the rare case when the subquery could
+      not be flattened.
+      TODO: This is a limitation done for simplicity. Such subqueries could also
+      be executed via materialization. In order to determine this, we should
+      re-run the test for materialization that was done in
+      check_and_do_in_subquery_rewrites.
+    */
+    in_subq->in_strategy= SUBS_IN_TO_EXISTS;
+    in_subq= li++;
   }
 
   if (arena)
     thd->restore_active_arena(arena, &backup);
-  join->sj_subselects.clear();
+  join->select_lex->sj_subselects.empty();
   DBUG_RETURN(FALSE);
 }
 
+
+/*
+  Get #output_rows and scan_time estimates for a "delayed" table.
+
+  SYNOPSIS
+    get_delayed_table_estimates()
+      table         IN    Table to get estimates for
+      out_rows      OUT   E(#rows in the table)
+      scan_time     OUT   E(scan_time).
+      startup_cost  OUT   cost to populate the table.
+
+  DESCRIPTION
+    Get #output_rows and scan_time estimates for a "delayed" table. By
+    "delayed" here we mean that the table is filled at the start of query
+    execution. This means that the optimizer can't use table statistics to 
+    get #rows estimate for it, it has to call this function instead.
+
+    This function is expected to take different actions depending on the nature
+    of the table. At the moment there is only one kind of delayed table:
+    non-flattenable semi-joins.
+*/
+
+void get_delayed_table_estimates(TABLE *table,
+                                 ha_rows *out_rows,
+                                 double *scan_time,
+                                 double *startup_cost)
+{
+  Item_in_subselect *jtbm_pred= table->pos_in_table_list->jtbm_subselect;
+  DBUG_ASSERT(jtbm_pred->engine->engine_type() ==
+              subselect_engine::HASH_SJ_ENGINE);
+  subselect_hash_sj_engine *sj_engine=
+    (subselect_hash_sj_engine*) jtbm_pred->engine;
+  /* Row count and startup cost are the values kept in the jtbm_* members */
+  *out_rows= (ha_rows) jtbm_pred->jtbm_record_count;
+  *startup_cost= jtbm_pred->jtbm_read_time;
+
+  /*
+    Cost of scanning the temptable: its data size divided by IO_SIZE, plus 2
+    (do like in handler::read_time).
+  */
+  double tmp_data_size= jtbm_pred->jtbm_record_count *
+                        sj_engine->tmp_table->s->reclength;
+  *scan_time= tmp_data_size/IO_SIZE + 2;
+}
+
+
 /**
    @brief Replaces an expression destructively inside the expression tree of
    the WHERE clase.
@@ -533,6 +980,7 @@ skip_conversion:
    @return <code>true</code> if there was an error, <code>false</code> if
    successful.
 */
+
 static bool replace_where_subcondition(JOIN *join, Item **expr, 
                                        Item *old_cond, Item *new_cond,
                                        bool do_fix_fields)
@@ -566,11 +1014,11 @@ static bool replace_where_subcondition(JOIN *join, Item **expr,
   return TRUE;
 }
 
-static int subq_sj_candidate_cmp(Item_in_subselect* const *el1, 
-                                 Item_in_subselect* const *el2)
+static int subq_sj_candidate_cmp(Item_in_subselect* el1, Item_in_subselect* el2,
+                                 void *arg)
 {
-  return ((*el1)->sj_convert_priority < (*el2)->sj_convert_priority) ? 1 : 
-         ( ((*el1)->sj_convert_priority == (*el2)->sj_convert_priority)? 0 : -1);
+  return (el1->sj_convert_priority > el2->sj_convert_priority) ? 1 : 
+         ( (el1->sj_convert_priority == el2->sj_convert_priority)? 0 : -1);
 }
 
 
@@ -614,9 +1062,9 @@ static bool convert_subq_to_sj(JOIN *parent_join, Item_in_subselect *subq_pred)
     1. Find out where to put the predicate into.
      Note: for "t1 LEFT JOIN t2" this will be t2, a leaf.
   */
-  if ((void*)subq_pred->expr_join_nest != (void*)1)
+  if ((void*)subq_pred->emb_on_expr_nest != (void*)NO_JOIN_NEST)
   {
-    if (subq_pred->expr_join_nest->nested_join)
+    if (subq_pred->emb_on_expr_nest->nested_join)
     {
       /*
         We're dealing with
@@ -625,10 +1073,10 @@ static bool convert_subq_to_sj(JOIN *parent_join, Item_in_subselect *subq_pred)
 
         The sj-nest will be inserted into the brackets nest.
       */
-      emb_tbl_nest=  subq_pred->expr_join_nest;
+      emb_tbl_nest=  subq_pred->emb_on_expr_nest;
       emb_join_list= &emb_tbl_nest->nested_join->join_list;
     }
-    else if (!subq_pred->expr_join_nest->outer_join)
+    else if (!subq_pred->emb_on_expr_nest->outer_join)
     {
       /*
         We're dealing with
@@ -638,13 +1086,13 @@ static bool convert_subq_to_sj(JOIN *parent_join, Item_in_subselect *subq_pred)
         The sj-nest will be tblX's "sibling", i.e. another child of its
         parent. This is ok because tblX is joined as an inner join.
       */
-      emb_tbl_nest= subq_pred->expr_join_nest->embedding;
+      emb_tbl_nest= subq_pred->emb_on_expr_nest->embedding;
       if (emb_tbl_nest)
         emb_join_list= &emb_tbl_nest->nested_join->join_list;
     }
-    else if (!subq_pred->expr_join_nest->nested_join)
+    else if (!subq_pred->emb_on_expr_nest->nested_join)
     {
-      TABLE_LIST *outer_tbl= subq_pred->expr_join_nest;      
+      TABLE_LIST *outer_tbl= subq_pred->emb_on_expr_nest;
       TABLE_LIST *wrap_nest;
       /*
         We're dealing with
@@ -736,7 +1184,7 @@ static bool convert_subq_to_sj(JOIN *parent_join, Item_in_subselect *subq_pred)
   st_select_lex *subq_lex= subq_pred->unit->first_select();
   nested_join->join_list.empty();
   List_iterator_fast<TABLE_LIST> li(subq_lex->top_join_list);
-  TABLE_LIST *tl, *last_leaf;
+  TABLE_LIST *tl;
   while ((tl= li++))
   {
     tl->embedding= sj_nest;
@@ -751,42 +1199,44 @@ static bool convert_subq_to_sj(JOIN *parent_join, Item_in_subselect *subq_pred)
     NOTE: We actually insert them at the front! That's because the order is
           reversed in this list.
   */
-  for (tl= parent_lex->leaf_tables; tl->next_leaf; tl= tl->next_leaf) ;
-  tl->next_leaf= subq_lex->leaf_tables;
-  last_leaf= tl;
+  parent_lex->leaf_tables.concat(&subq_lex->leaf_tables);
 
   /*
     Same as above for next_local chain
     (a theory: a next_local chain always starts with ::leaf_tables
      because view's tables are inserted after the view)
   */
-  for (tl= parent_lex->leaf_tables; tl->next_local; tl= tl->next_local) ;
-  tl->next_local= subq_lex->leaf_tables;
+  for (tl= parent_lex->leaf_tables.head(); tl->next_local; tl= tl->next_local) ;
+  tl->next_local= subq_lex->leaf_tables.head();
 
   /* A theory: no need to re-connect the next_global chain */
 
   /* 3. Remove the original subquery predicate from the WHERE/ON */
 
   // The subqueries were replaced for Item_int(1) earlier
-  subq_pred->exec_method=
-    Item_in_subselect::SEMI_JOIN;         // for subsequent executions
+  subq_pred->in_strategy= SUBS_SEMI_JOIN;         // for subsequent executions
   /*TODO: also reset the 'with_subselect' there. */
 
-  /* n. Adjust the parent_join->tables counter */
-  uint table_no= parent_join->tables;
+  /* n. Adjust the parent_join->table_count counter */
+  uint table_no= parent_join->table_count;
   /* n. Walk through child's tables and adjust table->map */
-  for (tl= subq_lex->leaf_tables; tl; tl= tl->next_leaf, table_no++)
+  List_iterator_fast<TABLE_LIST> si(subq_lex->leaf_tables);
+  while ((tl= si++))
   {
     tl->table->tablenr= table_no;
     tl->table->map= ((table_map)1) << table_no;
+    if (tl->is_jtbm())
+      tl->jtbm_table_no= tl->table->tablenr;
     SELECT_LEX *old_sl= tl->select_lex;
     tl->select_lex= parent_join->select_lex; 
     for (TABLE_LIST *emb= tl->embedding;
          emb && emb->select_lex == old_sl;
          emb= emb->embedding)
       emb->select_lex= parent_join->select_lex;
+    table_no++;
   }
-  parent_join->tables += subq_lex->join->tables;
+  parent_join->table_count += subq_lex->join->table_count;
+  //parent_join->table_count += subq_lex->leaf_tables.elements;
 
   /* 
     Put the subquery's WHERE into semi-join's sj_on_expr
@@ -844,7 +1294,8 @@ static bool convert_subq_to_sj(JOIN *parent_join, Item_in_subselect *subq_pred)
     }
   }
   /* Fix the created equality and AND */
-  sj_nest->sj_on_expr->fix_fields(parent_join->thd, &sj_nest->sj_on_expr);
+  if (!sj_nest->sj_on_expr->fixed)
+    sj_nest->sj_on_expr->fix_fields(parent_join->thd, &sj_nest->sj_on_expr);
 
   /*
     Walk through sj nest's WHERE and ON expressions and call
@@ -865,13 +1316,25 @@ static bool convert_subq_to_sj(JOIN *parent_join, Item_in_subselect *subq_pred)
   {
     emb_tbl_nest->on_expr= and_items(emb_tbl_nest->on_expr, 
                                      sj_nest->sj_on_expr);
-    emb_tbl_nest->on_expr->fix_fields(parent_join->thd, &emb_tbl_nest->on_expr);
+    emb_tbl_nest->on_expr->top_level_item();
+    if (!emb_tbl_nest->on_expr->fixed)
+      emb_tbl_nest->on_expr->fix_fields(parent_join->thd,
+                                        &emb_tbl_nest->on_expr);
   }
   else
   {
     /* Inject into the WHERE */
     parent_join->conds= and_items(parent_join->conds, sj_nest->sj_on_expr);
-    parent_join->conds->fix_fields(parent_join->thd, &parent_join->conds);
+    parent_join->conds->top_level_item();
+    /*
+      fix_fields must update the properties (e.g. st_select_lex::cond_count) of
+      the correct select_lex.
+    */
+    save_lex= thd->lex->current_select;
+    thd->lex->current_select=parent_join->select_lex;
+    if (!parent_join->conds->fixed)
+      parent_join->conds->fix_fields(parent_join->thd, &parent_join->conds);
+    thd->lex->current_select=save_lex;
     parent_join->select_lex->where= parent_join->conds;
   }
 
@@ -886,6 +1349,138 @@ static bool convert_subq_to_sj(JOIN *parent_join, Item_in_subselect *subq_pred)
   DBUG_RETURN(FALSE);
 }
 
+
+const int SUBQERY_TEMPTABLE_NAME_MAX_LEN= 20;
+
+/* Store the alias "<subqueryN>" (N= number) in the caller's buffer 'to' */
+static void create_subquery_temptable_name(char *to, uint number)
+{
+  DBUG_ASSERT(number < 10000);
+  to= int10_to_str((int) number, strmov(to, "<subquery"), 10);
+  *to++= '>';
+  *to= 0;
+}
+
+
+/*
+  Convert subquery predicate into non-mergeable semi-join nest.
+
+  *remove_item is set to FALSE when subq_pred's engine is not HASH_SJ_ENGINE
+  after optimize() (nothing is added to the join then), and to TRUE otherwise.
+
+  TODO: why does this do IN-EXISTS conversion? Can't we unify it with
+    mergeable semi-joins? currently, convert_subq_to_sj() cannot fail to
+    convert (unless fatal errors)
+  RETURN
+    FALSE - Ok
+    TRUE  - Fatal error
+*/
+
+static bool convert_subq_to_jtbm(JOIN *parent_join, 
+                                 Item_in_subselect *subq_pred, 
+                                 bool *remove_item)
+{
+  SELECT_LEX *parent_lex= parent_join->select_lex;
+  List<TABLE_LIST> *emb_join_list= &parent_lex->top_join_list;
+  TABLE_LIST *emb_tbl_nest= NULL; // will change when we learn to handle outer joins
+  TABLE_LIST *tl;
+  double rows;
+  double read_time;
+  DBUG_ENTER("convert_subq_to_jtbm");
+
+  subq_pred->in_strategy &= ~SUBS_IN_TO_EXISTS;
+  subq_pred->optimize(&rows, &read_time);
+  /* NOTE(review): optimize() is not checked for failure here - confirm */
+  subq_pred->jtbm_read_time= read_time;
+  subq_pred->jtbm_record_count=rows;
+  subq_pred->is_jtbm_merged= TRUE;
+
+  if (subq_pred->engine->engine_type() != subselect_engine::HASH_SJ_ENGINE)
+  {
+    *remove_item= FALSE;
+    DBUG_RETURN(FALSE);
+  }
+
+  /* The engine is HASH_SJ_ENGINE: add a jtbm table for it to the parent join */
+  *remove_item= TRUE;
+
+  TABLE_LIST *jtbm;
+  char *tbl_alias;
+  if (!(tbl_alias= (char*)parent_join->thd->calloc(SUBQERY_TEMPTABLE_NAME_MAX_LEN)) ||
+      !(jtbm= alloc_join_nest(parent_join->thd))) //todo: this is not a join nest!
+  {
+    DBUG_RETURN(TRUE);
+  }
+
+  jtbm->join_list= emb_join_list;
+  jtbm->embedding= emb_tbl_nest;
+  jtbm->jtbm_subselect= subq_pred;
+  jtbm->nested_join= NULL;
+
+  /* Nests do not participate in those 'chains', so: */
+  /* jtbm->next_leaf= jtbm->next_local= jtbm->next_global == NULL*/
+  emb_join_list->push_back(jtbm);
+
+  /*
+    Add the jtbm table to the parent select's leaf_tables list, so that
+    make_join_statistics() and co. can find it.
+  */
+  parent_lex->leaf_tables.push_back(jtbm);
+
+  /*
+    Same as above for TABLE_LIST::next_local chain
+    (a theory: a next_local chain always starts with ::leaf_tables
+     because view's tables are inserted after the view)
+  */
+  for (tl= parent_lex->leaf_tables.head(); tl->next_local; tl= tl->next_local)
+  {}
+  tl->next_local= jtbm;
+
+  /* A theory: no need to re-connect the next_global chain */
+
+  subselect_hash_sj_engine *hash_sj_engine=
+    ((subselect_hash_sj_engine*)subq_pred->engine);
+  jtbm->table= hash_sj_engine->tmp_table;
+  /* The new table takes the parent's next free table number and map bit */
+  jtbm->table->tablenr= parent_join->table_count;
+  jtbm->table->map= table_map(1) << (parent_join->table_count);
+  jtbm->jtbm_table_no= jtbm->table->tablenr;
+
+  parent_join->table_count++;
+  DBUG_ASSERT(parent_join->table_count < MAX_TABLES);
+
+  Item *conds= hash_sj_engine->semi_join_conds;
+  conds->fix_after_pullout(parent_lex, &conds);
+
+  DBUG_EXECUTE("where", print_where(conds,"SJ-EXPR", QT_ORDINARY););
+
+  create_subquery_temptable_name(tbl_alias, hash_sj_engine->materialize_join->
+                                              select_lex->select_number);
+  jtbm->alias= tbl_alias;
+#if 0
+  /* Inject sj_on_expr into the parent's WHERE or ON */
+  if (emb_tbl_nest)
+  {
+    DBUG_ASSERT(0);
+    /*emb_tbl_nest->on_expr= and_items(emb_tbl_nest->on_expr, 
+                                     sj_nest->sj_on_expr);
+    emb_tbl_nest->on_expr->fix_fields(parent_join->thd, &emb_tbl_nest->on_expr);
+    */
+  }
+  else
+  {
+    /* Inject into the WHERE */
+    parent_join->conds= and_items(parent_join->conds, conds);
+    parent_join->conds->fix_fields(parent_join->thd, &parent_join->conds);
+    parent_join->select_lex->where= parent_join->conds;
+  }
+#endif
+  /* Don't unlink the child subselect, as the subquery will be used. */
+
+  DBUG_RETURN(FALSE);
+}
+
+
 static TABLE_LIST *alloc_join_nest(THD *thd)
 {
   TABLE_LIST *tbl;
@@ -898,7 +1493,6 @@ static TABLE_LIST *alloc_join_nest(THD *thd)
 }
 
 
-static
 void fix_list_after_tbl_changes(SELECT_LEX *new_parent, List<TABLE_LIST> *tlist)
 {
   List_iterator<TABLE_LIST> it(*tlist);
@@ -913,6 +1507,25 @@ void fix_list_after_tbl_changes(SELECT_LEX *new_parent, List<TABLE_LIST> *tlist)
 }
 
 
+/* Set JOIN_TAB::emb_sj_nest for tables in the list, descending into nests */
+static void set_emb_join_nest(List<TABLE_LIST> *tables, TABLE_LIST *emb_sj_nest)
+{
+  TABLE_LIST *cur;
+  List_iterator<TABLE_LIST> tbl_it(*tables);
+  while ((cur= tbl_it++))
+  {
+    /*
+      Note: check for nested_join first.
+       derived-merged tables have table!=NULL && table->reginfo==NULL.
+    */
+    NESTED_JOIN *nest= cur->nested_join;
+    if (nest)
+      set_emb_join_nest(&nest->join_list, emb_sj_nest);
+    else if (cur->table)
+      cur->table->reginfo.join_tab->emb_sj_nest= emb_sj_nest;
+  }
+}
+
 /*
   Pull tables out of semi-join nests, if possible
 
@@ -968,10 +1581,34 @@ int pull_out_semijoin_tables(JOIN *join)
   /* Try pulling out of the each of the semi-joins */
   while ((sj_nest= sj_list_it++))
   {
-    /* Action #1: Mark the constant tables to be pulled out */
-    table_map pulled_tables= 0;
     List_iterator<TABLE_LIST> child_li(sj_nest->nested_join->join_list);
     TABLE_LIST *tbl;
+
+    /*
+      Don't do table pull-out for nested joins (if we get nested joins here, it
+      means these are outer joins). It is theoretically possible to do pull-out
+      for some of the outer tables but we don't support this currently.
+    */
+    bool have_join_nest_children= FALSE;
+
+    set_emb_join_nest(&sj_nest->nested_join->join_list, sj_nest);
+
+    while ((tbl= child_li++))
+    {
+      if (tbl->nested_join)
+      {
+        have_join_nest_children= TRUE;
+        break;
+      }
+    }
+    
+    
+    table_map pulled_tables= 0;
+    if (have_join_nest_children)
+      goto skip;
+
+    /* Action #1: Mark the constant tables to be pulled out */
+    child_li.rewind();
     while ((tbl= child_li++))
     {
       if (tbl->table)
@@ -1029,7 +1666,7 @@ int pull_out_semijoin_tables(JOIN *join)
             pulled_a_table= TRUE;
             pulled_tables |= tbl->table->map;
             DBUG_PRINT("info", ("Table %s pulled out (reason: func dep)",
-                                tbl->table->alias));
+                                tbl->table->alias.c_ptr()));
             /*
               Pulling a table out of uncorrelated subquery in general makes
               makes it correlated. See the NOTE to this funtion. 
@@ -1043,6 +1680,7 @@ int pull_out_semijoin_tables(JOIN *join)
     } while (pulled_a_table);
  
     child_li.rewind();
+  skip:
     /*
       Action #3: Move the pulled out TABLE_LIST elements to the parents.
     */
@@ -1151,7 +1789,7 @@ bool optimize_semijoin_nests(JOIN *join, table_map all_table_map)
            sj_nest->sj_subq_pred->types_allow_materialization)
       {
         join->emb_sjm_nest= sj_nest;
-        if (choose_plan(join, all_table_map))
+        if (choose_plan(join, all_table_map &~join->const_table_map))
           DBUG_RETURN(TRUE); /* purecov: inspected */
         /*
           The best plan to run the subquery is now in join->best_positions,
@@ -1166,14 +1804,25 @@ bool optimize_semijoin_nests(JOIN *join, table_map all_table_map)
         sjm->tables= n_tables;
         sjm->is_used= FALSE;
         double subjoin_out_rows, subjoin_read_time;
-        get_partial_join_cost(join, n_tables,
-                              &subjoin_read_time, &subjoin_out_rows);
+
+        /*
+        join->get_partial_cost_and_fanout(n_tables + join->const_tables,
+                                          table_map(-1),
+                                          &subjoin_read_time, 
+                                          &subjoin_out_rows);
+        */
+        join->get_prefix_cost_and_fanout(n_tables, 
+                                         &subjoin_read_time,
+                                         &subjoin_out_rows);
 
         sjm->materialization_cost.convert_from_cost(subjoin_read_time);
         sjm->rows= subjoin_out_rows;
-
-        List<Item> &right_expr_list= 
-          sj_nest->sj_subq_pred->unit->first_select()->item_list;
+        
+        // Don't use the following list because it has "stale" items. use
+        // ref_pointer_array instead:
+        //
+        //List<Item> &right_expr_list= 
+        //  sj_nest->sj_subq_pred->unit->first_select()->item_list;
         /*
           Adjust output cardinality estimates. If the subquery has form
 
@@ -1188,18 +1837,23 @@ bool optimize_semijoin_nests(JOIN *join, table_map all_table_map)
                 "oe IN (SELECT t.key ...)" it is trivial. 
               - Functional dependencies between the tables in the semi-join
                 nest (the payoff is probably less here?)
+          
+          See also get_post_group_estimate().
         */
+        SELECT_LEX *subq_select= sj_nest->sj_subq_pred->unit->first_select();
         {
           for (uint i=0 ; i < join->const_tables + sjm->tables ; i++)
           {
             JOIN_TAB *tab= join->best_positions[i].table;
             join->map2table[tab->table->tablenr]= tab;
           }
-          List_iterator<Item> it(right_expr_list);
-          Item *item;
+          //List_iterator<Item> it(right_expr_list);
+          Item **ref_array= subq_select->ref_pointer_array;
+          Item **ref_array_end= ref_array + subq_select->item_list.elements; 
           table_map map= 0;
-          while ((item= it++))
-            map |= item->used_tables();
+          //while ((item= it++))
+          for (;ref_array < ref_array_end; ref_array++)
+            map |= (*ref_array)->used_tables();
           map= map & ~PSEUDO_TABLE_BITS;
           Table_map_iterator tm_it(map);
           int tableno;
@@ -1214,18 +1868,18 @@ bool optimize_semijoin_nests(JOIN *join, table_map all_table_map)
         /*
           Calculate temporary table parameters and usage costs
         */
-        uint rowlen= get_tmp_table_rec_length(right_expr_list);
-        double lookup_cost;
-        if (rowlen * subjoin_out_rows< join->thd->variables.max_heap_table_size)
-          lookup_cost= HEAP_TEMPTABLE_LOOKUP_COST;
-        else
-          lookup_cost= DISK_TEMPTABLE_LOOKUP_COST;
+        uint rowlen= get_tmp_table_rec_length(subq_select->ref_pointer_array,
+                                              subq_select->item_list.elements);
+        double lookup_cost= get_tmp_table_lookup_cost(join->thd,
+                                                      subjoin_out_rows, rowlen);
+        double write_cost= get_tmp_table_write_cost(join->thd,
+                                                    subjoin_out_rows, rowlen);
 
         /*
           Let materialization cost include the cost to write the data into the
           temporary table:
         */ 
-        sjm->materialization_cost.add_io(subjoin_out_rows, lookup_cost);
+        sjm->materialization_cost.add_io(subjoin_out_rows, write_cost);
         
         /*
           Set the cost to do a full scan of the temptable (will need this to 
@@ -1244,6 +1898,7 @@ bool optimize_semijoin_nests(JOIN *join, table_map all_table_map)
   DBUG_RETURN(FALSE);
 }
 
+
 /*
   Get estimated record length for semi-join materialization temptable
   
@@ -1261,13 +1916,15 @@ bool optimize_semijoin_nests(JOIN *join, table_map all_table_map)
     Length of the temptable record, in bytes
 */
 
-static uint get_tmp_table_rec_length(List<Item> &items)
+static uint get_tmp_table_rec_length(Item **p_items, uint elements)
 {
   uint len= 0;
   Item *item;
-  List_iterator<Item> it(items);
-  while ((item= it++))
+  //List_iterator<Item> it(items);
+  Item **p_item;
+  for (p_item= p_items; p_item < p_items + elements ; p_item++)
   {
+    item = *p_item;
     switch (item->result_type()) {
     case REAL_RESULT:
       len += sizeof(double);
@@ -1300,7 +1957,51 @@ static uint get_tmp_table_rec_length(List<Item> &items)
   return len;
 }
 
-//psergey-todo: is the below a kind of table elimination??
+
+/**
+  The cost of a lookup into a unique hash/btree index on a temporary table
+  with 'row_count' rows each of size 'row_size'.
+
+  @param thd  current query context
+  @param row_count  number of rows in the temp table
+  @param row_size   average size in bytes of the rows
+
+  @return  the cost of one lookup
+*/
+
+static double
+get_tmp_table_lookup_cost(THD *thd, double row_count, uint row_size)
+{
+  if (row_count * row_size > thd->variables.max_heap_table_size)
+    return (double) DISK_TEMPTABLE_LOOKUP_COST;
+  else
+    return (double) HEAP_TEMPTABLE_LOOKUP_COST;
+}
+
+/**
+  The cost of writing a row into a temporary table with 'row_count' unique
+  rows each of size 'row_size'.
+
+  @param thd  current query context
+  @param row_count  number of rows in the temp table
+  @param row_size   average size in bytes of the rows
+
+  @return  the cost of writing one row
+*/
+
+static double
+get_tmp_table_write_cost(THD *thd, double row_count, uint row_size)
+{
+  double lookup_cost= get_tmp_table_lookup_cost(thd, row_count, row_size);
+  /*
+    TODO:
+    This is an optimistic estimate. Add additional costs resulting from
+    actually writing the row to memory/disk and possible index reorganization.
+  */
+  return lookup_cost;
+}
+
+
 /*
   Check if table's KEYUSE elements have an eq_ref(outer_tables) candidate
 
@@ -1317,6 +2018,8 @@ static uint get_tmp_table_rec_length(List<Item> &items)
     Check again if it is feasible to factor common parts with constant table
     search
 
+    Also check if it's feasible to factor common parts with table elimination
+
   RETURN
     TRUE  - There exists an eq_ref(outer-tables) candidate
     FALSE - Otherwise
@@ -1325,16 +2028,21 @@ static uint get_tmp_table_rec_length(List<Item> &items)
 bool find_eq_ref_candidate(TABLE *table, table_map sj_inner_tables)
 {
   KEYUSE *keyuse= table->reginfo.join_tab->keyuse;
-  uint key;
 
   if (keyuse)
   {
-    while (1) /* For each key */
+    do
     {
-      key= keyuse->key;
-      KEY *keyinfo= table->key_info + key;
+      uint key= keyuse->key;
+      KEY *keyinfo;
       key_part_map bound_parts= 0;
-      if (keyinfo->flags & HA_NOSAME)
+      bool is_excluded_key= keyuse->is_for_hash_join(); 
+      if (!is_excluded_key)
+      {
+        keyinfo= table->key_info + key;
+        is_excluded_key= !test(keyinfo->flags & HA_NOSAME);
+      }
+      if (!is_excluded_key)
       {
         do  /* For all equalities on all key parts */
         {
@@ -1349,24 +2057,20 @@ bool find_eq_ref_candidate(TABLE *table, table_map sj_inner_tables)
 
         if (bound_parts == PREV_BITS(uint, keyinfo->key_parts))
           return TRUE;
-        if (keyuse->table != table)
-          return FALSE;
       }
       else
       {
         do
         {
           keyuse++;
-          if (keyuse->table != table)
-            return FALSE;
-        }
-        while (keyuse->key == key);
+        } while (keyuse->key == key && keyuse->table == table);
       }
-    }
+    } while (keyuse->table == table);
   }
   return FALSE;
 }
 
+
 /*
   Do semi-join optimization step after we've added a new tab to join prefix
 
@@ -1423,15 +2127,17 @@ void advance_sj_state(JOIN *join, table_map remaining_tables,
   TABLE_LIST *emb_sj_nest;
   POSITION *pos= join->positions + idx;
   remaining_tables &= ~new_join_tab->table->map;
+  bool disable_jbuf= join->thd->variables.join_cache_level == 0;
 
   pos->prefix_cost.convert_from_cost(*current_read_time);
   pos->prefix_record_count= *current_record_count;
   pos->sj_strategy= SJ_OPT_NONE;
   
+  pos->prefix_dups_producing_tables= join->cur_dups_producing_tables;
   /* Initialize the state or copy it from prev. tables */
   if (idx == join->const_tables)
   {
-    pos->first_firstmatch_table= MAX_TABLES;
+    pos->invalidate_firstmatch_prefix();
     pos->first_loosescan_table= MAX_TABLES; 
     pos->dupsweedout_tables= 0;
     pos->sjm_scan_need_tables= 0;
@@ -1466,7 +2172,8 @@ void advance_sj_state(JOIN *join, table_map remaining_tables,
   table_map handled_by_fm_or_ls= 0;
   /* FirstMatch Strategy */
   if (new_join_tab->emb_sj_nest &&
-      optimizer_flag(join->thd, OPTIMIZER_SWITCH_FIRSTMATCH))
+      optimizer_flag(join->thd, OPTIMIZER_SWITCH_FIRSTMATCH) &&
+      !join->outer_join)
   {
     const table_map outer_corr_tables=
       new_join_tab->emb_sj_nest->nested_join->sj_corr_tables |
@@ -1496,7 +2203,7 @@ void advance_sj_state(JOIN *join, table_map remaining_tables,
       pos->first_firstmatch_rtbl= remaining_tables;
     }
 
-    if (pos->first_firstmatch_table != MAX_TABLES)
+    if (pos->in_firstmatch_prefix())
     {
       if (outer_corr_tables & pos->first_firstmatch_rtbl)
       {
@@ -1504,7 +2211,7 @@ void advance_sj_state(JOIN *join, table_map remaining_tables,
           Trying to add an sj-inner table whose sj-nest has an outer correlated 
           table that was not in the prefix. This means FirstMatch can't be used.
         */
-        pos->first_firstmatch_table= MAX_TABLES;
+        pos->invalidate_firstmatch_prefix();
       }
       else
       {
@@ -1512,17 +2219,17 @@ void advance_sj_state(JOIN *join, table_map remaining_tables,
         pos->firstmatch_need_tables|= sj_inner_tables;
       }
     
-      if (!(pos->firstmatch_need_tables & remaining_tables))
+      if (pos->in_firstmatch_prefix() && 
+          !(pos->firstmatch_need_tables & remaining_tables))
       {
         /*
           Got a complete FirstMatch range.
             Calculate correct costs and fanout
         */
-        double reopt_cost, reopt_rec_count, sj_inner_fanout;
         optimize_wo_join_buffering(join, pos->first_firstmatch_table, idx,
                                    remaining_tables, FALSE, idx,
-                                   &reopt_rec_count, &reopt_cost, 
-                                   &sj_inner_fanout);
+                                   current_record_count, 
+                                   current_read_time);
         /*
           We don't yet know what are the other strategies, so pick the
           FirstMatch.
@@ -1533,8 +2240,6 @@ void advance_sj_state(JOIN *join, table_map remaining_tables,
           alternate POSITIONs after we've picked the best QEP.
         */
         pos->sj_strategy= SJ_OPT_FIRST_MATCH;
-        *current_read_time=    reopt_cost;
-        *current_record_count= reopt_rec_count / sj_inner_fanout;
         handled_by_fm_or_ls=  pos->firstmatch_need_tables;
       }
     }
@@ -1563,7 +2268,7 @@ void advance_sj_state(JOIN *join, table_map remaining_tables,
       If we got an option to use LooseScan for the current table, start
       considering using LooseScan strategy
     */
-    if (loose_scan_pos->read_time != DBL_MAX)
+    if (loose_scan_pos->read_time != DBL_MAX && !join->outer_join)
     {
       pos->first_loosescan_table= idx;
       pos->loosescan_need_tables=
@@ -1583,7 +2288,6 @@ void advance_sj_state(JOIN *join, table_map remaining_tables,
       first=join->positions + pos->first_loosescan_table; 
       uint n_tables= my_count_bits(first->table->emb_sj_nest->sj_inner_tables);
       /* Got a complete LooseScan range. Calculate its cost */
-      double reopt_cost, reopt_rec_count, sj_inner_fanout;
       /*
         The same problem as with FirstMatch - we need to save POSITIONs
         somewhere but reserving space for all cases would require too
@@ -1592,9 +2296,10 @@ void advance_sj_state(JOIN *join, table_map remaining_tables,
       optimize_wo_join_buffering(join, pos->first_loosescan_table, idx,
                                  remaining_tables, 
                                  TRUE,  //first_alt
-                                 pos->first_loosescan_table + n_tables,
-                                 &reopt_rec_count, 
-                                 &reopt_cost, &sj_inner_fanout);
+                                 disable_jbuf ? join->table_count :
+                                   pos->first_loosescan_table + n_tables,
+                                 current_record_count,
+                                 current_read_time);
       /*
         We don't yet have any other strategies that could handle this
         semi-join nest (the other options are Duplicate Elimination or
@@ -1603,8 +2308,6 @@ void advance_sj_state(JOIN *join, table_map remaining_tables,
         LooseScan.
       */
       pos->sj_strategy= SJ_OPT_LOOSE_SCAN;
-      *current_read_time=    reopt_cost;
-      *current_record_count= reopt_rec_count / sj_inner_fanout;
       handled_by_fm_or_ls= first->table->emb_sj_nest->sj_inner_tables;
     }
   }
@@ -1733,8 +2436,8 @@ void advance_sj_state(JOIN *join, table_map remaining_tables,
     /* Need to re-run best-access-path as we prefix_rec_count has changed */
     for (i= first_tab + mat_info->tables; i <= idx; i++)
     {
-      best_access_path(join, join->positions[i].table, rem_tables, i, FALSE,
-                       prefix_rec_count, &curpos, &dummy);
+      best_access_path(join, join->positions[i].table, rem_tables, i,
+                       disable_jbuf, prefix_rec_count, &curpos, &dummy);
       prefix_rec_count *= curpos.records_read;
       prefix_cost += curpos.read_time;
     }
@@ -1773,6 +2476,15 @@ void advance_sj_state(JOIN *join, table_map remaining_tables,
                                  nest->nested_join->sj_depends_on |
                                  nest->nested_join->sj_corr_tables;
     }
+    
+    if (pos->dupsweedout_tables)
+    {
+      /* we're in the process of constructing a DuplicateWeedout range */
+      TABLE_LIST *emb= new_join_tab->table->pos_in_table_list->embedding;
+      /* and we've entered an inner side of an outer join*/
+      if (emb && emb->on_expr)
+        pos->dupsweedout_tables |= emb->nested_join->used_tables;
+    }
 
     if (pos->dupsweedout_tables && 
         !(remaining_tables &
@@ -1835,15 +2547,15 @@ void advance_sj_state(JOIN *join, table_map remaining_tables,
         - sj_inner_fanout*sj_outer_fanout  lookups.
 
       */
-      double one_lookup_cost;
-      if (sj_outer_fanout*temptable_rec_size > 
-          join->thd->variables.max_heap_table_size)
-        one_lookup_cost= DISK_TEMPTABLE_LOOKUP_COST;
-      else
-        one_lookup_cost= HEAP_TEMPTABLE_LOOKUP_COST;
+      double one_lookup_cost= get_tmp_table_lookup_cost(join->thd,
+                                                        sj_outer_fanout,
+                                                        temptable_rec_size);
+      double one_write_cost= get_tmp_table_write_cost(join->thd,
+                                                      sj_outer_fanout,
+                                                      temptable_rec_size);
 
       double write_cost= join->positions[first_tab].prefix_record_count* 
-                         sj_outer_fanout * one_lookup_cost;
+                         sj_outer_fanout * one_write_cost;
       double full_lookup_cost= join->positions[first_tab].prefix_record_count* 
                                sj_outer_fanout* sj_inner_fanout * 
                                one_lookup_cost;
@@ -1861,7 +2573,7 @@ void advance_sj_state(JOIN *join, table_map remaining_tables,
       {
         pos->sj_strategy= SJ_OPT_DUPS_WEEDOUT;
         *current_read_time= dups_cost;
-        *current_record_count= *current_record_count / sj_inner_fanout;
+        *current_record_count= prefix_rec_count * sj_outer_fanout;
         join->cur_dups_producing_tables &= ~dups_removed_fanout;
       }
     }
@@ -1887,6 +2599,8 @@ void restore_prev_sj_state(const table_map remaining_tables,
       tab->join->cur_sj_inner_tables &= ~emb_sj_nest->sj_inner_tables;
     }
   }
+  POSITION *pos= tab->join->positions + idx;
+  tab->join->cur_dups_producing_tables= pos->prefix_dups_producing_tables;
 }
 
 
@@ -2029,7 +2743,7 @@ at_sjmat_pos(const JOIN *join, table_map remaining_tables, const JOIN_TAB *tab,
 
 void fix_semijoin_strategies_for_picked_join_order(JOIN *join)
 {
-  uint table_count=join->tables;
+  uint table_count=join->table_count;
   uint tablenr;
   table_map remaining_tables= 0;
   table_map handled_tabs= 0;
@@ -2091,8 +2805,9 @@ void fix_semijoin_strategies_for_picked_join_order(JOIN *join)
       join->cur_sj_inner_tables= 0;
       for (i= first + sjm->tables; i <= tablenr; i++)
       {
-        best_access_path(join, join->best_positions[i].table, rem_tables, i, FALSE,
-                         prefix_rec_count, join->best_positions + i, &dummy);
+        best_access_path(join, join->best_positions[i].table, rem_tables, i, 
+                         FALSE, prefix_rec_count,
+                         join->best_positions + i, &dummy);
         prefix_rec_count *= join->best_positions[i].records_read;
         rem_tables &= ~join->best_positions[i].table->table->map;
       }
@@ -2190,9 +2905,11 @@ void fix_semijoin_strategies_for_picked_join_order(JOIN *join)
     remaining_tables |= s->table->map;
     //s->sj_strategy= pos->sj_strategy;
     join->join_tab[first].sj_strategy= join->best_positions[first].sj_strategy;
+    join->join_tab[first].n_sj_tables= join->best_positions[first].n_sj_tables;
   }
 }
 
+
 /*
   Setup semi-join materialization strategy for one semi-join nest
   
@@ -2214,26 +2931,31 @@ void fix_semijoin_strategies_for_picked_join_order(JOIN *join)
     TRUE   Error
 */
 
-bool setup_sj_materialization(JOIN_TAB *tab)
+bool setup_sj_materialization_part1(JOIN_TAB *sjm_tab)
 {
-  uint i;
   DBUG_ENTER("setup_sj_materialization");
+  JOIN_TAB *tab= sjm_tab->bush_children->start;
   TABLE_LIST *emb_sj_nest= tab->table->pos_in_table_list->embedding;
   SJ_MATERIALIZATION_INFO *sjm= emb_sj_nest->sj_mat_info;
   THD *thd= tab->join->thd;
   /* First the calls come to the materialization function */
-  List<Item> &item_list= emb_sj_nest->sj_subq_pred->unit->first_select()->item_list;
-
+  //List<Item> &item_list= emb_sj_nest->sj_subq_pred->unit->first_select()->item_list;
+  
+  DBUG_ASSERT(sjm->is_used);
   /* 
     Set up the table to write to, do as select_union::create_result_table does
   */
   sjm->sjm_table_param.init();
-  sjm->sjm_table_param.field_count= item_list.elements;
   sjm->sjm_table_param.bit_fields_as_long= TRUE;
-  List_iterator<Item> it(item_list);
-  Item *right_expr;
-  while((right_expr= it++))
-    sjm->sjm_table_cols.push_back(right_expr);
+  //List_iterator<Item> it(item_list);
+  SELECT_LEX *subq_select= emb_sj_nest->sj_subq_pred->unit->first_select();
+  Item **p_item= subq_select->ref_pointer_array;
+  Item **p_end= p_item + subq_select->item_list.elements;
+  //while((right_expr= it++))
+  for(;p_item != p_end; p_item++)
+    sjm->sjm_table_cols.push_back(*p_item);
+
+  sjm->sjm_table_param.field_count= subq_select->item_list.elements;
 
   if (!(sjm->table= create_tmp_table(thd, &sjm->sjm_table_param, 
                                      sjm->sjm_table_cols, (ORDER*) 0, 
@@ -2245,10 +2967,29 @@ bool setup_sj_materialization(JOIN_TAB *tab)
     DBUG_RETURN(TRUE); /* purecov: inspected */
   sjm->table->file->extra(HA_EXTRA_WRITE_CACHE);
   sjm->table->file->extra(HA_EXTRA_IGNORE_DUP_KEY);
+
   tab->join->sj_tmp_tables.push_back(sjm->table);
   tab->join->sjm_info_list.push_back(sjm);
   
   sjm->materialized= FALSE;
+  sjm_tab->table= sjm->table;
+  sjm->table->pos_in_table_list= emb_sj_nest;
+ 
+  DBUG_RETURN(FALSE);
+}
+
+
+bool setup_sj_materialization_part2(JOIN_TAB *sjm_tab)
+{
+  DBUG_ENTER("setup_sj_materialization_part2");
+  JOIN_TAB *tab= sjm_tab->bush_children->start;
+  TABLE_LIST *emb_sj_nest= tab->table->pos_in_table_list->embedding;
+  SJ_MATERIALIZATION_INFO *sjm= emb_sj_nest->sj_mat_info;
+  THD *thd= tab->join->thd;
+  uint i;
+  //List<Item> &item_list= emb_sj_nest->sj_subq_pred->unit->first_select()->item_list;
+  //List_iterator<Item> it(item_list);
+
   if (!sjm->is_sj_scan)
   {
     KEY           *tmp_key; /* The only index on the temporary table. */
@@ -2261,8 +3002,7 @@ bool setup_sj_materialization(JOIN_TAB *tab)
       temptable.
     */
     TABLE_REF *tab_ref;
-    if (!(tab_ref= (TABLE_REF*) thd->alloc(sizeof(TABLE_REF))))
-      DBUG_RETURN(TRUE); /* purecov: inspected */
+    tab_ref= &sjm_tab->ref;
     tab_ref->key= 0; /* The only temp table index. */
     tab_ref->key_length= tmp_key->key_length;
     if (!(tab_ref->key_buff=
@@ -2295,12 +3035,22 @@ bool setup_sj_materialization(JOIN_TAB *tab)
                                       use that information instead.
                                    */
                                    cur_ref_buff + null_count,
-                                   null_count ? tab_ref->key_buff : 0,
+                                   null_count ? cur_ref_buff : 0,
                                    cur_key_part->length, tab_ref->items[i],
                                    FALSE);
       cur_ref_buff+= cur_key_part->store_length;
     }
     *ref_key= NULL; /* End marker. */
+      
+    /*
+      We don't ever have guarded conditions for SJM tables, but code at SQL
+      layer depends on cond_guards array being alloced.
+    */
+    if (!(tab_ref->cond_guards= (bool**) thd->calloc(sizeof(uint*)*tmp_key_parts)))
+    {
+      DBUG_RETURN(TRUE);
+    }
+
     tab_ref->key_err= 1;
     tab_ref->key_parts= tmp_key_parts;
     sjm->tab_ref= tab_ref;
@@ -2320,6 +3070,8 @@ bool setup_sj_materialization(JOIN_TAB *tab)
     if (!(sjm->in_equality= create_subq_in_equalities(thd, sjm,
                                                       emb_sj_nest->sj_subq_pred)))
       DBUG_RETURN(TRUE); /* purecov: inspected */
+    sjm_tab->type= JT_EQ_REF;
+    sjm_tab->select_cond= sjm->in_equality;
   }
   else
   {
@@ -2351,12 +3103,14 @@ bool setup_sj_materialization(JOIN_TAB *tab)
       in the record buffers for the source tables. 
     */
     sjm->copy_field= new Copy_field[sjm->sjm_table_cols.elements];
-    it.rewind();
+    //it.rewind();
+    Item **p_item= emb_sj_nest->sj_subq_pred->unit->first_select()->ref_pointer_array;
     for (uint i=0; i < sjm->sjm_table_cols.elements; i++)
     {
       bool dummy;
       Item_equal *item_eq;
-      Item *item= (it++)->real_item();
+      //Item *item= (it++)->real_item();
+      Item *item= (*(p_item++))->real_item();
       DBUG_ASSERT(item->type() == Item::FIELD_ITEM);
       Field *copy_to= ((Item_field*)item)->field;
       /*
@@ -2372,9 +3126,11 @@ bool setup_sj_materialization(JOIN_TAB *tab)
          then substitute_for_best_equal_field() will change the conditions
          according to the join order:
 
-           it1
-           it2    it1.col=it2.col
-           ot     cond(it1.col)
+         table | attached condition
+         ------+--------------------
+          it1  |
+          it2  | it1.col=it2.col
+          ot   | cond(it1.col)
 
          although we've originally had "SELECT it2.col", conditions attached 
          to subsequent outer tables will refer to it1.col, so SJM-Scan will
@@ -2388,13 +3144,17 @@ bool setup_sj_materialization(JOIN_TAB *tab)
 
       if (item_eq)
       {
-        List_iterator<Item_field> it(item_eq->fields);
-        Item_field *item;
+        List_iterator<Item> it(item_eq->equal_items);
+        /* We're interested in field items only */
+        if (item_eq->get_const())
+          it++;
+        Item *item;
         while ((item= it++))
         {
           if (!(item->used_tables() & ~emb_sj_nest->sj_inner_tables))
           {
-            copy_to= item->field;
+            DBUG_ASSERT(item->real_item()->type() == Item::FIELD_ITEM);
+            copy_to= ((Item_field *) (item->real_item()))->field;
             break;
           }
         }
@@ -2403,8 +3163,18 @@ bool setup_sj_materialization(JOIN_TAB *tab)
       /* The write_set for source tables must be set up to allow the copying */
       bitmap_set_bit(copy_to->table->write_set, copy_to->field_index);
     }
+    sjm_tab->type= JT_ALL;
+
+    /* Initialize full scan */
+    sjm_tab->read_first_record= join_read_record_no_init;
+    sjm_tab->read_record.copy_field= sjm->copy_field;
+    sjm_tab->read_record.copy_field_end= sjm->copy_field +
+                                         sjm->sjm_table_cols.elements;
+    sjm_tab->read_record.read_record= rr_sequential_and_unpack;
   }
 
+  sjm_tab->bush_children->end[-1].next_select= end_sj_materialize;
+
   DBUG_RETURN(FALSE);
 }
 
@@ -2610,7 +3380,8 @@ TABLE *create_duplicate_weedout_tmp_table(THD *thd,
   thd->mem_root= &table->mem_root;
 
   table->field=reg_field;
-  table->alias= "weedout-tmp";
+  table->alias.set("weedout-tmp", sizeof("weedout-tmp")-1,
+                   table_alias_charset);
   table->reginfo.lock_type=TL_WRITE;	/* Will be updated */
   table->db_stat=HA_OPEN_KEYFILE+HA_OPEN_RNDFILE;
   table->map=1;
@@ -2625,7 +3396,6 @@ TABLE *create_duplicate_weedout_tmp_table(THD *thd,
   init_tmp_table_share(thd, share, "", 0, tmpname, tmpname);
   share->blob_field= blob_field;
   share->blob_ptr_size= portable_sizeof_char_ptr;
-  share->db_low_byte_first=1;                // True for HEAP and MyISAM
   share->table_charset= NULL;
   share->primary_key= MAX_KEY;               // Indicate no primary key
   share->keys_for_keyread.init();
@@ -2742,12 +3512,9 @@ TABLE *create_duplicate_weedout_tmp_table(THD *thd,
     else
       recinfo->type=FIELD_NORMAL;
 
-    field->table_name= &table->alias;
+    field->set_table_name(&table->alias);
   }
 
-  //param->recinfo=recinfo;
-  //store_record(table,s->default_values);        // Make empty default record
-
   if (thd->variables.tmp_table_size == ~ (ulonglong) 0)		// No limit
     share->max_rows= ~(ha_rows) 0;
   else
@@ -2793,12 +3560,15 @@ TABLE *create_duplicate_weedout_tmp_table(THD *thd,
     }
   }
 
-  if (thd->is_fatal_error)				// If end of memory
+  if (thd->is_fatal_error)			// If end of memory
     goto err;
   share->db_record_offset= 1;
+  table->no_rows= 1;              		// We don't need the data
+
+  // recinfo must point after last field
+  recinfo++;
   if (share->db_type() == TMP_ENGINE_HTON)
   {
-    recinfo++;
     if (create_internal_tmp_table(table, keyinfo, start_recinfo, &recinfo, 0, 0))
       goto err;
   }
@@ -2874,7 +3644,6 @@ int do_sj_dups_weedout(THD *thd, SJ_TMP_TABLE *sjtbl)
   }
 
   ptr= sjtbl->tmp_table->record[0] + 1;
-  nulls_ptr= ptr;
 
   /* Put the the rowids tuple into table->record[0]: */
 
@@ -2890,6 +3659,7 @@ int do_sj_dups_weedout(THD *thd, SJ_TMP_TABLE *sjtbl)
     ptr += 2;
   }
 
+  nulls_ptr= ptr;
   // 2. Zero the null bytes 
   if (sjtbl->null_bytes)
   {
@@ -2914,7 +3684,7 @@ int do_sj_dups_weedout(THD *thd, SJ_TMP_TABLE *sjtbl)
     }
   }
 
-  error= sjtbl->tmp_table->file->ha_write_row(sjtbl->tmp_table->record[0]);
+  error= sjtbl->tmp_table->file->ha_write_tmp_row(sjtbl->tmp_table->record[0]);
   if (error)
   {
     /* create_internal_tmp_table_from_heap will generate error if needed */
@@ -3031,17 +3801,19 @@ int setup_semijoin_dups_elimination(JOIN *join, ulonglong options,
   uint i;
   THD *thd= join->thd;
   DBUG_ENTER("setup_semijoin_dups_elimination");
-
-  for (i= join->const_tables ; i < join->tables; )
+  
+  POSITION *pos= join->best_positions + join->const_tables;
+  for (i= join->const_tables ; i < join->top_join_tab_count; )
   {
     JOIN_TAB *tab=join->join_tab + i;
-    POSITION *pos= join->best_positions + i;
+    //POSITION *pos= join->best_positions + i;
     uint keylen, keyno;
     switch (pos->sj_strategy) {
       case SJ_OPT_MATERIALIZE:
       case SJ_OPT_MATERIALIZE_SCAN:
         /* Do nothing */
-        i+= pos->n_sj_tables;
+        i+= 1;// It used to be pos->n_sj_tables, but now they are embedded in a nest
+        pos += pos->n_sj_tables;
         break;
       case SJ_OPT_LOOSE_SCAN:
       {
@@ -3058,6 +3830,7 @@ int setup_semijoin_dups_elimination(JOIN *join, ulonglong options,
         if (pos->n_sj_tables > 1) 
           tab[pos->n_sj_tables - 1].do_firstmatch= tab;
         i+= pos->n_sj_tables;
+        pos+= pos->n_sj_tables;
         break;
       }
       case SJ_OPT_DUPS_WEEDOUT:
@@ -3121,7 +3894,7 @@ int setup_semijoin_dups_elimination(JOIN *join, ulonglong options,
         SJ_TMP_TABLE *sjtbl;
         if (jt_rowid_offset) /* Temptable has at least one rowid */
         {
-          uint tabs_size= (last_tab - sjtabs) * sizeof(SJ_TMP_TABLE::TAB);
+          size_t tabs_size= (last_tab - sjtabs) * sizeof(SJ_TMP_TABLE::TAB);
           if (!(sjtbl= (SJ_TMP_TABLE*)thd->alloc(sizeof(SJ_TMP_TABLE))) ||
               !(sjtbl->tabs= (SJ_TMP_TABLE::TAB*) thd->alloc(tabs_size)))
             DBUG_RETURN(TRUE); /* purecov: inspected */
@@ -3155,6 +3928,7 @@ int setup_semijoin_dups_elimination(JOIN *join, ulonglong options,
         join->join_tab[i + pos->n_sj_tables - 1].check_weed_out_table= sjtbl;
 
         i+= pos->n_sj_tables;
+        pos+= pos->n_sj_tables;
         break;
       }
       case SJ_OPT_FIRST_MATCH:
@@ -3177,10 +3951,12 @@ int setup_semijoin_dups_elimination(JOIN *join, ulonglong options,
         }
         j[-1].do_firstmatch= jump_to;
         i+= pos->n_sj_tables;
+        pos+= pos->n_sj_tables;
         break;
       }
       case SJ_OPT_NONE:
         i++;
+        pos++;
         break;
     }
   }
@@ -3361,13 +4137,27 @@ int rewrite_to_index_subquery_engine(JOIN *join)
   JOIN_TAB* join_tab=join->join_tab;
   SELECT_LEX_UNIT *unit= join->unit;
   DBUG_ENTER("rewrite_to_index_subquery_engine");
+
   /*
     is this simple IN subquery?
   */
+  /* TODO: In order to use these more efficient subquery engines in more cases,
+     the following problems need to be solved:
+     - the code that removes GROUP BY (group_list), also adds an ORDER BY
+       (order), thus GROUP BY queries (almost?) never pass through this branch.
+       Solution: remove the test below '!join->order', because we remove the
+       ORDER clause for subqueries anyway.
+     - in order to set a more efficient engine, the optimizer needs to both
+       decide to remove GROUP BY, *and* select one of the JT_[EQ_]REF[_OR_NULL]
+       access methods, *and* loose scan should be more expensive or
+       inapplicable. When is that possible?
+     - Consider expanding the applicability of this rewrite for loose scan
+       for group by queries.
+  */
   if (!join->group_list && !join->order &&
       join->unit->item && 
       join->unit->item->substype() == Item_subselect::IN_SUBS &&
-      join->tables == 1 && join->conds &&
+      join->table_count == 1 && join->conds &&
       !join->unit->is_union())
   {
     if (!join->having)
@@ -3504,3 +4294,438 @@ static void remove_subq_pushed_predicates(JOIN *join, Item **where)
 }
 
 
+
+
+/**
+  Optimize all subqueries of a query that were not flattened into a semijoin.
+
+  @details
+  Optimize all immediate children subqueries of a query.
+
+  This phase must be called after substitute_for_best_equal_field() because
+  that function may replace items with other items from a multiple equality,
+  and we need to reference the correct items in the index access method of the
+  IN predicate.
+
+  @return Operation status
+  @retval FALSE     success.
+  @retval TRUE      error occurred.
+*/
+
+bool JOIN::optimize_unflattened_subqueries()
+{
+  return select_lex->optimize_unflattened_subqueries();
+}
+
+
+/*
+  Join tab execution startup function.
+
+  SYNOPSIS
+    join_tab_execution_startup()
+      tab  Join tab to perform startup actions for
+
+  DESCRIPTION
+    Join tab execution startup function. This is different from
+    tab->read_first_record in the regard that this has actions that are to be
+    done once per join execution.
+
+    Currently there are only two possible startup functions, so we have them
+    both here inside if (...) branches. In future we could switch to function
+    pointers.
+
+  TODO: consider moving this together with JOIN_TAB::preread_init
+  
+  RETURN 
+    NESTED_LOOP_OK - OK
+    NESTED_LOOP_ERROR| NESTED_LOOP_KILLED - Error, abort the join execution
+*/
+
+enum_nested_loop_state join_tab_execution_startup(JOIN_TAB *tab)
+{
+  Item_in_subselect *in_subs;
+  DBUG_ENTER("join_tab_execution_startup");
+  
+  if (tab->table->pos_in_table_list && 
+      (in_subs= tab->table->pos_in_table_list->jtbm_subselect))
+  {
+    /* It's a non-merged SJM nest */
+    DBUG_ASSERT(in_subs->engine->engine_type() ==
+                subselect_engine::HASH_SJ_ENGINE);
+    subselect_hash_sj_engine *hash_sj_engine=
+      ((subselect_hash_sj_engine*)in_subs->engine);
+    if (!hash_sj_engine->is_materialized)
+    {
+      hash_sj_engine->materialize_join->exec();
+      hash_sj_engine->is_materialized= TRUE; 
+
+      if (hash_sj_engine->materialize_join->error || tab->join->thd->is_fatal_error)
+        DBUG_RETURN(NESTED_LOOP_ERROR);
+    }
+  }
+  else if (tab->bush_children)
+  {
+    /* It's a merged SJM nest */
+    enum_nested_loop_state rc;
+    SJ_MATERIALIZATION_INFO *sjm= tab->bush_children->start->emb_sj_nest->sj_mat_info;
+
+    if (!sjm->materialized)
+    {
+      JOIN *join= tab->join;
+      JOIN_TAB *join_tab= tab->bush_children->start;
+      JOIN_TAB *save_return_tab= join->return_tab;
+      /*
+        Now run the join for the inner tables. The first call is to run the
+        join, the second one is to signal EOF (this is essential for some
+        join strategies, e.g. it will make join buffering flush the records)
+      */
+      if ((rc= sub_select(join, join_tab, FALSE/* no EOF */)) < 0 ||
+          (rc= sub_select(join, join_tab, TRUE/* now EOF */)) < 0)
+      {
+        join->return_tab= save_return_tab;
+        DBUG_RETURN(rc); /* it's NESTED_LOOP_(ERROR|KILLED)*/
+      }
+      join->return_tab= save_return_tab;
+      sjm->materialized= TRUE;
+    }
+  }
+
+  DBUG_RETURN(NESTED_LOOP_OK);
+}
+
+
+/**
+  Choose an optimal strategy to execute an IN/ALL/ANY subquery predicate
+  based on cost.
+
+  @param join_tables  the set of tables joined in the subquery
+
+  @notes
+  The method chooses between the materialization and IN=>EXISTS rewrite
+  strategies for the execution of a non-flattened subquery IN predicate.
+  The cost-based decision is made as follows:
+
+  1. compute materialize_strategy_cost based on the unmodified subquery
+  2. reoptimize the subquery taking into account the IN-EXISTS predicates
+  3. compute in_exists_strategy_cost based on the reoptimized plan
+  4. compare and set the cheaper strategy
+     if (materialize_strategy_cost >= in_exists_strategy_cost)
+       in_strategy = IN_TO_EXISTS
+     else
+       in_strategy = MATERIALIZATION
+  5. if in_strategy = MATERIALIZATION and it is not possible to initialize it
+       revert to IN_TO_EXISTS
+  6. if (in_strategy == MATERIALIZATION)
+       revert the subquery plan to the original one before reoptimizing
+     else
+       inject the IN=>EXISTS predicates into the new EXISTS subquery plan
+
+  The implementation itself is a bit more complicated because it takes into
+  account two more factors:
+  - whether the user allowed both strategies through an optimizer_switch, and
+  - if materialization was the cheaper strategy, whether it can be executed
+    or not.
+
+  @retval FALSE     success.
+  @retval TRUE      error occurred.
+*/
+
+bool JOIN::choose_subquery_plan(table_map join_tables)
+{
+  Join_plan_state save_qep; /* The original QEP of the subquery. */
+  enum_reopt_result reopt_result= REOPT_NONE;
+  Item_in_subselect *in_subs;
+
+  if (is_in_subquery())
+  {
+    in_subs= (Item_in_subselect*) unit->item;
+    if (in_subs->create_in_to_exists_cond(this))
+      return true;
+  }
+  else
+    return false;
+
+  DBUG_ASSERT(in_subs->in_strategy); /* A strategy must be chosen earlier. */
+  DBUG_ASSERT(in_to_exists_where || in_to_exists_having);
+  DBUG_ASSERT(!in_to_exists_where || in_to_exists_where->fixed);
+  DBUG_ASSERT(!in_to_exists_having || in_to_exists_having->fixed);
+
+  /*
+    Compute and compare the costs of materialization and in-exists if both
+    strategies are possible and allowed by the user (checked during the prepare
+    phase).
+  */
+  if (in_subs->in_strategy & SUBS_MATERIALIZATION &&
+      in_subs->in_strategy & SUBS_IN_TO_EXISTS)
+  {
+    JOIN *outer_join;
+    JOIN *inner_join= this;
+    /* Number of unique value combinations filtered by the IN predicate. */
+    double outer_lookup_keys;
+    /* Cost and row count of the unmodified subquery. */
+    double inner_read_time_1, inner_record_count_1;
+    /* Cost of the subquery with injected IN-EXISTS predicates. */
+    double inner_read_time_2;
+    /* The cost to compute IN via materialization. */
+    double materialize_strategy_cost;
+    /* The cost of the IN->EXISTS strategy. */
+    double in_exists_strategy_cost;
+    double dummy;
+
+    /*
+      A. Estimate the number of rows of the outer table that will be filtered
+      by the IN predicate.
+    */
+    outer_join= unit->outer_select() ? unit->outer_select()->join : NULL;
+    if (outer_join && outer_join->table_count > 0)
+    {
+      /*
+        The index of the last JOIN_TAB in the outer JOIN where in_subs is
+        attached (pushed to).
+      */
+      uint max_outer_join_tab_idx;
+      /*
+        Make_cond_for_table is called for predicates only in the WHERE/ON
+        clauses. In all other cases, predicates are not pushed to any
+        JOIN_TAB, and their join_tab_idx remains MAX_TABLES. Such predicates
+        are evaluated for each complete row of the outer join.
+      */
+      max_outer_join_tab_idx= (in_subs->get_join_tab_idx() == MAX_TABLES) ?
+                               outer_join->table_count - 1:
+                               in_subs->get_join_tab_idx();
+      /*
+        TODO:
+        Currently outer_lookup_keys is computed as the number of rows in
+        the partial join including the JOIN_TAB where the IN predicate is
+        pushed to. In the general case this is a gross overestimate because
+        due to caching we are interested only in the number of unique keys.
+        The search key may be formed by columns from much fewer than all
+        tables in the partial join. Example:
+        select * from t1, t2 where t1.c1 = t2.key AND t2.c2 IN (select ...);
+        If the join order is t1, t2, the number of unique lookup keys is ~ the
+        number of unique values of t2.c2 in the partial join t1 join t2.
+      */
+      outer_join->get_partial_cost_and_fanout(max_outer_join_tab_idx,
+                                              table_map(-1),
+                                              &dummy,
+                                              &outer_lookup_keys);
+    }
+    else
+    {
+      /*
+        TODO: outer_join can be NULL for DELETE statements.
+        How to compute its cost?
+      */
+      outer_lookup_keys= 1;
+    }
+
+    /*
+      B. Estimate the cost and number of records of the subquery both
+      unmodified, and with injected IN->EXISTS predicates.
+    */
+    inner_read_time_1= inner_join->best_read;
+    inner_record_count_1= inner_join->record_count;
+
+    if (in_to_exists_where && const_tables != table_count)
+    {
+      /*
+        Re-optimize and cost the subquery taking into account the IN-EXISTS
+        conditions.
+      */
+      reopt_result= reoptimize(in_to_exists_where, join_tables, &save_qep);
+      if (reopt_result == REOPT_ERROR)
+        return TRUE;
+
+      /* Get the cost of the modified IN-EXISTS plan. */
+      inner_read_time_2= inner_join->best_read;
+
+    }
+    else
+    {
+      /* Reoptimization would not produce any better plan. */
+      inner_read_time_2= inner_read_time_1;
+    }
+
+    /*
+      C. Compute execution costs.
+    */
+    /* C.1 Compute the cost of the materialization strategy. */
+    //uint rowlen= get_tmp_table_rec_length(unit->first_select()->item_list);
+    uint rowlen= get_tmp_table_rec_length(ref_pointer_array, 
+                                          select_lex->item_list.elements);
+    /* The cost of writing one row into the temporary table. */
+    double write_cost= get_tmp_table_write_cost(thd, inner_record_count_1,
+                                                rowlen);
+    /* The cost of a lookup into the unique index of the materialized table. */
+    double lookup_cost= get_tmp_table_lookup_cost(thd, inner_record_count_1,
+                                                  rowlen);
+    /*
+      The cost of executing the subquery and storing its result in an indexed
+      temporary table.
+    */
+    double materialization_cost= inner_read_time_1 +
+                                 write_cost * inner_record_count_1;
+
+    materialize_strategy_cost= materialization_cost +
+                               outer_lookup_keys * lookup_cost;
+
+    /* C.2 Compute the cost of the IN=>EXISTS strategy. */
+    in_exists_strategy_cost= outer_lookup_keys * inner_read_time_2;
+
+    /* C.3 Compare the costs and choose the cheaper strategy. */
+    if (materialize_strategy_cost >= in_exists_strategy_cost)
+      in_subs->in_strategy&= ~SUBS_MATERIALIZATION;
+    else
+      in_subs->in_strategy&= ~SUBS_IN_TO_EXISTS;
+
+    DBUG_PRINT("info",
+               ("mat_strategy_cost: %.2f, mat_cost: %.2f, write_cost: %.2f, lookup_cost: %.2f",
+                materialize_strategy_cost, materialization_cost, write_cost, lookup_cost));
+    DBUG_PRINT("info",
+               ("inx_strategy_cost: %.2f, inner_read_time_2: %.2f",
+                in_exists_strategy_cost, inner_read_time_2));
+    DBUG_PRINT("info",("outer_lookup_keys: %.2f", outer_lookup_keys));
+  }
+
+  /*
+    If (1) materialization is a possible strategy based on semantic analysis
+    during the prepare phase, then if
+      (2) it is more expensive than the IN->EXISTS transformation, and
+      (3) it is not possible to create usable indexes for the materialization
+          strategy,
+      fall back to IN->EXISTS.
+    otherwise
+      use materialization.
+  */
+  if (in_subs->in_strategy & SUBS_MATERIALIZATION &&
+      in_subs->setup_mat_engine())
+  {
+    /*
+      If materialization was the cheaper or the only user-selected strategy,
+      but it is not possible to execute it due to limitations in the
+      implementation, fall back to IN-TO-EXISTS.
+    */
+    in_subs->in_strategy&= ~SUBS_MATERIALIZATION;
+    in_subs->in_strategy|= SUBS_IN_TO_EXISTS;
+  }
+
+  if (in_subs->in_strategy & SUBS_MATERIALIZATION)
+  {
+    /* Restore the original query plan used for materialization. */
+    if (reopt_result == REOPT_NEW_PLAN)
+      restore_query_plan(&save_qep);
+
+    in_subs->unit->uncacheable&= ~UNCACHEABLE_DEPENDENT_INJECTED;
+    select_lex->uncacheable&= ~UNCACHEABLE_DEPENDENT_INJECTED;
+
+    /*
+      Reset the "LIMIT 1" set in Item_exists_subselect::fix_length_and_dec.
+      TODO:
+      Currently we set the subquery LIMIT to infinity, and this is correct
+      because we forbid at parse time LIMIT inside IN subqueries (see
+      Item_in_subselect::test_limit). However, once we allow this, here
+      we should set the correct limit if given in the query.
+    */
+    in_subs->unit->global_parameters->select_limit= NULL;
+    in_subs->unit->set_limit(unit->global_parameters);
+    /*
+      Set the limit of this JOIN object as well, because normally it is
+      set at the beginning of JOIN::optimize, which was already done.
+    */
+    select_limit= in_subs->unit->select_limit_cnt;
+  }
+  else if (in_subs->in_strategy & SUBS_IN_TO_EXISTS)
+  {
+    if (reopt_result == REOPT_NONE && in_to_exists_where &&
+        const_tables != table_count)
+    {
+      /*
+        The subquery was not reoptimized either because the user allowed only
+        the IN-EXISTS strategy, or because materialization was not possible
+        based on semantic analysis. Cleanup the original plan and reoptimize.
+      */
+      for (uint i= 0; i < table_count; i++)
+      {
+        join_tab[i].keyuse= NULL;
+        join_tab[i].checked_keys.clear_all();
+      }
+      if ((reopt_result= reoptimize(in_to_exists_where, join_tables, NULL)) ==
+          REOPT_ERROR)
+        return TRUE;
+    }
+
+    if (in_subs->inject_in_to_exists_cond(this))
+      return TRUE;
+    /*
+      It is IN->EXISTS transformation so we should mark subquery as
+      dependent
+    */
+    in_subs->unit->uncacheable|= UNCACHEABLE_DEPENDENT_INJECTED;
+    select_lex->uncacheable|= UNCACHEABLE_DEPENDENT_INJECTED;
+    select_limit= 1;
+  }
+  else
+    DBUG_ASSERT(FALSE);
+
+  return FALSE;
+}
+
+
+/**
+  Choose a query plan for a table-less subquery.
+
+  @notes
+
+  @retval FALSE     success.
+  @retval TRUE      error occurred.
+*/
+
+bool JOIN::choose_tableless_subquery_plan()
+{
+  DBUG_ASSERT(!tables_list || !table_count);
+  if (unit->item)
+  {
+    DBUG_ASSERT(unit->item->type() == Item::SUBSELECT_ITEM);
+    Item_subselect *subs_predicate= unit->item;
+
+    /*
+      If the optimizer determined that this query has an empty result,
+      in most cases the subquery predicate is a known constant value -
+      either FALSE or NULL. The implementation of Item_subselect::reset()
+      determines which one.
+    */
+    if (zero_result_cause)
+    {
+      if (!implicit_grouping)
+      {
+        /*
+          Both group by queries and non-group by queries without aggregate
+          functions produce empty subquery result.
+        */
+        subs_predicate->reset();
+        subs_predicate->make_const();
+        return FALSE;
+      }
+
+      /* TODO:
+         A further optimization is possible when a non-group query with
+         MIN/MAX/COUNT is optimized by opt_sum_query. Then, if there are
+         only MIN/MAX functions over an empty result set, the subquery
+         result is a NULL value/row, thus the value of subs_predicate is
+         NULL.
+      */
+    }
+
+    if (subs_predicate->is_in_predicate())
+    {
+      Item_in_subselect *in_subs;
+      in_subs= (Item_in_subselect*) subs_predicate;
+      in_subs->in_strategy= SUBS_IN_TO_EXISTS;
+      if (in_subs->create_in_to_exists_cond(this) ||
+          in_subs->inject_in_to_exists_cond(this))
+        return TRUE;
+      tmp_having= having;
+    }
+  }
+  return FALSE;
+}
diff --git a/sql/opt_subselect.h b/sql/opt_subselect.h
index 532e43567e8..5a7416fe929 100644
--- a/sql/opt_subselect.h
+++ b/sql/opt_subselect.h
@@ -1,4 +1,6 @@
-/* */
+/*
+  Semi-join subquery optimization code definitions
+*/
 
 #ifdef USE_PRAGMA_INTERFACE
 #pragma interface			/* gcc class implementation */
@@ -199,7 +201,8 @@ public:
         double records= rows2double(s->table->file->stats.records);
 
         /* The cost is entire index scan cost (divided by 2) */
-        double read_time= s->table->file->keyread_time(key, 1, records);
+        double read_time= s->table->file->keyread_time(key, 1,
+                                                       (ha_rows) records);
 
         /*
           Now find out how many different keys we will get (for now we
@@ -282,7 +285,9 @@ void restore_prev_sj_state(const table_map remaining_tables,
                                   const JOIN_TAB *tab, uint idx);
 
 void fix_semijoin_strategies_for_picked_join_order(JOIN *join);
-bool setup_sj_materialization(JOIN_TAB *tab);
+
+bool setup_sj_materialization_part1(JOIN_TAB *sjm_tab);
+bool setup_sj_materialization_part2(JOIN_TAB *sjm_tab);
 
 TABLE *create_duplicate_weedout_tmp_table(THD *thd, uint uniq_tuple_length_arg,
                                           SJ_TMP_TABLE *sjtbl);
@@ -365,4 +370,10 @@ int clear_sj_tmp_tables(JOIN *join);
 int rewrite_to_index_subquery_engine(JOIN *join);
 
 
+void get_delayed_table_estimates(TABLE *table,
+                                 ha_rows *out_rows, 
+                                 double *scan_time,
+                                 double *startup_cost);
+
+enum_nested_loop_state join_tab_execution_startup(JOIN_TAB *tab);
 
diff --git a/sql/opt_sum.cc b/sql/opt_sum.cc
index 381dd1ab21c..3aefa7c99d1 100644
--- a/sql/opt_sum.cc
+++ b/sql/opt_sum.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2000, 2010 Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2010 Oracle and/or its affiliates.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -75,10 +75,12 @@ static int maxmin_in_range(bool max_fl, Field* field, COND *cond);
     #			Multiplication of number of rows in all tables
 */
 
-static ulonglong get_exact_record_count(TABLE_LIST *tables)
+static ulonglong get_exact_record_count(List<TABLE_LIST> &tables)
 {
   ulonglong count= 1;
-  for (TABLE_LIST *tl= tables; tl; tl= tl->next_leaf)
+  TABLE_LIST *tl;
+  List_iterator<TABLE_LIST> ti(tables);
+  while ((tl= ti++))
   {
     ha_rows tmp= tl->table->file->records();
     if ((tmp == HA_POS_ERROR))
@@ -235,9 +237,11 @@ static int get_index_max_value(TABLE *table, TABLE_REF *ref, uint range_fl)
 */
 
 int opt_sum_query(THD *thd,
-                  TABLE_LIST *tables, List<Item> &all_fields, COND *conds)
+                  List<TABLE_LIST> &tables, List<Item> &all_fields, COND *conds)
 {
   List_iterator_fast<Item> it(all_fields);
+  List_iterator<TABLE_LIST> ti(tables);
+  TABLE_LIST *tl;
   int const_result= 1;
   bool recalc_const_item= 0;
   ulonglong count= 1;
@@ -245,8 +249,7 @@ int opt_sum_query(THD *thd,
   table_map removed_tables= 0, outer_tables= 0, used_tables= 0;
   table_map where_tables= 0;
   Item *item;
-  int error;
-
+  int error= 0;
   DBUG_ENTER("opt_sum_query");
 
   if (conds)
@@ -256,7 +259,7 @@ int opt_sum_query(THD *thd,
     Analyze outer join dependencies, and, if possible, compute the number
     of returned rows.
   */
-  for (TABLE_LIST *tl= tables; tl; tl= tl->next_leaf)
+  while ((tl= ti++))
   {
     TABLE_LIST *embedded;
     for (embedded= tl ; embedded; embedded= embedded->embedding)
@@ -297,6 +300,14 @@ int opt_sum_query(THD *thd,
       is_exact_count= FALSE;
       count= 1;                                 // ensure count != 0
     }
+    else if (tl->is_materialized_derived())
+    {
+      /*
+        Can't remove a derived table as its number of rows is just an
+        estimate.
+      */
+      DBUG_RETURN(0);
+    }
     else
     {
       error= tl->table->file->info(HA_STATUS_VARIABLE | HA_STATUS_NO_LOCK);
@@ -488,28 +499,30 @@ bool simple_pred(Item_func *func_item, Item **args, bool *inv_order)
     /* MULT_EQUAL_FUNC */
     {
       Item_equal *item_equal= (Item_equal *) func_item;
-      Item_equal_iterator it(*item_equal);
-      args[0]= it++;
-      if (it++)
-        return 0;
       if (!(args[1]= item_equal->get_const()))
         return 0;
+      Item_equal_fields_iterator it(*item_equal);
+      if (!(item= it++))
+        return 0;
+      args[0]= item->real_item();
+      if (it++)
+        return 0;
     }
     break;
   case 1:
     /* field IS NULL */
-    item= func_item->arguments()[0];
+    item= func_item->arguments()[0]->real_item();
     if (item->type() != Item::FIELD_ITEM)
       return 0;
     args[0]= item;
     break;
   case 2:
     /* 'field op const' or 'const op field' */
-    item= func_item->arguments()[0];
+    item= func_item->arguments()[0]->real_item();
     if (item->type() == Item::FIELD_ITEM)
     {
       args[0]= item;
-      item= func_item->arguments()[1];
+      item= func_item->arguments()[1]->real_item();
       if (!item->const_item())
         return 0;
       args[1]= item;
@@ -517,7 +530,7 @@ bool simple_pred(Item_func *func_item, Item **args, bool *inv_order)
     else if (item->const_item())
     {
       args[1]= item;
-      item= func_item->arguments()[1];
+      item= func_item->arguments()[1]->real_item();
       if (item->type() != Item::FIELD_ITEM)
         return 0;
       args[0]= item;
@@ -528,13 +541,13 @@ bool simple_pred(Item_func *func_item, Item **args, bool *inv_order)
     break;
   case 3:
     /* field BETWEEN const AND const */
-    item= func_item->arguments()[0];
+    item= func_item->arguments()[0]->real_item();
     if (item->type() == Item::FIELD_ITEM)
     {
       args[0]= item;
       for (int i= 1 ; i <= 2; i++)
       {
-        item= func_item->arguments()[i];
+        item= func_item->arguments()[i]->real_item();
         if (!item->const_item())
           return 0;
         args[i]= item;
@@ -615,7 +628,7 @@ static bool matching_cond(bool max_fl, TABLE_REF *ref, KEY *keyinfo,
   if (!(cond->used_tables() & field->table->map))
   {
     /* Condition doesn't restrict the used table */
-    DBUG_RETURN(TRUE);
+    DBUG_RETURN(!cond->const_item());
   }
   if (cond->type() == Item::COND_ITEM)
   {
@@ -751,7 +764,7 @@ static bool matching_cond(bool max_fl, TABLE_REF *ref, KEY *keyinfo,
         since set_null will be ignored, and we will compare uninitialized data.
       */
       if (!part->field->real_maybe_null())
-        DBUG_RETURN(false);
+        DBUG_RETURN(FALSE);
       part->field->set_null();
       *key_ptr= (uchar) 1;
     }
@@ -839,7 +852,7 @@ static bool find_key_for_maxmin(bool max_fl, TABLE_REF *ref,
                                 uint *range_fl, uint *prefix_len)
 {
   if (!(field->flags & PART_KEY_FLAG))
-    return false;                               // Not key field
+    return FALSE;                               // Not key field
 
   DBUG_ENTER("find_key_for_maxmin");
 
@@ -866,7 +879,7 @@ static bool find_key_for_maxmin(bool max_fl, TABLE_REF *ref,
          part++, jdx++, key_part_to_use= (key_part_to_use << 1) | 1)
     {
       if (!(table->file->index_flags(idx, jdx, 0) & HA_READ_ORDER))
-        DBUG_RETURN(false);
+        DBUG_RETURN(FALSE);
 
       /* Check whether the index component is partial */
       Field *part_field= table->field[part->fieldnr-1];
@@ -915,12 +928,12 @@ static bool find_key_for_maxmin(bool max_fl, TABLE_REF *ref,
           */
           if (field->part_of_key.is_set(idx))
             table->enable_keyread();
-          DBUG_RETURN(true);
+          DBUG_RETURN(TRUE);
         }
       }
     }
   }
-  DBUG_RETURN(false);
+  DBUG_RETURN(FALSE);
 }
 
 
diff --git a/sql/opt_table_elimination.cc b/sql/opt_table_elimination.cc
index 33eeec46217..9ab6e0e84d2 100644
--- a/sql/opt_table_elimination.cc
+++ b/sql/opt_table_elimination.cc
@@ -1207,15 +1207,16 @@ void build_eq_mods_for_cond(Dep_analysis_context *ctx,
     if (!(fvl= new List<Dep_value_field>))
       break; /* purecov: inspected */
 
-    Item_equal_iterator it(*item_equal);
-    Item_field *item;
+    Item_equal_fields_iterator it(*item_equal);
+    Item *item;
     Item *bound_item= item_equal->get_const();
     while ((item= it++))
     {
+      Field *equal_field= it.get_curr_field();
       if ((item->used_tables() & ctx->usable_tables))
       {
         Dep_value_field *field_val;
-        if ((field_val= ctx->get_field_value(item->field)))
+        if ((field_val= ctx->get_field_value(equal_field)))
           fvl->push_back(field_val);
       }
       else
@@ -1231,7 +1232,7 @@ void build_eq_mods_for_cond(Dep_analysis_context *ctx,
     if (fvl->elements)
     {
       
-      exchange_sort<Dep_value_field>(fvl, compare_field_values, NULL);
+      bubble_sort<Dep_value_field>(fvl, compare_field_values, NULL);
       add_module_expr(ctx, eq_mod, *and_level, NULL, bound_item, fvl);
     }
     break;
@@ -1782,7 +1783,7 @@ static void mark_as_eliminated(JOIN *join, TABLE_LIST *tbl)
     JOIN_TAB *tab= tbl->table->reginfo.join_tab;
     if (!(join->const_table_map & tab->table->map))
     {
-      DBUG_PRINT("info", ("Eliminated table %s", table->alias));
+      DBUG_PRINT("info", ("Eliminated table %s", table->alias.c_ptr()));
       tab->type= JT_CONST;
       join->eliminated_tables |= table->map;
       join->const_table_map|= table->map;
@@ -1817,7 +1818,7 @@ void Dep_analysis_context::dbug_print_deps()
       fprintf(DBUG_FILE, "  equality%ld: %s -> %s.%s\n", 
               (long)(eq_mod - equality_mods),
               str.c_ptr(),
-              eq_mod->field->table->table->alias,
+              eq_mod->field->table->table->alias.c_ptr(),
               eq_mod->field->field->field_name);
     }
     else
@@ -1835,12 +1836,13 @@ void Dep_analysis_context::dbug_print_deps()
     if ((table_dep= table_deps[i]))
     {
       /* Print table */
-      fprintf(DBUG_FILE, "  table %s\n", table_dep->table->alias);
+      fprintf(DBUG_FILE, "  table %s\n", table_dep->table->alias.c_ptr());
       /* Print fields */
       for (Dep_value_field *field_dep= table_dep->fields; field_dep; 
            field_dep= field_dep->next_table_field)
       {
-        fprintf(DBUG_FILE, "    field %s.%s ->", table_dep->table->alias,
+        fprintf(DBUG_FILE, "    field %s.%s ->",
+                table_dep->table->alias.c_ptr(),
                 field_dep->field->field_name);
         uint ofs= field_dep->bitmap_offset;
         for (uint bit= ofs; bit < ofs + n_equality_mods; bit++)
diff --git a/sql/parse_file.cc b/sql/parse_file.cc
index 9989bc57b24..bffcfea3654 100644
--- a/sql/parse_file.cc
+++ b/sql/parse_file.cc
@@ -386,7 +386,7 @@ sql_parse_prepare(const LEX_STRING *file_name, MEM_ROOT *mem_root,
     DBUG_RETURN(0);
   }
 
-  if (!(parser->buff= (char*) alloc_root(mem_root, stat_info.st_size+1)))
+  if (!(parser->buff= (char*) alloc_root(mem_root, (size_t)(stat_info.st_size+1))))
   {
     DBUG_RETURN(0);
   }
diff --git a/sql/partition_element.h b/sql/partition_element.h
index cefc32ecac4..f6816cfecf0 100644
--- a/sql/partition_element.h
+++ b/sql/partition_element.h
@@ -103,6 +103,7 @@ public:
   char* data_file_name;
   char* index_file_name;
   handlerton *engine_type;
+  LEX_STRING connect_string;
   enum partition_state part_state;
   uint16 nodegroup_id;
   bool has_null_value;
@@ -119,6 +120,8 @@ public:
     nodegroup_id(UNDEF_NODEGROUP), has_null_value(FALSE),
     signed_flag(FALSE), max_value(FALSE)
   {
+    connect_string.str= 0;
+    connect_string.length= 0;
   }
   partition_element(partition_element *part_elem)
   : part_max_rows(part_elem->part_max_rows),
@@ -129,10 +132,13 @@ public:
     data_file_name(part_elem->data_file_name),
     index_file_name(part_elem->index_file_name),
     engine_type(part_elem->engine_type),
+    connect_string(part_elem->connect_string),
     part_state(part_elem->part_state),
     nodegroup_id(part_elem->nodegroup_id),
     has_null_value(FALSE)
   {
+    connect_string.str= 0;
+    connect_string.length= 0;
   }
   ~partition_element() {}
 };
diff --git a/sql/protocol.cc b/sql/protocol.cc
index c5937cee807..606bf4ccb00 100644
--- a/sql/protocol.cc
+++ b/sql/protocol.cc
@@ -1,4 +1,5 @@
-/* Copyright (c) 2000, 2011 Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2011 Oracle and/or its affiliates.
+   Copyright (c) 2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -577,6 +578,56 @@ bool Protocol::send_error(uint sql_errno, const char *err_msg,
 }
 
 
+/**
+   Send a progress report to the client
+
+   What we send is:
+   header (255,255,255,1)
+   stage, max_stage as one-byte integers
+   percentage within the stage as percentage*1000
+   (that is, ratio*100000) as a 3 byte integer
+   proc_info as a string
+*/
+
+const uchar progress_header[2]= {(uchar) 255, (uchar) 255 };
+
+void net_send_progress_packet(THD *thd)
+{
+  uchar buff[200], *pos;
+  const char *proc_info= thd->proc_info ? thd->proc_info : "";
+  uint length= strlen(proc_info);
+  ulonglong progress;
+  DBUG_ENTER("net_send_progress_packet");
+
+  if (unlikely(!thd->net.vio))
+    DBUG_VOID_RETURN;                           // Socket is closed
+
+  pos= buff;
+  /*
+    Store number of strings first. This allows us to later expand the
+    progress indicator if needed.
+  */
+  *pos++= (uchar) 1;                            // Number of strings
+  *pos++= (uchar) thd->progress.stage + 1;
+  /*
+    We have the max() here to avoid problems if max_stage is not set,
+    which may happen during automatic repair of table
+  */
+  *pos++= (uchar) max(thd->progress.max_stage, thd->progress.stage + 1);
+  progress= 0;
+  if (thd->progress.max_counter)
+    progress= 100000ULL * thd->progress.counter / thd->progress.max_counter;
+  int3store(pos, progress);                          // Between 0 & 100000
+  pos+= 3;
+  pos= net_store_data(pos, (const uchar*) proc_info,
+                      min(length, sizeof(buff)-7));
+  net_write_command(&thd->net, (uchar) 255, progress_header,
+                    sizeof(progress_header), (uchar*) buff,
+                    (uint) (pos - buff));
+  DBUG_VOID_RETURN;
+}
+
+  
 /****************************************************************************
   Functions used by the protocol functions (like net_send_ok) to store
   strings and numbers in the header result packet.
@@ -679,7 +730,7 @@ bool Protocol::send_result_set_metadata(List<Item> *list, uint flags)
   Protocol_text prot(thd);
   String *local_packet= prot.storage_packet();
   CHARSET_INFO *thd_charset= thd->variables.character_set_results;
-  DBUG_ENTER("send_result_set_metadata");
+  DBUG_ENTER("Protocol::send_result_set_metadata");
 
   if (flags & SEND_NUM_ROWS)
   {				// Packet with number of elements
@@ -694,6 +745,9 @@ bool Protocol::send_result_set_metadata(List<Item> *list, uint flags)
   uint count= 0;
 #endif
 
+  /* We have to reallocate it here as a stored procedure may have reset it */
+  (void) local_packet->alloc(thd->variables.net_buffer_length);
+
   while ((item=it++))
   {
     char *pos;
@@ -1121,13 +1175,7 @@ bool Protocol_text::store(Field *field)
 }
 
 
-/**
-  @todo
-    Second_part format ("%06") needs to change when 
-    we support 0-6 decimals for time.
-*/
-
-bool Protocol_text::store(MYSQL_TIME *tm)
+bool Protocol_text::store(MYSQL_TIME *tm, int decimals)
 {
 #ifndef DBUG_OFF
   DBUG_ASSERT(field_types == 0 ||
@@ -1135,18 +1183,8 @@ bool Protocol_text::store(MYSQL_TIME *tm)
 	      field_types[field_pos] == MYSQL_TYPE_TIMESTAMP);
   field_pos++;
 #endif
-  char buff[40];
-  uint length;
-  length= sprintf(buff, "%04d-%02d-%02d %02d:%02d:%02d",
-		  (int) tm->year,
-		  (int) tm->month,
-		  (int) tm->day,
-		  (int) tm->hour,
-		  (int) tm->minute,
-		  (int) tm->second);
-  if (tm->second_part)
-    length+= sprintf(buff+length, ".%06d",
-                     (int)tm->second_part);
+  char buff[MAX_DATE_STRING_REP_LENGTH];
+  uint length= my_datetime_to_str(tm, buff, decimals);
   return net_store_data((uchar*) buff, length);
 }
 
@@ -1164,29 +1202,15 @@ bool Protocol_text::store_date(MYSQL_TIME *tm)
 }
 
 
-/**
-  @todo 
-    Second_part format ("%06") needs to change when 
-    we support 0-6 decimals for time.
-*/
-
-bool Protocol_text::store_time(MYSQL_TIME *tm)
+bool Protocol_text::store_time(MYSQL_TIME *tm, int decimals)
 {
 #ifndef DBUG_OFF
   DBUG_ASSERT(field_types == 0 ||
 	      field_types[field_pos] == MYSQL_TYPE_TIME);
   field_pos++;
 #endif
-  char buff[40];
-  uint length;
-  uint day= (tm->year || tm->month) ? 0 : tm->day;
-  length= sprintf(buff, "%s%02ld:%02d:%02d",
-                  tm->neg ? "-" : "",
-                  (long) day*24L+(long) tm->hour,
-                  (int) tm->minute,
-                  (int) tm->second);
-  if (tm->second_part)
-    length+= sprintf(buff+length, ".%06d", (int)tm->second_part);
+  char buff[MAX_DATE_STRING_REP_LENGTH];
+  uint length= my_time_to_str(tm, buff, decimals);
   return net_store_data((uchar*) buff, length);
 }
 
@@ -1389,7 +1413,7 @@ bool Protocol_binary::store(Field *field)
 }
 
 
-bool Protocol_binary::store(MYSQL_TIME *tm)
+bool Protocol_binary::store(MYSQL_TIME *tm, int decimals)
 {
   char buff[12],*pos;
   uint length;
@@ -1402,6 +1426,10 @@ bool Protocol_binary::store(MYSQL_TIME *tm)
   pos[4]= (uchar) tm->hour;
   pos[5]= (uchar) tm->minute;
   pos[6]= (uchar) tm->second;
+  DBUG_ASSERT(decimals == AUTO_SEC_PART_DIGITS ||
+              (decimals >= 0 && decimals <= TIME_SECOND_PART_DIGITS));
+  if (decimals != AUTO_SEC_PART_DIGITS)
+    tm->second_part= sec_part_truncate(tm->second_part, decimals);
   int4store(pos+7, tm->second_part);
   if (tm->second_part)
     length=11;
@@ -1419,11 +1447,11 @@ bool Protocol_binary::store_date(MYSQL_TIME *tm)
 {
   tm->hour= tm->minute= tm->second=0;
   tm->second_part= 0;
-  return Protocol_binary::store(tm);
+  return Protocol_binary::store(tm, 0);
 }
 
 
-bool Protocol_binary::store_time(MYSQL_TIME *tm)
+bool Protocol_binary::store_time(MYSQL_TIME *tm, int decimals)
 {
   char buff[13], *pos;
   uint length;
@@ -1432,7 +1460,6 @@ bool Protocol_binary::store_time(MYSQL_TIME *tm)
   pos[0]= tm->neg ? 1 : 0;
   if (tm->hour >= 24)
   {
-    /* Fix if we come from Item::send */
     uint days= tm->hour/24;
     tm->hour-= days*24;
     tm->day+= days;
@@ -1441,6 +1468,10 @@ bool Protocol_binary::store_time(MYSQL_TIME *tm)
   pos[5]= (uchar) tm->hour;
   pos[6]= (uchar) tm->minute;
   pos[7]= (uchar) tm->second;
+  DBUG_ASSERT(decimals == AUTO_SEC_PART_DIGITS ||
+              (decimals >= 0 && decimals <= TIME_SECOND_PART_DIGITS));
+  if (decimals != AUTO_SEC_PART_DIGITS)
+    tm->second_part= sec_part_truncate(tm->second_part, decimals);
   int4store(pos+8, tm->second_part);
   if (tm->second_part)
     length=12;
diff --git a/sql/protocol.h b/sql/protocol.h
index 1c86c6d6c49..7d99901ab44 100644
--- a/sql/protocol.h
+++ b/sql/protocol.h
@@ -109,9 +109,9 @@ public:
   		     CHARSET_INFO *fromcs, CHARSET_INFO *tocs)=0;
   virtual bool store(float from, uint32 decimals, String *buffer)=0;
   virtual bool store(double from, uint32 decimals, String *buffer)=0;
-  virtual bool store(MYSQL_TIME *time)=0;
+  virtual bool store(MYSQL_TIME *time, int decimals)=0;
   virtual bool store_date(MYSQL_TIME *time)=0;
-  virtual bool store_time(MYSQL_TIME *time)=0;
+  virtual bool store_time(MYSQL_TIME *time, int decimals)=0;
   virtual bool store(Field *field)=0;
 
   virtual bool send_out_parameters(List<Item_param> *sp_params)=0;
@@ -152,9 +152,9 @@ public:
   virtual bool store(const char *from, size_t length, CHARSET_INFO *cs);
   virtual bool store(const char *from, size_t length,
   		     CHARSET_INFO *fromcs, CHARSET_INFO *tocs);
-  virtual bool store(MYSQL_TIME *time);
+  virtual bool store(MYSQL_TIME *time, int decimals);
   virtual bool store_date(MYSQL_TIME *time);
-  virtual bool store_time(MYSQL_TIME *time);
+  virtual bool store_time(MYSQL_TIME *time, int decimals);
   virtual bool store(float nr, uint32 decimals, String *buffer);
   virtual bool store(double from, uint32 decimals, String *buffer);
   virtual bool store(Field *field);
@@ -189,9 +189,9 @@ public:
   virtual bool store(const char *from, size_t length, CHARSET_INFO *cs);
   virtual bool store(const char *from, size_t length,
   		     CHARSET_INFO *fromcs, CHARSET_INFO *tocs);
-  virtual bool store(MYSQL_TIME *time);
+  virtual bool store(MYSQL_TIME *time, int decimals);
   virtual bool store_date(MYSQL_TIME *time);
-  virtual bool store_time(MYSQL_TIME *time);
+  virtual bool store_time(MYSQL_TIME *time, int decimals);
   virtual bool store(float nr, uint32 decimals, String *buffer);
   virtual bool store(double from, uint32 decimals, String *buffer);
   virtual bool store(Field *field);
@@ -204,6 +204,7 @@ public:
 void send_warning(THD *thd, uint sql_errno, const char *err=0);
 bool net_send_error(THD *thd, uint sql_errno, const char *err,
                     const char* sqlstate);
+void net_send_progress_packet(THD *thd);
 uchar *net_store_data(uchar *to,const uchar *from, size_t length);
 uchar *net_store_data(uchar *to,int32 from);
 uchar *net_store_data(uchar *to,longlong from);
diff --git a/sql/records.cc b/sql/records.cc
index 5709eaf1df2..d688ee675e7 100644
--- a/sql/records.cc
+++ b/sql/records.cc
@@ -207,15 +207,6 @@ bool init_read_record(READ_RECORD *info,THD *thd, TABLE *table,
 
   if (select && my_b_inited(&select->file))
     tempfile= &select->file;
-  else if (select && select->quick && select->quick->clustered_pk_range())
-  {
-    /*
-      In case of QUICK_INDEX_MERGE_SELECT with clustered pk range we have to
-      use its own access method(i.e QUICK_INDEX_MERGE_SELECT::get_next()) as
-      sort file does not contain rowids which satisfy clustered pk range.
-    */
-    tempfile= 0;
-  }
   else
     tempfile= table->sort.io_cache;
   if (tempfile && my_b_inited(tempfile) &&
@@ -312,7 +303,8 @@ void end_read_record(READ_RECORD *info)
   if (info->table)
   {
     filesort_free_buffers(info->table,0);
-    (void) info->file->extra(HA_EXTRA_NO_CACHE);
+    if (info->table->created)
+      (void) info->file->extra(HA_EXTRA_NO_CACHE);
     if (info->read_record != rr_quick) // otherwise quick_range does it
       (void) info->file->ha_index_or_rnd_end();
     info->table=0;
diff --git a/sql/repl_failsafe.cc b/sql/repl_failsafe.cc
index bc710616a4c..9cf96e84928 100644
--- a/sql/repl_failsafe.cc
+++ b/sql/repl_failsafe.cc
@@ -217,7 +217,6 @@ void end_slave_list()
   }
 }
 
-
 /**
   Execute a SHOW SLAVE HOSTS statement.
 
diff --git a/sql/rpl_mi.cc b/sql/rpl_mi.cc
index 776bb95c4e5..705d3a8e450 100644
--- a/sql/rpl_mi.cc
+++ b/sql/rpl_mi.cc
@@ -26,8 +26,9 @@
 
 Master_info::Master_info(bool is_slave_recovery)
   :Slave_reporting_capability("I/O"),
-   ssl(0), ssl_verify_server_cert(0), fd(-1), io_thd(0), 
+   ssl(0), ssl_verify_server_cert(1), fd(-1), io_thd(0), 
    rli(is_slave_recovery), port(MYSQL_PORT),
+   checksum_alg_before_fd(BINLOG_CHECKSUM_ALG_UNDEF),
    connect_retry(DEFAULT_CONNECT_RETRY), inited(0), abort_slave(0),
    slave_running(0), slave_run_id(0), sync_counter(0),
    heartbeat_period(0), received_heartbeats(0), master_id(0)
diff --git a/sql/rpl_mi.h b/sql/rpl_mi.h
index 22ac6527c18..d9ba21eccc7 100644
--- a/sql/rpl_mi.h
+++ b/sql/rpl_mi.h
@@ -84,6 +84,12 @@ class Master_info : public Slave_reporting_capability
   uint32 file_id;				/* for 3.23 load data infile */
   Relay_log_info rli;
   uint port;
+  /*
+    to hold checksum alg in use until IO thread has received FD.
+    Initialized to undefined, then set to the value of the master's
+    @@global.binlog_checksum and deactivated once FD has been received.
+  */
+  uint8 checksum_alg_before_fd;
   uint connect_retry;
 #ifndef DBUG_OFF
   int events_till_disconnect;
diff --git a/sql/rpl_record.cc b/sql/rpl_record.cc
index 5a7984430ea..cc3d16bdf85 100644
--- a/sql/rpl_record.cc
+++ b/sql/rpl_record.cc
@@ -1,4 +1,4 @@
-/* Copyright 2007 MySQL AB. All rights reserved.
+/* Copyright 2007 MySQL AB.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -102,7 +102,7 @@ pack_row(TABLE *table, MY_BITMAP const* cols,
         const uchar *old_pack_ptr= pack_ptr;
 #endif
         pack_ptr= field->pack(pack_ptr, field->ptr + offset,
-                              field->max_data_length(), TRUE);
+                              field->max_data_length());
         DBUG_PRINT("debug", ("field: %s; real_type: %d, pack_ptr: 0x%lx;"
                              " pack_ptr':0x%lx; bytes: %d",
                              field->field_name, field->real_type(),
@@ -303,7 +303,7 @@ unpack_row(Relay_log_info const *rli,
 #ifndef DBUG_OFF
         uchar const *const old_pack_ptr= pack_ptr;
 #endif
-        pack_ptr= f->unpack(f->ptr, pack_ptr, metadata, TRUE);
+        pack_ptr= f->unpack(f->ptr, pack_ptr, metadata);
 	DBUG_PRINT("debug", ("field: %s; metadata: 0x%x;"
                              " pack_ptr: 0x%lx; pack_ptr': 0x%lx; bytes: %d",
                              f->field_name, metadata,
diff --git a/sql/rpl_record.h b/sql/rpl_record.h
index 104d77738f1..b7c99c3955c 100644
--- a/sql/rpl_record.h
+++ b/sql/rpl_record.h
@@ -1,4 +1,4 @@
-/* Copyright 2007 MySQL AB. All rights reserved.
+/* Copyright 2007 MySQL AB.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
diff --git a/sql/rpl_record_old.h b/sql/rpl_record_old.h
index 71c0ccc17b9..d0a9bbcb263 100644
--- a/sql/rpl_record_old.h
+++ b/sql/rpl_record_old.h
@@ -1,4 +1,4 @@
-/* Copyright 2007 MySQL AB. All rights reserved.
+/* Copyright 2007 MySQL AB.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
diff --git a/sql/rpl_rli.cc b/sql/rpl_rli.cc
index 61cf39a3e86..63ce14c3f89 100644
--- a/sql/rpl_rli.cc
+++ b/sql/rpl_rli.cc
@@ -52,7 +52,8 @@ Relay_log_info::Relay_log_info(bool is_slave_recovery)
    inited(0), abort_slave(0), slave_running(0), until_condition(UNTIL_NONE),
    until_log_pos(0), retried_trans(0),
    tables_to_lock(0), tables_to_lock_count(0),
-   last_event_start_time(0), m_flags(0)
+   last_event_start_time(0), m_flags(0),
+   m_annotate_event(0)
 {
   DBUG_ENTER("Relay_log_info::Relay_log_info");
 
@@ -95,6 +96,7 @@ Relay_log_info::~Relay_log_info()
   mysql_cond_destroy(&stop_cond);
   mysql_cond_destroy(&log_space_cond);
   relay_log.cleanup();
+  free_annotate_event();
   DBUG_VOID_RETURN;
 }
 
@@ -193,9 +195,13 @@ a file name for --relay-log-index option", opt_relaylog_index_name);
                         " so replication "
                         "may break when this MySQL server acts as a "
                         "slave and has his hostname changed!! Please "
-                        "use '--relay-log=%s' to avoid this problem.", ln);
+                        "use '--log-basename=#' or '--relay-log=%s' to avoid "
+                        "this problem.", ln);
       name_warning_sent= 1;
     }
+
+    rli->relay_log.is_relay_log= TRUE;
+
     /*
       note, that if open() fails, we'll still have index file open
       but a destructor will take care of that
@@ -209,7 +215,6 @@ a file name for --relay-log-index option", opt_relaylog_index_name);
       sql_print_error("Failed in open_log() called from init_relay_log_info()");
       DBUG_RETURN(1);
     }
-    rli->relay_log.is_relay_log= TRUE;
   }
 
   /* if file does not exist */
@@ -564,8 +569,9 @@ int init_relay_log_pos(Relay_log_info* rli,const char* log,
         Because of we have rli->data_lock and log_lock, we can safely read an
         event
       */
-      if (!(ev=Log_event::read_log_event(rli->cur_log,0,
-                                         rli->relay_log.description_event_for_exec)))
+      if (!(ev= Log_event::read_log_event(rli->cur_log, 0,
+                                          rli->relay_log.description_event_for_exec,
+                                          opt_slave_sql_verify_checksum)))
       {
         DBUG_PRINT("info",("could not read event, rli->cur_log->error=%d",
                            rli->cur_log->error));
@@ -1203,11 +1209,8 @@ void Relay_log_info::stmt_done(my_off_t event_master_log_pos,
       is that value may take some time to display in
       Seconds_Behind_Master - not critical).
     */
-#ifndef DBUG_OFF
-    if (!(event_creation_time == 0 && debug_not_change_ts_if_art_event > 0))
-#else
-      if (event_creation_time != 0)
-#endif
+    if (!(event_creation_time == 0 &&
+          IF_DBUG(debug_not_change_ts_if_art_event > 0, 1)))
         last_master_timestamp= event_creation_time;
   }
 }
diff --git a/sql/rpl_rli.h b/sql/rpl_rli.h
index 681832a4c36..9cea56dc907 100644
--- a/sql/rpl_rli.h
+++ b/sql/rpl_rli.h
@@ -21,6 +21,7 @@
 #include "rpl_utility.h"
 #include "log.h"                         /* LOG_INFO, MYSQL_BIN_LOG */
 #include "sql_class.h"                   /* THD */
+#include "log_event.h"
 
 struct RPL_TABLE_LIST;
 class Master_info;
@@ -452,8 +453,46 @@ public:
       (m_flags & (1UL << IN_STMT));
   }
 
+  /**
+    Delete any previously saved Annotate_rows event, save the pointer to the
+    new one and switch on binlog_annotate_rows_events for this sql thread.
+    To be called when the sql thread receives an Annotate_rows event.
+  */
+  inline void set_annotate_event(Annotate_rows_log_event *event)
+  {
+    free_annotate_event();
+    m_annotate_event= event;
+    sql_thd->variables.binlog_annotate_rows_events= 1;
+  }
+
+  /**
+    Return the pointer to the saved Annotate_rows event, or NULL if no event
+    is saved. The event is not released by this call.
+  */
+  inline Annotate_rows_log_event* get_annotate_event()
+  {
+    return m_annotate_event;
+  }
+
+  /**
+    Delete the saved Annotate_rows event (if any) and switch off
+    binlog_annotate_rows_events for this sql thread; no-op if none is saved.
+    To be called when the sql thread has applied the last (i.e. with
+    STMT_END_F flag) rbr event.
+  */
+  inline void free_annotate_event()
+  {
+    if (m_annotate_event)
+    {
+      sql_thd->variables.binlog_annotate_rows_events= 0;
+      delete m_annotate_event;
+      m_annotate_event= 0;
+    }
+  }
+
 private:
   uint32 m_flags;
+  Annotate_rows_log_event *m_annotate_event;
 };
 
 
diff --git a/sql/rpl_utility.cc b/sql/rpl_utility.cc
index 6f66905eb5d..23a62b9a532 100644
--- a/sql/rpl_utility.cc
+++ b/sql/rpl_utility.cc
@@ -303,7 +303,7 @@ uint32 table_def::calc_field_size(uint col, uchar *master_data) const
       always read the length in little-endian order.
     */
     Field_blob fb(m_field_metadata[col]);
-    length= fb.get_packed_size(master_data, TRUE);
+    length= fb.get_packed_size(master_data);
 #else
     /*
       Compute the length of the data. We cannot use get_length() here
@@ -1056,3 +1056,58 @@ table_def::~table_def()
 #endif
 }
 
+/** Verify the checksum stored in the trailing bytes of a serialized event.
+   @param   event_buf   points to the buffer containing the serialized event
+   @param   event_len   length of the event, including the trailing checksum
+   @param   alg         checksum algorithm; OFF and UNDEF skip the comparison
+   @return  TRUE        if the test fails
+            FALSE       on success
+*/
+bool event_checksum_test(uchar *event_buf, ulong event_len, uint8 alg)
+{
+  bool res= FALSE;
+  uint16 flags= 0; // original flags read from an FD event, stays 0 otherwise
+
+  if (alg != BINLOG_CHECKSUM_ALG_OFF && alg != BINLOG_CHECKSUM_ALG_UNDEF)
+  {
+    ha_checksum incoming;
+    ha_checksum computed;
+
+    if (event_buf[EVENT_TYPE_OFFSET] == FORMAT_DESCRIPTION_EVENT)
+    {
+#ifndef DBUG_OFF
+      int8 fd_alg= event_buf[event_len - BINLOG_CHECKSUM_LEN - 
+                             BINLOG_CHECKSUM_ALG_DESC_LEN];
+#endif
+      /*
+        FD event is checksummed and therefore verified w/o the binlog-in-use flag
+      */
+      flags= uint2korr(event_buf + FLAGS_OFFSET);
+      if (flags & LOG_EVENT_BINLOG_IN_USE_F)
+        event_buf[FLAGS_OFFSET] &= ~LOG_EVENT_BINLOG_IN_USE_F;
+      /* 
+         The only algorithm currently is CRC32. Zero indicates 
+         the binlog file is checksum-free *except* the FD-event.
+      */
+      DBUG_ASSERT(fd_alg == BINLOG_CHECKSUM_ALG_CRC32 || fd_alg == 0);
+      DBUG_ASSERT(alg == BINLOG_CHECKSUM_ALG_CRC32);
+      /*
+        Compile time guard to watch over the max number of alg
+      */
+      compile_time_assert(BINLOG_CHECKSUM_ALG_ENUM_END <= 0x80);
+    }
+    incoming= uint4korr(event_buf + event_len - BINLOG_CHECKSUM_LEN);
+    computed= my_checksum(0L, NULL, 0);
+    /* checksum the event content but the checksum part itself */
+    computed= my_checksum(computed, (const uchar*) event_buf, 
+                          event_len - BINLOG_CHECKSUM_LEN);
+    if (flags != 0)
+    {
+      /* put back the original value of the flags byte of the FD event */
+      DBUG_ASSERT(event_buf[EVENT_TYPE_OFFSET] == FORMAT_DESCRIPTION_EVENT);
+      event_buf[FLAGS_OFFSET]= (uchar) flags;
+    }
+    res= !(computed == incoming);
+  }
+  return DBUG_EVALUATE_IF("simulate_checksum_test_failure", TRUE, res);
+}
diff --git a/sql/set_var.cc b/sql/set_var.cc
index 45b630a3ba9..a229002282b 100644
--- a/sql/set_var.cc
+++ b/sql/set_var.cc
@@ -1,4 +1,5 @@
-/* Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2002, 2010, Oracle and/or its affiliates.
+   2009-2010 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
diff --git a/sql/set_var.h b/sql/set_var.h
index af96fabe3dc..d34b1db8b71 100644
--- a/sql/set_var.h
+++ b/sql/set_var.h
@@ -1,6 +1,6 @@
 #ifndef SET_VAR_INCLUDED
 #define SET_VAR_INCLUDED
-/* Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2002, 2010, Oracle and/or its affiliates.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
diff --git a/sql/share/errmsg-utf8.txt b/sql/share/errmsg-utf8.txt
index c86451b89e9..86700136d6c 100644
--- a/sql/share/errmsg-utf8.txt
+++ b/sql/share/errmsg-utf8.txt
@@ -5313,7 +5313,7 @@ ER_NO_DEFAULT_FOR_FIELD
 ER_DIVISION_BY_ZERO 22012 
         eng "Division by 0"
         ger "Division durch 0"
-ER_TRUNCATED_WRONG_VALUE_FOR_FIELD  
+ER_TRUNCATED_WRONG_VALUE_FOR_FIELD  22007
         eng "Incorrect %-.32s value: '%-.128s' for column '%.192s' at row %lu"
         ger "Falscher %-.32s-Wert: '%-.128s' für Feld '%.192s' in Zeile %lu"
 ER_ILLEGAL_VALUE_FOR_TYPE 22007 
@@ -5505,11 +5505,11 @@ ER_SP_NO_RECURSION
         eng "Recursive stored functions and triggers are not allowed."
         ger "Rekursive gespeicherte Routinen und Triggers sind nicht erlaubt"
 ER_TOO_BIG_SCALE 42000 S1009
-        eng "Too big scale %d specified for column '%-.192s'. Maximum is %lu."
-        ger "Zu großer Skalierungsfaktor %d für Feld '%-.192s' angegeben. Maximum ist %lu"
+        eng "Too big scale %u specified for '%-.192s'. Maximum is %lu."
+        ger "Zu großer Skalierungsfaktor %u für '%-.192s' angegeben. Maximum ist %lu"
 ER_TOO_BIG_PRECISION 42000 S1009
-        eng "Too big precision %d specified for column '%-.192s'. Maximum is %lu."
-        ger "Zu große Genauigkeit %d für Feld '%-.192s' angegeben. Maximum ist %lu"
+        eng "Too big precision %u specified for '%-.192s'. Maximum is %lu."
+        ger "Zu große Genauigkeit %u für '%-.192s' angegeben. Maximum ist %lu"
 ER_M_BIGGER_THAN_D 42000 S1009
         eng "For float(M,D), double(M,D) or decimal(M,D), M must be >= D (column '%-.192s')."
         ger "Für FLOAT(M,D), DOUBLE(M,D) oder DECIMAL(M,D) muss M >= D sein (Feld '%-.192s')"
@@ -5547,8 +5547,8 @@ ER_WARN_CANT_DROP_DEFAULT_KEYCACHE
         eng "Cannot drop default keycache"
         ger "Der vorgabemäßige Schlüssel-Cache kann nicht gelöscht werden"
 ER_TOO_BIG_DISPLAYWIDTH 42000 S1009
-        eng "Display width out of range for column '%-.192s' (max = %lu)"
-        ger "Anzeigebreite außerhalb des zulässigen Bereichs für Spalte '%-.192s' (Maximum: %lu)"
+        eng "Display width out of range for '%-.192s' (max = %lu)"
+        ger "Anzeigebreite außerhalb des zulässigen Bereichs für '%-.192s' (Maximum: %lu)"
 ER_XAER_DUPID XAE08
         eng "XAER_DUPID: The XID already exists"
         ger "XAER_DUPID: Die XID existiert bereits"
@@ -6009,7 +6009,7 @@ ER_ONLY_INTEGERS_ALLOWED
         eng "Only integers allowed as number here"
         ger "An dieser Stelle sind nur Ganzzahlen zulässig"
 ER_UNSUPORTED_LOG_ENGINE
-        eng "This storage engine cannot be used for log tables""
+        eng "This storage engine cannot be used for log tables"
         ger "Diese Speicher-Engine kann für Logtabellen nicht verwendet werden"
 ER_BAD_LOG_STATEMENT
         eng "You cannot '%s' a log table if logging is enabled"
@@ -6415,6 +6415,19 @@ ER_ERROR_IN_TRIGGER_BODY
 ER_ERROR_IN_UNKNOWN_TRIGGER_BODY
   eng "Unknown trigger has an error in its body: '%-.256s'"
 
+#
+# MariaDB error messages section starts here
+#
+
+# The following is here to allow us to detect if there were missing
+# error messages in the errmsg.sys file
+
+ER_LAST_MYSQL_ERROR_MESSAGE
+   eng ""
+
+# MariaDB error numbers start from 1900
+start-error-number 1900
+
 ER_VCOL_BASED_ON_VCOL
   eng "A computed column cannot be based on a computed column"
 ER_VIRTUAL_COLUMN_FUNCTION_IS_NOT_ALLOWED
@@ -6430,13 +6443,42 @@ ER_WRONG_FK_OPTION_FOR_VIRTUAL_COLUMN
 ER_WARNING_NON_DEFAULT_VALUE_FOR_VIRTUAL_COLUMN
   eng "The value specified for computed column '%s' in table '%s' ignored"
 ER_UNSUPPORTED_ACTION_ON_VIRTUAL_COLUMN
-  eng "'%s' is not yet supported for computed columns"
+  eng "This is not yet supported for computed columns"
 ER_CONST_EXPR_IN_VCOL
   eng "Constant expression in computed column function is not allowed"
 ER_ROW_EXPR_FOR_VCOL
   eng "Expression for computed column cannot return a row"
-
+ER_UNSUPPORTED_ENGINE_FOR_VIRTUAL_COLUMNS
+        eng "%s storage engine does not support computed columns"
 ER_UNKNOWN_OPTION
   eng "Unknown option '%-.64s'"
 ER_BAD_OPTION_VALUE
   eng "Incorrect value '%-.64s' for option '%-.64s'"
+ER_NETWORK_READ_EVENT_CHECKSUM_FAILURE
+  eng "Replication event checksum verification failed while reading from network."
+ER_BINLOG_READ_EVENT_CHECKSUM_FAILURE
+  eng "Replication event checksum verification failed while reading from a log file."
+ER_CANT_DO_ONLINE
+        eng "Can't execute the given '%s' command as online"
+ER_DATA_OVERFLOW 22003
+        eng "Got overflow when converting '%-.128s' to %-.32s. Value truncated."
+ER_DATA_TRUNCATED 22003
+        eng "Truncated value '%-.128s' when converting to %-.32s"
+ER_BAD_DATA 22007
+        eng "Encountered illegal value '%-.128s' when converting to %-.32s"
+ER_DYN_COL_WRONG_FORMAT
+        eng "Encountered illegal format of dynamic column string"
+ER_DYN_COL_IMPLEMENTATION_LIMIT
+        eng "Dynamic column implementation limit reached"
+ER_DYN_COL_DATA 22007
+        eng "Illegal value used as argument of dynamic column function"
+ER_DYN_COL_WRONG_CHARSET
+        eng "Dynamic column contains unknown character set"
+ER_ILLEGAL_SUBQUERY_OPTIMIZER_SWITCHES
+        eng "At least one of the 'in_to_exists' or 'materialization' optimizer_switch flags must be 'on'."
+ER_QUERY_CACHE_IS_DISABLED
+        eng "Query cache is disabled (resize or similar command in progress); repeat this command later"
+ER_QUERY_CACHE_IS_GLOBALY_DISABLED
+        eng "Query cache is globally disabled and you can't enable it only for this session"
+ER_VIEW_ORDERBY_IGNORED
+        eng "View '%-.192s'.'%-.192s' ORDER BY clause ignored because there is other ORDER BY clause already."
diff --git a/sql/slave.cc b/sql/slave.cc
index 1d0781aa7b2..4d8759b1c29 100644
--- a/sql/slave.cc
+++ b/sql/slave.cc
@@ -1,4 +1,5 @@
-/* Copyright (C) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (C) 2000, 2011, Oracle and/or its affiliates.
+   Copyright (c) 2009-2011, Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -1289,6 +1290,48 @@ static int get_master_version_and_clock(MYSQL* mysql, Master_info* mi)
   }
 
   /*
+    FD_q's (A) is set initially from RL's (A): FD_q.(A) := RL.(A).
+    It's necessary to adjust FD_q.(A) at this point because in the following
+    course FD_q is going to be dumped to RL.
+    Generally FD_q is derived from a received FD_m (roughly FD_q := FD_m) 
+    in queue_event and the master's (A) is installed.
+    At one step with the assignment the Relay-Log's checksum alg is set to 
+    a new value: RL.(A) := FD_q.(A). If the slave service is stopped
+    the last time assigned RL.(A) will be passed over to the restarting
+    service (to the current execution point).
+    RL.A is a "codec" to verify checksum in queue_event() almost all the time,
+    except for the first fake Rotate event.
+    Starting from this point the IO thread executes the following checksum
+    warm-up sequence of actions:
+
+    FD_q.A := RL.A,
+    A_m^0 := master.@@global.binlog_checksum,
+    {queue_event(R_f): verifies(R_f, A_m^0)},
+    {queue_event(FD_m): verifies(FD_m, FD_m.A), dump(FD_q), rotate(RL),
+                        FD_q := FD_m, RL.A := FD_q.A)}
+
+    See legends definition on MYSQL_BIN_LOG::relay_log_checksum_alg
+    docs lines (binlog.h).
+    In above A_m^0 - the value of master's
+    @@binlog_checksum determined in the upcoming handshake (stored in
+    mi->checksum_alg_before_fd).
+
+
+    After the warm-up sequence IO gets to "normal" checksum verification mode
+    to use RL.A in 
+    
+    {queue_event(E_m): verifies(E_m, RL.A)}
+
+    until it has received a new FD_m.
+  */
+  mi->rli.relay_log.description_event_for_queue->checksum_alg=
+    mi->rli.relay_log.relay_log_checksum_alg;
+
+  DBUG_ASSERT(mi->rli.relay_log.description_event_for_queue->checksum_alg !=
+              BINLOG_CHECKSUM_ALG_UNDEF);
+  DBUG_ASSERT(mi->rli.relay_log.relay_log_checksum_alg !=
+              BINLOG_CHECKSUM_ALG_UNDEF); 
+  /*
     Compare the master and slave's clock. Do not die if master's clock is
     unavailable (very old master not supporting UNIX_TIMESTAMP()?).
   */
@@ -1566,6 +1609,103 @@ when it try to get the value of TIME_ZONE global variable from master.";
     mysql_free_result(mysql_store_result(mysql));
   }
  
+  /*
+    Querying if master is capable to checksum and notifying it about own
+    CRC-awareness. The master's side instant value of @@global.binlog_checksum 
+    is stored in the dump thread's uservar area as well as cached locally
+    to become known in consensus by master and slave.
+  */
+  DBUG_EXECUTE_IF("simulate_slave_unaware_checksum",
+                  mi->checksum_alg_before_fd= BINLOG_CHECKSUM_ALG_OFF;
+                  goto past_checksum;);
+  {
+    int rc;
+    const char query[]= "SET @master_binlog_checksum= @@global.binlog_checksum";
+    master_res= NULL;
+    mi->checksum_alg_before_fd= BINLOG_CHECKSUM_ALG_UNDEF; //initially undefined
+    /*
+      @c checksum_alg_before_fd is queried from master in this block.
+      If master is old and checksum-unaware the value stays undefined.
+      Once the first FD is received its alg descriptor will replace
+      the queried one.
+    */
+    rc= mysql_real_query(mysql, query, strlen(query));
+    if (rc != 0)
+    {
+      if (check_io_slave_killed(mi->io_thd, mi, NULL))
+        goto slave_killed_err;
+
+      if (mysql_errno(mysql) == ER_UNKNOWN_SYSTEM_VARIABLE)
+      {
+        // this is tolerable as OM -> NS is supported
+        mi->report(WARNING_LEVEL, mysql_errno(mysql),
+                   "Notifying master by %s failed with "
+                   "error: %s", query, mysql_error(mysql));
+      }
+      else
+      {
+        if (is_network_error(mysql_errno(mysql)))
+        {
+          mi->report(WARNING_LEVEL, mysql_errno(mysql),
+                     "Notifying master by %s failed with "
+                     "error: %s", query, mysql_error(mysql));
+          mysql_free_result(mysql_store_result(mysql));
+          goto network_err;
+        }
+        else
+        {
+          errmsg= "The slave I/O thread stops because a fatal error is encountered "
+            "when it tried to SET @master_binlog_checksum on master.";
+          err_code= ER_SLAVE_FATAL_ERROR;
+          sprintf(err_buff, "%s Error: %s", errmsg, mysql_error(mysql));
+          mysql_free_result(mysql_store_result(mysql));
+          goto err;
+        }
+      }
+    }
+    else
+    {
+      mysql_free_result(mysql_store_result(mysql));
+      if (!mysql_real_query(mysql,
+                            STRING_WITH_LEN("SELECT @master_binlog_checksum")) &&
+          (master_res= mysql_store_result(mysql)) &&
+          (master_row= mysql_fetch_row(master_res)) &&
+          (master_row[0] != NULL))
+      {
+        mi->checksum_alg_before_fd= (uint8)
+          find_type(master_row[0], &binlog_checksum_typelib, 1) - 1;
+        // valid outcome is either of
+        DBUG_ASSERT(mi->checksum_alg_before_fd == BINLOG_CHECKSUM_ALG_OFF ||
+                    mi->checksum_alg_before_fd == BINLOG_CHECKSUM_ALG_CRC32);
+      }
+      else if (check_io_slave_killed(mi->io_thd, mi, NULL))
+        goto slave_killed_err;
+      else if (is_network_error(mysql_errno(mysql)))
+      {
+        mi->report(WARNING_LEVEL, mysql_errno(mysql),
+                   "Get master BINLOG_CHECKSUM failed with error: %s", mysql_error(mysql));
+        goto network_err;
+      }
+      else
+      {
+        errmsg= "The slave I/O thread stops because a fatal error is encountered "
+          "when it tried to SELECT @master_binlog_checksum.";
+        err_code= ER_SLAVE_FATAL_ERROR;
+        sprintf(err_buff, "%s Error: %s", errmsg, mysql_error(mysql));
+        mysql_free_result(mysql_store_result(mysql));
+        goto err;
+      }
+    }
+    if (master_res)
+    {
+      mysql_free_result(master_res);
+      master_res= NULL;
+    }
+  }
+
+#ifndef DBUG_OFF
+past_checksum:
+#endif
 
 err:
   if (errmsg)
@@ -1590,6 +1730,7 @@ slave_killed_err:
   DBUG_RETURN(2);
 }
 
+
 static bool wait_for_relay_log_space(Relay_log_info* rli)
 {
   bool slave_killed=0;
@@ -2128,6 +2269,9 @@ static int request_dump(THD *thd, MYSQL* mysql, Master_info* mi,
   
   *suppress_warnings= FALSE;
 
+  if (opt_log_slave_updates && opt_replicate_annotate_rows_events)
+    binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
+
   if (RUN_HOOK(binlog_relay_io,
                before_request_transmit,
                (thd, mi, binlog_flags)))
@@ -2349,7 +2493,11 @@ int apply_event_and_update_pos(Log_event* ev, THD* thd, Relay_log_info* rli)
   thd->set_time();                            // time the query
   thd->lex->current_select= 0;
   if (!ev->when)
-    ev->when= my_time(0);
+  {
+    my_hrtime_t hrtime= my_hrtime();
+    ev->when= hrtime_to_my_time(hrtime);
+    ev->when_sec_part= hrtime_sec_part(hrtime);
+  }
   ev->thd = thd; // because up to this point, ev->thd == 0
 
   int reason= ev->shall_skip(rli);
@@ -2519,17 +2667,41 @@ static int exec_relay_log_event(THD* thd, Relay_log_info* rli)
 
     exec_res= apply_event_and_update_pos(ev, thd, rli);
 
-    /*
-      Format_description_log_event should not be deleted because it will be
-      used to read info about the relay log's format; it will be deleted when
-      the SQL thread does not need it, i.e. when this thread terminates.
-    */
-    if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
-    {
-      DBUG_PRINT("info", ("Deleting the event after it has been executed"));
-      delete ev;
+    switch (ev->get_type_code()) {
+      case FORMAT_DESCRIPTION_EVENT:
+        /*
+          Format_description_log_event should not be deleted because it
+          will be used to read info about the relay log's format;
+          it will be deleted when the SQL thread does not need it,
+          i.e. when this thread terminates.
+        */
+        break;
+      case ANNOTATE_ROWS_EVENT:
+        /*
+          Annotate_rows event should not be deleted because after it has
+          been applied, thd->query points to the string inside this event.
+          The thd->query will be used to generate new Annotate_rows event
+          during applying the subsequent Rows events.
+        */
+        rli->set_annotate_event((Annotate_rows_log_event*) ev);
+        break;
+      case DELETE_ROWS_EVENT:
+      case UPDATE_ROWS_EVENT:
+      case WRITE_ROWS_EVENT:
+        /*
+          After the last Rows event has been applied, the saved Annotate_rows
+          event (if any) is not needed anymore and can be deleted.
+        */
+        if (((Rows_log_event*)ev)->get_flags(Rows_log_event::STMT_END_F))
+          rli->free_annotate_event();
+        /* fall through */
+      default:
+        DBUG_PRINT("info", ("Deleting the event after it has been executed"));
+        delete ev;
+        break;
     }
 
+
     /*
       update_log_pos failed: this should not happen, so we don't
       retry.
@@ -2539,7 +2711,8 @@ static int exec_relay_log_event(THD* thd, Relay_log_info* rli)
 
     if (slave_trans_retries)
     {
-      int UNINIT_VAR(temp_err);
+      int temp_err;
+      LINT_INIT(temp_err);
       if (exec_res && (temp_err= has_temporary_error(thd)))
       {
         const char *errmsg;
@@ -3190,6 +3363,12 @@ pthread_handler_t handle_slave_sql(void *arg)
   thd->init_for_queries();
   thd->temporary_tables = rli->save_temporary_tables; // restore temp tables
   set_thd_in_use_temporary_tables(rli);   // (re)set sql_thd in use for saved temp tables
+  /*
+    binlog_annotate_rows_events must be TRUE only after an Annotate_rows event
+    has been received and only till the last corresponding rbr event has been
+    applied. In all other cases it must be FALSE.
+  */
+  thd->variables.binlog_annotate_rows_events= 0;
   mysql_mutex_lock(&LOCK_thread_count);
   threads.append(thd);
   mysql_mutex_unlock(&LOCK_thread_count);
@@ -3642,10 +3821,15 @@ static int process_io_rotate(Master_info *mi, Rotate_log_event *rev)
   */
   if (mi->rli.relay_log.description_event_for_queue->binlog_version >= 4)
   {
+    DBUG_ASSERT(mi->rli.relay_log.description_event_for_queue->checksum_alg ==
+                mi->rli.relay_log.relay_log_checksum_alg);
+    
     delete mi->rli.relay_log.description_event_for_queue;
     /* start from format 3 (MySQL 4.0) again */
     mi->rli.relay_log.description_event_for_queue= new
       Format_description_log_event(3);
+    mi->rli.relay_log.description_event_for_queue->checksum_alg=
+      mi->rli.relay_log.relay_log_checksum_alg;    
   }
   /*
     Rotate the relay log makes binlog format detection easier (at next slave
@@ -3672,7 +3856,7 @@ static int queue_binlog_ver_1_event(Master_info *mi, const char *buf,
     If we get Load event, we need to pass a non-reusable buffer
     to read_log_event, so we do a trick
   */
-  if (buf[EVENT_TYPE_OFFSET] == LOAD_EVENT)
+  if ((uchar)buf[EVENT_TYPE_OFFSET] == LOAD_EVENT)
   {
     if (unlikely(!(tmp_buf=(char*)my_malloc(event_len+1,MYF(MY_WME)))))
     {
@@ -3698,8 +3882,9 @@ static int queue_binlog_ver_1_event(Master_info *mi, const char *buf,
     Append_block/Exec_load (the SQL thread needs the data, as that thread is not
     connected to the master).
   */
-  Log_event *ev = Log_event::read_log_event(buf,event_len, &errmsg,
-                                            mi->rli.relay_log.description_event_for_queue);
+  Log_event *ev=
+    Log_event::read_log_event(buf, event_len, &errmsg,
+                              mi->rli.relay_log.description_event_for_queue, 0);
   if (unlikely(!ev))
   {
     sql_print_error("Read invalid event from master: '%s',\
@@ -3786,8 +3971,9 @@ static int queue_binlog_ver_3_event(Master_info *mi, const char *buf,
   DBUG_ENTER("queue_binlog_ver_3_event");
 
   /* read_log_event() will adjust log_pos to be end_log_pos */
-  Log_event *ev = Log_event::read_log_event(buf,event_len, &errmsg,
-                                            mi->rli.relay_log.description_event_for_queue);
+  Log_event *ev=
+    Log_event::read_log_event(buf,event_len, &errmsg,
+                              mi->rli.relay_log.description_event_for_queue, 0);
   if (unlikely(!ev))
   {
     sql_print_error("Read invalid event from master: '%s',\
@@ -3813,6 +3999,7 @@ static int queue_binlog_ver_3_event(Master_info *mi, const char *buf,
     inc_pos= event_len;
     break;
   }
+
   if (unlikely(rli->relay_log.append(ev)))
   {
     delete ev;
@@ -3876,18 +4063,79 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
   Relay_log_info *rli= &mi->rli;
   mysql_mutex_t *log_lock= rli->relay_log.get_log_lock();
   ulong s_id;
+  bool unlock_data_lock= TRUE;
+  /*
+    FD_q must have been prepared for the first R_a event
+    inside get_master_version_and_clock()
+    Show-up of FD:s affects checksum_alg at once because
+    that changes FD_queue.
+  */
+  uint8 checksum_alg= mi->checksum_alg_before_fd != BINLOG_CHECKSUM_ALG_UNDEF ? 
+    mi->checksum_alg_before_fd :
+    mi->rli.relay_log.relay_log_checksum_alg;
+
+  char *save_buf= NULL; // needed for checksumming the fake Rotate event
+  char rot_buf[LOG_EVENT_HEADER_LEN + ROTATE_HEADER_LEN + FN_REFLEN];
+
+  DBUG_ASSERT(checksum_alg == BINLOG_CHECKSUM_ALG_OFF || 
+              checksum_alg == BINLOG_CHECKSUM_ALG_UNDEF || 
+              checksum_alg == BINLOG_CHECKSUM_ALG_CRC32); 
+
   DBUG_ENTER("queue_event");
+  /*
+    FD_queue checksum alg description does not apply in a case of
+    FD itself. The one carries both parts of the checksum data.
+  */
+  if (buf[EVENT_TYPE_OFFSET] == FORMAT_DESCRIPTION_EVENT)
+  {
+    checksum_alg= get_checksum_alg(buf, event_len);
+  }
+  else if (buf[EVENT_TYPE_OFFSET] == START_EVENT_V3)
+  {
+    // checksum behaviour is similar to the pre-checksum FD handling
+    mi->checksum_alg_before_fd= BINLOG_CHECKSUM_ALG_UNDEF;
+    mi->rli.relay_log.description_event_for_queue->checksum_alg=
+      mi->rli.relay_log.relay_log_checksum_alg= checksum_alg=
+      BINLOG_CHECKSUM_ALG_OFF;
+  }
+
+  // does not always hold, because an old binlog can work with NM
+  // DBUG_ASSERT(checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF);
+
+  // should hold unless manipulations with RL. Tests that do that
+  // will have to refine the clause.
+  DBUG_ASSERT(mi->rli.relay_log.relay_log_checksum_alg !=
+              BINLOG_CHECKSUM_ALG_UNDEF);
+              
+  // Emulate the network corruption
+  DBUG_EXECUTE_IF("corrupt_queue_event",
+    if (buf[EVENT_TYPE_OFFSET] != FORMAT_DESCRIPTION_EVENT)
+    {
+      char *debug_event_buf_c = (char*) buf;
+      int debug_cor_pos = rand() % (event_len - BINLOG_CHECKSUM_LEN);
+      debug_event_buf_c[debug_cor_pos] =~ debug_event_buf_c[debug_cor_pos];
+      DBUG_PRINT("info", ("Corrupt the event at queue_event: byte on position %d", debug_cor_pos));
+      DBUG_SET("-d,corrupt_queue_event");
+    }
+  );
+                                              
+  if (event_checksum_test((uchar *) buf, event_len, checksum_alg))
+  {
+    error= ER_NETWORK_READ_EVENT_CHECKSUM_FAILURE;
+    unlock_data_lock= FALSE;
+    goto err;
+  }
 
   LINT_INIT(inc_pos);
 
   if (mi->rli.relay_log.description_event_for_queue->binlog_version<4 &&
-      buf[EVENT_TYPE_OFFSET] != FORMAT_DESCRIPTION_EVENT /* a way to escape */)
+      (uchar)buf[EVENT_TYPE_OFFSET] != FORMAT_DESCRIPTION_EVENT /* a way to escape */)
     DBUG_RETURN(queue_old_event(mi,buf,event_len));
 
   LINT_INIT(inc_pos);
   mysql_mutex_lock(&mi->data_lock);
 
-  switch (buf[EVENT_TYPE_OFFSET]) {
+  switch ((uchar)buf[EVENT_TYPE_OFFSET]) {
   case STOP_EVENT:
     /*
       We needn't write this event to the relay log. Indeed, it just indicates a
@@ -3904,12 +4152,67 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
     goto err;
   case ROTATE_EVENT:
   {
-    Rotate_log_event rev(buf,event_len,mi->rli.relay_log.description_event_for_queue);
-    if (unlikely(process_io_rotate(mi,&rev)))
+    Rotate_log_event rev(buf, checksum_alg != BINLOG_CHECKSUM_ALG_OFF ?
+                         event_len - BINLOG_CHECKSUM_LEN : event_len,
+                         mi->rli.relay_log.description_event_for_queue);
+
+    if (unlikely(process_io_rotate(mi, &rev)))
     {
       error= ER_SLAVE_RELAY_LOG_WRITE_FAILURE;
       goto err;
     }
+    /* 
+       Checksum special cases for the fake Rotate (R_f) event caused by the protocol
+       of events generation and serialization in RL where Rotate of master is 
+       queued right next to FD of slave.
+       Since only an FD carries the alg desc, that of FD_s has to apply to R_m.
+       Two special rules apply only to the first R_f which comes in before any FD_m.
+       The 2nd R_f should be compatible with the FD_s that must have taken over
+       the last seen FD_m's (A).
+       
+       RSC_1: If OM \and fake Rotate \and slave is configured to
+              compute checksum for its first FD event for RL
+              the fake Rotate gets checksummed here.
+    */
+    if (uint4korr(&buf[0]) == 0 && checksum_alg == BINLOG_CHECKSUM_ALG_OFF &&
+        mi->rli.relay_log.relay_log_checksum_alg != BINLOG_CHECKSUM_ALG_OFF)
+    {
+      ha_checksum rot_crc= my_checksum(0L, NULL, 0);
+      event_len += BINLOG_CHECKSUM_LEN;
+      memcpy(rot_buf, buf, event_len - BINLOG_CHECKSUM_LEN);
+      int4store(&rot_buf[EVENT_LEN_OFFSET],
+                uint4korr(&rot_buf[EVENT_LEN_OFFSET]) + BINLOG_CHECKSUM_LEN);
+      rot_crc= my_checksum(rot_crc, (const uchar *) rot_buf,
+                           event_len - BINLOG_CHECKSUM_LEN);
+      int4store(&rot_buf[event_len - BINLOG_CHECKSUM_LEN], rot_crc);
+      DBUG_ASSERT(event_len == uint4korr(&rot_buf[EVENT_LEN_OFFSET]));
+      DBUG_ASSERT(mi->rli.relay_log.description_event_for_queue->checksum_alg ==
+                  mi->rli.relay_log.relay_log_checksum_alg);
+      /* the first one */
+      DBUG_ASSERT(mi->checksum_alg_before_fd != BINLOG_CHECKSUM_ALG_UNDEF);
+      save_buf= (char *) buf;
+      buf= rot_buf;
+    }
+    else
+      /*
+        RSC_2: If NM \and fake Rotate \and slave does not compute checksum
+        the fake Rotate's checksum is stripped off before relay-logging.
+      */
+      if (uint4korr(&buf[0]) == 0 && checksum_alg != BINLOG_CHECKSUM_ALG_OFF &&
+          mi->rli.relay_log.relay_log_checksum_alg == BINLOG_CHECKSUM_ALG_OFF)
+      {
+        event_len -= BINLOG_CHECKSUM_LEN;
+        memcpy(rot_buf, buf, event_len);
+        int4store(&rot_buf[EVENT_LEN_OFFSET],
+                  uint4korr(&rot_buf[EVENT_LEN_OFFSET]) - BINLOG_CHECKSUM_LEN);
+        DBUG_ASSERT(event_len == uint4korr(&rot_buf[EVENT_LEN_OFFSET]));
+        DBUG_ASSERT(mi->rli.relay_log.description_event_for_queue->checksum_alg ==
+                    mi->rli.relay_log.relay_log_checksum_alg);
+        /* the first one */
+        DBUG_ASSERT(mi->checksum_alg_before_fd != BINLOG_CHECKSUM_ALG_UNDEF);
+        save_buf= (char *) buf;
+        buf= rot_buf;
+      }
     /*
       Now the I/O thread has just changed its mi->master_log_name, so
       incrementing mi->master_log_pos is nonsense.
@@ -3930,15 +4233,24 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
     */
     Format_description_log_event* tmp;
     const char* errmsg;
+    // mark it as undefined, since it is no longer relevant
+    mi->checksum_alg_before_fd= BINLOG_CHECKSUM_ALG_UNDEF;
     if (!(tmp= (Format_description_log_event*)
           Log_event::read_log_event(buf, event_len, &errmsg,
-                                    mi->rli.relay_log.description_event_for_queue)))
+                                    mi->rli.relay_log.description_event_for_queue,
+                                    1)))
     {
       error= ER_SLAVE_RELAY_LOG_WRITE_FAILURE;
       goto err;
     }
     delete mi->rli.relay_log.description_event_for_queue;
     mi->rli.relay_log.description_event_for_queue= tmp;
+    if (tmp->checksum_alg == BINLOG_CHECKSUM_ALG_UNDEF)
+      tmp->checksum_alg= BINLOG_CHECKSUM_ALG_OFF;
+
+    /* installing new value of checksum Alg for relay log */
+    mi->rli.relay_log.relay_log_checksum_alg= tmp->checksum_alg;
+
     /*
        Though this does some conversion to the slave's format, this will
        preserve the master's binlog format version, and number of event types.
@@ -4082,13 +4394,16 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
       error= ER_SLAVE_RELAY_LOG_WRITE_FAILURE;
     }
     rli->ign_master_log_name_end[0]= 0; // last event is not ignored
+    if (save_buf != NULL)
+      buf= save_buf;
   }
   mysql_mutex_unlock(log_lock);
 
 skip_relay_logging:
   
 err:
-  mysql_mutex_unlock(&mi->data_lock);
+  if (unlock_data_lock)
+    mysql_mutex_unlock(&mi->data_lock);
   DBUG_PRINT("info", ("error: %d", error));
   if (error)
     mi->report(ERROR_LEVEL, error, ER(error), 
@@ -4559,8 +4874,9 @@ static Log_event* next_event(Relay_log_info* rli)
       But if the relay log is created by new_file(): then the solution is:
       MYSQL_BIN_LOG::open() will write the buffered description event.
     */
-    if ((ev=Log_event::read_log_event(cur_log,0,
-                                      rli->relay_log.description_event_for_exec)))
+    if ((ev= Log_event::read_log_event(cur_log,0,
+                                       rli->relay_log.description_event_for_exec,
+                                       opt_slave_sql_verify_checksum)))
 
     {
       DBUG_ASSERT(thd==rli->sql_thd);
@@ -4962,9 +5278,9 @@ bool rpl_master_has_bug(const Relay_log_info *rli, uint bug_id, bool report,
     {37426, { 5, 1,  0 }, { 5, 1, 26 } },
   };
   const uchar *master_ver=
-    rli->relay_log.description_event_for_exec->server_version_split;
+    rli->relay_log.description_event_for_exec->server_version_split.ver;
 
-  DBUG_ASSERT(sizeof(rli->relay_log.description_event_for_exec->server_version_split) == 3);
+  DBUG_ASSERT(sizeof(rli->relay_log.description_event_for_exec->server_version_split.ver) == 3);
 
   for (uint i= 0;
        i < sizeof(versions_for_all_bugs)/sizeof(*versions_for_all_bugs);i++)
diff --git a/sql/slave.h b/sql/slave.h
index 9a6803d1ac9..9e24058cbbc 100644
--- a/sql/slave.h
+++ b/sql/slave.h
@@ -122,6 +122,7 @@ extern char *opt_relay_logname, *opt_relaylog_index_name;
 extern my_bool opt_skip_slave_start, opt_reckless_slave;
 extern my_bool opt_log_slave_updates;
 extern char *opt_slave_skip_errors;
+extern my_bool opt_replicate_annotate_rows_events;
 extern ulonglong relay_log_space_limit;
 
 /*
diff --git a/sql/sp.cc b/sql/sp.cc
index 982dc3dc91b..6071a60a64f 100644
--- a/sql/sp.cc
+++ b/sql/sp.cc
@@ -1141,7 +1141,7 @@ sp_create_routine(THD *thd, int type, sp_head *sp)
                          (sp->m_explicit_name ? sp->m_db.length : 0), 
                          sp->m_name.str, sp->m_name.length,
                          sp->m_params.str, sp->m_params.length,
-                         retstr.c_ptr(), retstr.length(),
+                         retstr.ptr(), retstr.length(),
                          sp->m_body.str, sp->m_body.length,
                          sp->m_chistics, &(thd->lex->definer->user),
                          &(thd->lex->definer->host),
@@ -1154,7 +1154,7 @@ sp_create_routine(THD *thd, int type, sp_head *sp)
       thd->variables.sql_mode= saved_mode;
       /* Such a statement can always go directly to binlog, no trans cache */
       if (thd->binlog_query(THD::STMT_QUERY_TYPE,
-                            log_query.c_ptr(), log_query.length(),
+                            log_query.ptr(), log_query.length(),
                             FALSE, FALSE, FALSE, 0))
         ret= SP_INTERNAL_ERROR;
       thd->variables.sql_mode= 0;
diff --git a/sql/sp_head.cc b/sql/sp_head.cc
index df258d49c0a..6e17c974c2c 100644
--- a/sql/sp_head.cc
+++ b/sql/sp_head.cc
@@ -26,6 +26,7 @@
 #include "sql_acl.h"           // *_ACL
 #include "sql_array.h"         // Dynamic_array
 #include "log_event.h"         // append_query_string, Query_log_event
+#include "sql_derived.h"       // mysql_handle_derived
 
 #ifdef USE_PRAGMA_IMPLEMENTATION
 #pragma implementation
@@ -62,19 +63,7 @@ extern "C" uchar *sp_table_key(const uchar *ptr, size_t *plen, my_bool first);
 static void reset_start_time_for_sp(THD *thd)
 {
   if (!thd->in_sub_stmt)
-  {
-    /*
-      First investigate if there is a cached time stamp
-    */
-    if (thd->user_time)
-    {
-      thd->start_time= thd->user_time;
-    }
-    else
-    {
-      my_micro_time_and_time(&thd->start_time);
-    }
-  }
+    thd->set_start_time();
 }
 
 Item_result
@@ -3066,6 +3055,9 @@ int sp_instr::exec_open_and_lock_tables(THD *thd, TABLE_LIST *tables)
     result= -1;
   else
     result= 0;
+  /* Prepare all derived tables/views to catch possible errors. */
+  if (!result)
+    result= mysql_handle_derived(thd->lex, DT_PREPARE) ? -1 : 0;
 
   return result;
 }
diff --git a/sql/sp_rcontext.cc b/sql/sp_rcontext.cc
index e76a5e9ebde..7b8ade26085 100644
--- a/sql/sp_rcontext.cc
+++ b/sql/sp_rcontext.cc
@@ -123,7 +123,7 @@ sp_rcontext::init_var_table(THD *thd)
     return TRUE;
 
   m_var_table->copy_blobs= TRUE;
-  m_var_table->alias= "";
+  m_var_table->alias.set("", 0, table_alias_charset);
 
   return FALSE;
 }
@@ -716,7 +716,7 @@ int Select_fetch_into_spvars::prepare(List<Item> &fields, SELECT_LEX_UNIT *u)
 }
 
 
-bool Select_fetch_into_spvars::send_data(List<Item> &items)
+int Select_fetch_into_spvars::send_data(List<Item> &items)
 {
   List_iterator_fast<struct sp_variable> spvar_iter(*spvar_list);
   List_iterator_fast<Item> item_iter(items);
@@ -733,7 +733,7 @@ bool Select_fetch_into_spvars::send_data(List<Item> &items)
   for (; spvar= spvar_iter++, item= item_iter++; )
   {
     if (thd->spcont->set_variable(thd, spvar->offset, &item))
-      return TRUE;
+      return 1;
   }
-  return FALSE;
+  return 0;
 }
diff --git a/sql/sp_rcontext.h b/sql/sp_rcontext.h
index 95b865be491..c8903911b83 100644
--- a/sql/sp_rcontext.h
+++ b/sql/sp_rcontext.h
@@ -275,7 +275,7 @@ public:
   void set_spvar_list(List<struct sp_variable> *vars) { spvar_list= vars; }
 
   virtual bool send_eof() { return FALSE; }
-  virtual bool send_data(List<Item> &items);
+  virtual int send_data(List<Item> &items);
   virtual int prepare(List<Item> &list, SELECT_LEX_UNIT *u);
 };
 
diff --git a/sql/sql_acl.cc b/sql/sql_acl.cc
index 61c4d05425c..650c9d4e458 100644
--- a/sql/sql_acl.cc
+++ b/sql/sql_acl.cc
@@ -1,4 +1,5 @@
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2011, Oracle and/or its affiliates.
+   Copyright (c) 2009-2011, Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -3562,7 +3563,7 @@ int mysql_table_grant(THD *thd, TABLE_LIST *table_list,
       class LEX_COLUMN *column;
       List_iterator <LEX_COLUMN> column_iter(columns);
 
-      if (open_normal_and_derived_tables(thd, table_list, 0))
+      if (open_normal_and_derived_tables(thd, table_list, 0, DT_PREPARE))
         DBUG_RETURN(TRUE);
 
       while ((column = column_iter++))
@@ -6088,16 +6089,16 @@ static int handle_grant_struct(uint struct_no, bool drop,
     elements= acl_dbs.elements;
     break;
   case 2:
-    elements= column_priv_hash.records;
     grant_name_hash= &column_priv_hash;
+    elements= grant_name_hash->records;
     break;
   case 3:
-    elements= proc_priv_hash.records;
     grant_name_hash= &proc_priv_hash;
+    elements= grant_name_hash->records;
     break;
   case 4:
-    elements= func_priv_hash.records;
     grant_name_hash= &func_priv_hash;
+    elements= grant_name_hash->records;
     break;
   case 5:
     elements= acl_proxy_users.elements;
@@ -6310,8 +6311,7 @@ static int handle_grant_data(TABLE_LIST *tables, bool drop,
   else
   {
     /* Handle user array. */
-    if ((handle_grant_struct(0, drop, user_from, user_to) && ! result) ||
-        found)
+    if ((handle_grant_struct(0, drop, user_from, user_to)) || found)
     {
       result= 1; /* At least one record/element found. */
       /* If search is requested, we do not need to search further. */
@@ -7872,7 +7872,6 @@ get_cached_table_access(GRANT_INTERNAL_INFO *grant_internal_info,
 #undef HAVE_OPENSSL
 #ifdef NO_EMBEDDED_ACCESS_CHECKS
 #define initialized 0
-#define decrease_user_connections(X)        /* nothing */
 #define check_for_max_user_connections(X,Y)   0
 #define get_or_create_user_conn(A,B,C,D) 0
 #endif
@@ -7912,16 +7911,17 @@ struct MPVIO_EXT :public MYSQL_PLUGIN_VIO
 /**
   a helper function to report an access denied error in all the proper places
 */
-static void login_failed_error(THD *thd, int passwd_used)
+static void login_failed_error(THD *thd)
 {
-  my_error(access_denied_error_code(passwd_used), MYF(0),
+  my_error(access_denied_error_code(thd->password), MYF(0),
            thd->main_security_ctx.user,
            thd->main_security_ctx.host_or_ip,
-           passwd_used ? ER(ER_YES) : ER(ER_NO));
-  general_log_print(thd, COM_CONNECT, ER(access_denied_error_code(passwd_used)),
+           thd->password ? ER(ER_YES) : ER(ER_NO));
+  general_log_print(thd, COM_CONNECT,
+                    ER(access_denied_error_code(thd->password)),
                     thd->main_security_ctx.user,
                     thd->main_security_ctx.host_or_ip,
-                    passwd_used ? ER(ER_YES) : ER(ER_NO));
+                    thd->password ? ER(ER_YES) : ER(ER_NO));
   status_var_increment(thd->status_var.access_denied_errors);
   /* 
     Log access denied messages to the error log when log-warnings = 2
@@ -7930,10 +7930,10 @@ static void login_failed_error(THD *thd, int passwd_used)
   */
   if (global_system_variables.log_warnings > 1)
   {
-    sql_print_warning(ER(access_denied_error_code(passwd_used)),
+    sql_print_warning(ER(access_denied_error_code(thd->password)),
                       thd->main_security_ctx.user,
                       thd->main_security_ctx.host_or_ip,
-                      passwd_used ? ER(ER_YES) : ER(ER_NO));      
+                      thd->password ? ER(ER_YES) : ER(ER_NO));      
   }
 }
 
@@ -8198,7 +8198,7 @@ static bool find_mpvio_user(MPVIO_EXT *mpvio)
 
   if (!mpvio->acl_user)
   {
-    login_failed_error(mpvio->thd, 0);
+    login_failed_error(mpvio->thd);
     DBUG_RETURN (1);
   }
 
@@ -8381,7 +8381,6 @@ static ulong parse_client_handshake_packet(MPVIO_EXT *mpvio,
   THD *thd= mpvio->thd;
   NET *net= &thd->net;
   char *end;
-
   DBUG_ASSERT(mpvio->status == MPVIO_EXT::FAILURE);
 
   if (pkt_len < MIN_HANDSHAKE_SIZE)
@@ -8393,7 +8392,7 @@ static ulong parse_client_handshake_packet(MPVIO_EXT *mpvio,
   ulong client_capabilities= uint2korr(net->read_pos);
   if (client_capabilities & CLIENT_PROTOCOL_41)
   {
-    client_capabilities|= ((ulong) uint2korr(net->read_pos + 2)) << 16;
+    client_capabilities|= ((ulonglong) uint2korr(net->read_pos + 2)) << 16;
     thd->max_client_packet_length= uint4korr(net->read_pos + 4);
     DBUG_PRINT("info", ("client_character_set: %d", (uint) net->read_pos[8]));
     if (thd_init_client_charset(thd, (uint) net->read_pos[8]))
@@ -8469,21 +8468,15 @@ static ulong parse_client_handshake_packet(MPVIO_EXT *mpvio,
   uint passwd_len= thd->client_capabilities & CLIENT_SECURE_CONNECTION ?
                    (uchar)(*passwd++) : strlen(passwd);
 
-  if (thd->client_capabilities & CLIENT_CONNECT_WITH_DB)
-  {
-    db= db + passwd_len + 1;
-    /* strlen() can't be easily deleted without changing protocol */
-    db_len= strlen(db);
-  }
-  else
-  {
-    db= 0;
-    db_len= 0;
-  }
+  db= thd->client_capabilities & CLIENT_CONNECT_WITH_DB ?
+    db + passwd_len + 1 : 0;
 
-  if (passwd + passwd_len + db_len > (char *)net->read_pos + pkt_len)
+  if (passwd + passwd_len + test(db) > (char *)net->read_pos + pkt_len)
     return packet_error;
 
+  /* strlen() can't be easily deleted without changing protocol */
+  db_len= db ? strlen(db) : 0;
+
   char *client_plugin= passwd + passwd_len + (db ? db_len + 1 : 0);
 
   /* Since 4.1 all database names are stored in utf8 */
@@ -8506,6 +8499,19 @@ static ulong parse_client_handshake_packet(MPVIO_EXT *mpvio,
     user_len-= 2;
   }
 
+  /*
+    Clip username to allowed length in characters (not bytes).  This is
+    mostly for backward compatibility.
+  */
+  {
+    CHARSET_INFO *cs= system_charset_info;
+    int           err;
+
+    user_len= (uint) cs->cset->well_formed_len(cs, user, user + user_len,
+                                               USERNAME_CHAR_LENGTH, &err);
+    user[user_len]= '\0';
+  }
+
   Security_context *sctx= thd->security_ctx;
 
   if (thd->make_lex_string(&mpvio->db, db, db_len, 0) == 0)
@@ -8529,13 +8535,13 @@ static ulong parse_client_handshake_packet(MPVIO_EXT *mpvio,
     return packet_error;
   }
 
+  thd->password= passwd_len > 0;
   if (find_mpvio_user(mpvio))
     return packet_error;
 
   if (thd->client_capabilities & CLIENT_PLUGIN_AUTH)
   {
-    if ((client_plugin + strlen(client_plugin)) > 
-          (char *)net->read_pos + pkt_len)
+    if (client_plugin >= (char *)net->read_pos + pkt_len)
       return packet_error;
     client_plugin= fix_plugin_ptr(client_plugin);
   }
@@ -9020,7 +9026,7 @@ bool acl_authenticate(THD *thd, uint connect_errors,
     DBUG_ASSERT(mpvio.status == MPVIO_EXT::FAILURE);
 
     if (!thd->is_error())
-      login_failed_error(thd, thd->password);
+      login_failed_error(thd);
     DBUG_RETURN(1);
   }
 
@@ -9044,7 +9050,7 @@ bool acl_authenticate(THD *thd, uint connect_errors,
       if (!proxy_user)
       {
         if (!thd->is_error())
-          login_failed_error(thd, mpvio.auth_info.password_used);
+          login_failed_error(thd);
         DBUG_RETURN(1);
       }
 
@@ -9060,7 +9066,7 @@ bool acl_authenticate(THD *thd, uint connect_errors,
       if (!acl_proxy_user)
       {
         if (!thd->is_error())
-          login_failed_error(thd, mpvio.auth_info.password_used);
+          login_failed_error(thd);
         mysql_mutex_unlock(&acl_cache->lock);
         DBUG_RETURN(1);
       }
@@ -9087,7 +9093,7 @@ bool acl_authenticate(THD *thd, uint connect_errors,
     */
     if (acl_check_ssl(thd, acl_user))
     {
-      login_failed_error(thd, thd->password);
+      login_failed_error(thd);
       DBUG_RETURN(1);
     }
 
@@ -9111,6 +9117,8 @@ bool acl_authenticate(THD *thd, uint connect_errors,
        global_system_variables.max_user_connections) &&
       check_for_max_user_connections(thd, thd->user_connect))
   {
+    /* Ensure we don't decrement thd->user_connect->connections twice */
+    thd->user_connect= 0;
     status_var_increment(denied_connections);
     DBUG_RETURN(1); // The error is set in check_for_max_user_connections()
   }
@@ -9151,12 +9159,7 @@ bool acl_authenticate(THD *thd, uint connect_errors,
     if (mysql_change_db(thd, &mpvio.db, FALSE))
     {
       /* mysql_change_db() has pushed the error message. */
-      if (thd->user_connect)
-      {
-        status_var_increment(thd->status_var.access_denied_errors);
-        decrease_user_connections(thd->user_connect);
-        thd->user_connect= 0;
-      }
+      status_var_increment(thd->status_var.access_denied_errors);
       DBUG_RETURN(1);
     }
   }
diff --git a/sql/sql_admin.cc b/sql/sql_admin.cc
index 709d1b9a701..5c529e99fac 100644
--- a/sql/sql_admin.cc
+++ b/sql/sql_admin.cc
@@ -309,6 +309,7 @@ static bool mysql_admin_table(THD* thd, TABLE_LIST* tables,
   int result_code;
   bool need_repair_or_alter= 0;
   DBUG_ENTER("mysql_admin_table");
+  DBUG_PRINT("enter", ("extra_open_options: %u", extra_open_options));
 
   field_list.push_back(item = new Item_empty_string("Table", NAME_CHAR_LEN*2));
   item->maybe_null = 1;
@@ -332,9 +333,7 @@ static bool mysql_admin_table(THD* thd, TABLE_LIST* tables,
     bool open_error;
 
     DBUG_PRINT("admin", ("table: '%s'.'%s'", table->db, table->table_name));
-    DBUG_PRINT("admin", ("extra_open_options: %u", extra_open_options));
     strxmov(table_name, db, ".", table->table_name, NullS);
-    thd->open_options|= extra_open_options;
     table->lock_type= lock_type;
     /*
       To make code safe for re-execution we need to reset type of MDL
@@ -365,6 +364,13 @@ static bool mysql_admin_table(THD* thd, TABLE_LIST* tables,
       if (view_operator_func == NULL)
         table->required_type=FRMTYPE_TABLE;
 
+      if (lex->sql_command == SQLCOM_CHECK ||
+          lex->sql_command == SQLCOM_REPAIR ||
+          lex->sql_command == SQLCOM_ANALYZE ||
+          lex->sql_command == SQLCOM_OPTIMIZE)
+	thd->prepare_derived_at_open= TRUE;
+
+      thd->open_options|= extra_open_options;
       if (!thd->locked_tables_mode && repair_table_use_frm)
       {
         /*
@@ -397,10 +403,11 @@ static bool mysql_admin_table(THD* thd, TABLE_LIST* tables,
 
         open_error= open_and_lock_tables(thd, table, TRUE, 0);
       }
+      thd->open_options&= ~extra_open_options;
+      thd->prepare_derived_at_open= FALSE;
 
       table->next_global= save_next_global;
       table->next_local= save_next_local;
-      thd->open_options&= ~extra_open_options;
 
       /*
         If open_and_lock_tables() failed, close_thread_tables() will close
diff --git a/sql/sql_alter.cc b/sql/sql_alter.cc
index 5af01523aa7..2b0e0fecdaa 100644
--- a/sql/sql_alter.cc
+++ b/sql/sql_alter.cc
@@ -103,7 +103,7 @@ bool Alter_table_statement::execute(THD *thd)
                             &alter_info,
                             select_lex->order_list.elements,
                             select_lex->order_list.first,
-                            lex->ignore);
+                            lex->ignore, lex->online);
 
   DBUG_RETURN(result);
 }
diff --git a/sql/sql_analyse.cc b/sql/sql_analyse.cc
index a2aeeb86072..883ac3c4660 100644
--- a/sql/sql_analyse.cc
+++ b/sql/sql_analyse.cc
@@ -748,7 +748,7 @@ int analyse::end_of_records()
 	tmp_str.append(STRING_WITH_LEN(" NOT NULL"));
       output_str_length = tmp_str.length();
       func_items[9]->set(tmp_str.ptr(), tmp_str.length(), tmp_str.charset());
-      if (result->send_data(result_fields))
+      if (result->send_data(result_fields) > 0)
 	return -1;
       continue;
     }
@@ -793,7 +793,7 @@ int analyse::end_of_records()
     if (!(*f)->nulls)
       ans.append(STRING_WITH_LEN(" NOT NULL"));
     func_items[9]->set(ans.ptr(), ans.length(), ans.charset());
-    if (result->send_data(result_fields))
+    if (result->send_data(result_fields) > 0)
       return -1;
   }
   return 0;
diff --git a/sql/sql_base.cc b/sql/sql_base.cc
index 7e0d17d1b0e..49cd948791b 100644
--- a/sql/sql_base.cc
+++ b/sql/sql_base.cc
@@ -1,4 +1,5 @@
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2011, Oracle and/or its affiliates.
+   Copyright (c) 2010, 2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -1620,12 +1621,11 @@ bool close_temporary_tables(THD *thd)
 
   /* Better add "if exists", in case a RESET MASTER has been done */
   const char stub[]= "DROP /*!40005 TEMPORARY */ TABLE IF EXISTS ";
-  uint stub_len= sizeof(stub) - 1;
-  char buf[256];
-  String s_query= String(buf, sizeof(buf), system_charset_info);
+  char buf[FN_REFLEN];
+  String s_query(buf, sizeof(buf), system_charset_info);
   bool found_user_tables= FALSE;
 
-  memcpy(buf, stub, stub_len);
+  s_query.copy(stub, sizeof(stub)-1, system_charset_info);
 
   /*
     Insertion sort of temp tables by pseudo_thread_id to build ordered list
@@ -1679,19 +1679,25 @@ bool close_temporary_tables(THD *thd)
     {
       bool save_thread_specific_used= thd->thread_specific_used;
       my_thread_id save_pseudo_thread_id= thd->variables.pseudo_thread_id;
+      char db_buf[FN_REFLEN];
+      String db(db_buf, sizeof(db_buf), system_charset_info);
+
       /* Set pseudo_thread_id to be that of the processed table */
       thd->variables.pseudo_thread_id= tmpkeyval(thd, table);
-      String db;
-      db.append(table->s->db.str);
+
+      db.copy(table->s->db.str, table->s->db.length, system_charset_info);
+      /* Reset s_query if it was extended by a previous iteration */
+      s_query.length(sizeof(stub)-1);
+
       /* Loop forward through all tables that belong to a common database
          within the sublist of common pseudo_thread_id to create single
          DROP query 
       */
-      for (s_query.length(stub_len);
+      for (;
            table && is_user_table(table) &&
              tmpkeyval(thd, table) == thd->variables.pseudo_thread_id &&
              table->s->db.length == db.length() &&
-             strcmp(table->s->db.str, db.ptr()) == 0;
+             memcmp(table->s->db.str, db.ptr(), db.length()) == 0;
            table= next)
       {
         /*
@@ -2097,7 +2103,7 @@ int drop_temporary_table(THD *thd, TABLE_LIST *table_list, bool *is_trans)
   /* Table might be in use by some outer statement. */
   if (table->query_id && table->query_id != thd->query_id)
   {
-    my_error(ER_CANT_REOPEN_TABLE, MYF(0), table->alias);
+    my_error(ER_CANT_REOPEN_TABLE, MYF(0), table->alias.c_ptr());
     DBUG_RETURN(-1);
   }
 
@@ -2123,7 +2129,7 @@ void close_temporary_table(THD *thd, TABLE *table,
   DBUG_ENTER("close_temporary_table");
   DBUG_PRINT("tmptable", ("closing table: '%s'.'%s' 0x%lx  alias: '%s'",
                           table->s->db.str, table->s->table_name.str,
-                          (long) table, table->alias));
+                          (long) table, table->alias.c_ptr()));
 
   if (table->prev)
   {
@@ -2671,7 +2677,7 @@ bool open_table(THD *thd, TABLE_LIST *table_list, MEM_ROOT *mem_root,
                      ("query_id: %lu  server_id: %u  pseudo_thread_id: %lu",
                       (ulong) table->query_id, (uint) thd->server_id,
                       (ulong) thd->variables.pseudo_thread_id));
-	  my_error(ER_CANT_REOPEN_TABLE, MYF(0), table->alias);
+	  my_error(ER_CANT_REOPEN_TABLE, MYF(0), table->alias.c_ptr());
 	  DBUG_RETURN(TRUE);
 	}
 	table->query_id= thd->query_id;
@@ -2711,7 +2717,7 @@ bool open_table(THD *thd, TABLE_LIST *table_list, MEM_ROOT *mem_root,
       if (table->s->table_cache_key.length == key_length &&
 	  !memcmp(table->s->table_cache_key.str, key, key_length))
       {
-        if (!my_strcasecmp(system_charset_info, table->alias, alias) &&
+        if (!my_strcasecmp(system_charset_info, table->alias.c_ptr(), alias) &&
             table->query_id != thd->query_id && /* skip tables already used */
             (thd->locked_tables_mode == LTM_LOCK_TABLES ||
              table->query_id == 0))
@@ -5236,7 +5242,8 @@ static bool check_lock_and_start_stmt(THD *thd,
   if ((int) lock_type > (int) TL_WRITE_ALLOW_WRITE &&
       (int) table_list->table->reginfo.lock_type <= (int) TL_WRITE_ALLOW_WRITE)
   {
-    my_error(ER_TABLE_NOT_LOCKED_FOR_WRITE, MYF(0), table_list->alias);
+    my_error(ER_TABLE_NOT_LOCKED_FOR_WRITE, MYF(0),
+             table_list->table->alias.c_ptr());
     DBUG_RETURN(1);
   }
   if ((error= table_list->table->file->start_stmt(thd, lock_type)))
@@ -5463,16 +5470,11 @@ bool open_and_lock_tables(THD *thd, TABLE_LIST *tables,
 
   if (derived)
   {
-    if (mysql_handle_derived(thd->lex, &mysql_derived_prepare))
+    if (mysql_handle_derived(thd->lex, DT_INIT))
       goto err;
-    if (thd->fill_derived_tables() &&
-        mysql_handle_derived(thd->lex, &mysql_derived_filling))
-    {
-      mysql_handle_derived(thd->lex, &mysql_derived_cleanup);
+    if (thd->prepare_derived_at_open &&
+        (mysql_handle_derived(thd->lex, DT_PREPARE)))
       goto err;
-    }
-    if (!thd->lex->describe)
-      mysql_handle_derived(thd->lex, &mysql_derived_cleanup);
   }
 
   DBUG_RETURN(FALSE);
@@ -5496,6 +5498,7 @@ err:
     flags       - bitmap of flags to modify how the tables will be open:
                   MYSQL_LOCK_IGNORE_FLUSH - open table even if someone has
                   done a flush on it.
+    dt_phases   - set of flags to pass to the mysql_handle_derived
 
   RETURN
     FALSE - ok
@@ -5506,7 +5509,8 @@ err:
     data from the tables.
 */
 
-bool open_normal_and_derived_tables(THD *thd, TABLE_LIST *tables, uint flags)
+bool open_normal_and_derived_tables(THD *thd, TABLE_LIST *tables, uint flags,
+                                    uint dt_phases)
 {
   DML_prelocking_strategy prelocking_strategy;
   uint counter;
@@ -5514,7 +5518,7 @@ bool open_normal_and_derived_tables(THD *thd, TABLE_LIST *tables, uint flags)
   DBUG_ENTER("open_normal_and_derived_tables");
   DBUG_ASSERT(!thd->fill_derived_tables());
   if (open_tables(thd, &tables, &counter, flags, &prelocking_strategy) ||
-      mysql_handle_derived(thd->lex, &mysql_derived_prepare))
+      mysql_handle_derived(thd->lex, dt_phases))
     goto end;
 
   DBUG_RETURN(0);
@@ -6021,9 +6025,7 @@ find_field_in_view(THD *thd, TABLE_LIST *table_list,
   Field_iterator_view field_it;
   field_it.set(table_list);
   Query_arena *arena= 0, backup;  
-  
-  DBUG_ASSERT(table_list->schema_table_reformed ||
-              (ref != 0 && table_list->view != 0));
+
   for (; !field_it.end_of_fields(); field_it.next())
   {
     if (!my_strcasecmp(system_charset_info, field_it.name(), name))
@@ -6042,6 +6044,8 @@ find_field_in_view(THD *thd, TABLE_LIST *table_list,
       
       if (!item)
         DBUG_RETURN(0);
+      if (!ref)
+        DBUG_RETURN((Field*) view_ref_found);
       /*
        *ref != NULL means that *ref contains the item that we need to
        replace. If the item was aliased by the user, set the alias to
@@ -6050,10 +6054,15 @@ find_field_in_view(THD *thd, TABLE_LIST *table_list,
       */
       if (*ref && !(*ref)->is_autogenerated_name)
       {
+        if (register_tree_change &&
+            thd->stmt_arena->is_stmt_prepare_or_first_stmt_execute())
+          arena= thd->activate_stmt_arena_if_needed(&backup);
         item->set_name((*ref)->name, (*ref)->name_length,
                        system_charset_info);
         item->real_item()->set_name((*ref)->name, (*ref)->name_length,
                        system_charset_info);
+        if (arena)
+          thd->restore_active_arena(arena, &backup);
       }
       if (register_tree_change)
         thd->change_item_tree(ref, item);
@@ -6232,7 +6241,8 @@ find_field_in_table(THD *thd, TABLE *table, const char *name, uint length,
   Field **field_ptr, *field;
   uint cached_field_index= *cached_field_index_ptr;
   DBUG_ENTER("find_field_in_table");
-  DBUG_PRINT("enter", ("table: '%s', field name: '%s'", table->alias, name));
+  DBUG_PRINT("enter", ("table: '%s', field name: '%s'", table->alias.c_ptr(),
+                       name));
 
   /* We assume here that table->field < NO_CACHED_FIELD_INDEX = UINT_MAX */
   if (cached_field_index < table->s->fields &&
@@ -6447,6 +6457,8 @@ find_field_in_table_ref(THD *thd, TABLE_LIST *table_list,
         Field *field_to_set= NULL;
         if (fld == view_ref_found)
         {
+          if (!ref)
+            DBUG_RETURN(fld);
           Item *it= (*ref)->real_item();
           if (it->type() == Item::FIELD_ITEM)
             field_to_set= ((Item_field*)it)->field;
@@ -6454,6 +6466,8 @@ find_field_in_table_ref(THD *thd, TABLE_LIST *table_list,
           {
             if (thd->mark_used_columns == MARK_COLUMNS_READ)
               it->walk(&Item::register_field_in_read_map, 1, (uchar *) 0);
+            else
+              it->walk(&Item::register_field_in_write_map, 1, (uchar *) 0);
           }
         }
         else
@@ -6593,8 +6607,11 @@ find_field_in_tables(THD *thd, Item_ident *item,
       find_field_in_table even in the case of information schema tables
       when table_ref->field_translation != NULL.
       */
-    if (table_ref->table && !table_ref->view)
+    if (table_ref->table && !table_ref->view &&
+        (!table_ref->is_merged_derived() ||
+         (!table_ref->is_multitable() && table_ref->merged_for_insert)))
     {
+
       found= find_field_in_table(thd, table_ref->table, name, length,
                                  TRUE, &(item->cached_field_index));
 #ifndef NO_EMBEDDED_ACCESS_CHECKS
@@ -6619,7 +6636,8 @@ find_field_in_tables(THD *thd, Item_ident *item,
         Only views fields should be marked as dependent, not an underlying
         fields.
       */
-      if (!table_ref->belong_to_view)
+      if (!table_ref->belong_to_view &&
+          !table_ref->belong_to_derived)
       {
         SELECT_LEX *current_sel= thd->lex->current_select;
         SELECT_LEX *last_select= table_ref->select_lex;
@@ -6631,11 +6649,6 @@ find_field_in_tables(THD *thd, Item_ident *item,
         {
           mark_select_range_as_dependent(thd, last_select, current_sel,
                                          found, *ref, item);
-          if (item->can_be_depended)
-          {
-            DBUG_ASSERT((*ref) == (Item*)item);
-            current_sel->register_dependency_item(last_select, ref);
-          }
         }
       }
       return found;
@@ -7214,6 +7227,10 @@ mark_common_columns(THD *thd, TABLE_LIST *table_ref_1, TABLE_LIST *table_ref_2,
     */
     if (nj_col_2 && (!using_fields ||is_using_column_1))
     {
+      /*
+        Create a non-fixed, fully qualified field and let fix_fields()
+        resolve it.
+      */
       Item *item_1=   nj_col_1->create_item(thd);
       Item *item_2=   nj_col_2->create_item(thd);
       Field *field_1= nj_col_1->field();
@@ -7659,7 +7676,11 @@ static bool setup_natural_join_row_types(THD *thd,
   for (left_neighbor= table_ref_it++; left_neighbor ; )
   {
     table_ref= left_neighbor;
-    left_neighbor= table_ref_it++;
+    do
+    {
+      left_neighbor= table_ref_it++;
+    }
+    while (left_neighbor && left_neighbor->sj_subq_pred);
     /* 
       Do not redo work if already done:
       1) for stored procedures,
@@ -7703,13 +7724,11 @@ int setup_wild(THD *thd, TABLE_LIST *tables, List<Item> &fields,
 	       List<Item> *sum_func_list,
 	       uint wild_num)
 {
-  if (!wild_num)
-    return(0);
-
   Item *item;
   List_iterator<Item> it(fields);
   Query_arena *arena, backup;
   DBUG_ENTER("setup_wild");
+  DBUG_ASSERT(wild_num != 0);
 
   /*
     Don't use arena if we are not in prepared statements or stored procedures
@@ -7798,6 +7817,7 @@ bool setup_fields(THD *thd, Item **ref_pointer_array,
   List_iterator<Item> it(fields);
   bool save_is_item_list_lookup;
   DBUG_ENTER("setup_fields");
+  DBUG_PRINT("enter", ("ref_pointer_array: %p", ref_pointer_array));
 
   thd->mark_used_columns= mark_used_columns;
   DBUG_PRINT("info", ("thd->mark_used_columns: %d", thd->mark_used_columns));
@@ -7875,27 +7895,36 @@ bool setup_fields(THD *thd, Item **ref_pointer_array,
     make_leaves_list()
     list    pointer to pointer on list first element
     tables  table list
+    full_table_list whether to include tables from mergeable derived table/view.
+                    we need them for checks for INSERT/UPDATE statements only.
 
   RETURN pointer on pointer to next_leaf of last element
 */
 
-TABLE_LIST **make_leaves_list(TABLE_LIST **list, TABLE_LIST *tables)
+void make_leaves_list(List<TABLE_LIST> &list, TABLE_LIST *tables,
+                      bool full_table_list, TABLE_LIST *boundary)
+ 
 {
   for (TABLE_LIST *table= tables; table; table= table->next_local)
   {
-    if (table->merge_underlying_list)
+    if (table == boundary)
+      full_table_list= !full_table_list;
+    if (full_table_list && table->is_merged_derived())
     {
-      DBUG_ASSERT(table->view &&
-                  table->effective_algorithm == VIEW_ALGORITHM_MERGE);
-      list= make_leaves_list(list, table->merge_underlying_list);
+      SELECT_LEX *select_lex= table->get_single_select();
+      /*
+        It's safe to use select_lex->leaf_tables because all derived
+        tables/views were already prepared and have their leaf_tables
+        set properly.
+      */
+      make_leaves_list(list, select_lex->get_table_list(),
+      full_table_list, boundary);
     }
     else
     {
-      *list= table;
-      list= &table->next_leaf;
+      list.push_back(table);
     }
   }
-  return list;
 }
 
 /*
@@ -7910,6 +7939,7 @@ TABLE_LIST **make_leaves_list(TABLE_LIST **list, TABLE_LIST *tables)
     leaves        List of join table leaves list (select_lex->leaf_tables)
     refresh       It is onle refresh for subquery
     select_insert It is SELECT ... INSERT command
+    full_table_list a parameter to pass to the make_leaves_list function
 
   NOTE
     Check also that the 'used keys' and 'ignored keys' exists and set up the
@@ -7928,9 +7958,13 @@ TABLE_LIST **make_leaves_list(TABLE_LIST **list, TABLE_LIST *tables)
 
 bool setup_tables(THD *thd, Name_resolution_context *context,
                   List<TABLE_LIST> *from_clause, TABLE_LIST *tables,
-                  TABLE_LIST **leaves, bool select_insert)
+                  List<TABLE_LIST> &leaves, bool select_insert,
+                  bool full_table_list)
 {
   uint tablenr= 0;
+  List_iterator<TABLE_LIST> ti(leaves);
+  TABLE_LIST *table_list;
+
   DBUG_ENTER("setup_tables");
 
   DBUG_ASSERT ((select_insert && !tables->next_name_resolution_table) || !tables || 
@@ -7942,40 +7976,83 @@ bool setup_tables(THD *thd, Name_resolution_context *context,
   TABLE_LIST *first_select_table= (select_insert ?
                                    tables->next_local:
                                    0);
-  if (!(*leaves))
-    make_leaves_list(leaves, tables);
-
-  TABLE_LIST *table_list;
-  for (table_list= *leaves;
-       table_list;
-       table_list= table_list->next_leaf, tablenr++)
+  SELECT_LEX *select_lex= select_insert ? &thd->lex->select_lex :
+                                          thd->lex->current_select;
+  if (select_lex->first_cond_optimization)
   {
-    TABLE *table= table_list->table;
-    table->pos_in_table_list= table_list;
-    if (first_select_table &&
-        table_list->top_table() == first_select_table)
+    leaves.empty();
+    if (!select_lex->is_prep_leaf_list_saved)
+    {
+      make_leaves_list(leaves, tables, full_table_list, first_select_table);
+      select_lex->leaf_tables_exec.empty();
+    }
+    else
+    {
+      List_iterator_fast <TABLE_LIST> ti(select_lex->leaf_tables_prep);
+      while ((table_list= ti++))
+        leaves.push_back(table_list);
+    }
+      
+    while ((table_list= ti++))
     {
-      /* new counting for SELECT of INSERT ... SELECT command */
-      first_select_table= 0;
-      tablenr= 0;
+      TABLE *table= table_list->table;
+      table->pos_in_table_list= table_list;
+      if (first_select_table &&
+          table_list->top_table() == first_select_table)
+      {
+        /* new counting for SELECT of INSERT ... SELECT command */
+        first_select_table= 0;
+        thd->lex->select_lex.insert_tables= tablenr;
+        tablenr= 0;
+      }
+      if(table_list->jtbm_subselect)
+      {
+        table_list->jtbm_table_no= tablenr;
+      }
+      else
+      {
+        table->pos_in_table_list= table_list;
+        setup_table_map(table, table_list, tablenr);
+
+        if (table_list->process_index_hints(table))
+          DBUG_RETURN(1);
+      }
+      tablenr++;
     }
-    setup_table_map(table, table_list, tablenr);
-    if (table_list->process_index_hints(table))
+    if (tablenr > MAX_TABLES)
+    {
+      my_error(ER_TOO_MANY_TABLES,MYF(0), static_cast<int>(MAX_TABLES));
       DBUG_RETURN(1);
+    }
   }
-  if (tablenr > MAX_TABLES)
-  {
-    my_error(ER_TOO_MANY_TABLES,MYF(0), static_cast<int>(MAX_TABLES));
-    DBUG_RETURN(1);
-  }
+  else
+  { 
+    List_iterator_fast <TABLE_LIST> ti(select_lex->leaf_tables_exec);
+    select_lex->leaf_tables.empty();
+    while ((table_list= ti++))
+    {
+      if(table_list->jtbm_subselect)
+      {
+        table_list->jtbm_table_no= table_list->tablenr_exec;
+      }
+      else
+      {
+        table_list->table->tablenr= table_list->tablenr_exec;
+        table_list->table->map= table_list->map_exec;
+        table_list->table->maybe_null= table_list->maybe_null_exec;
+        table_list->table->pos_in_table_list= table_list;
+      }
+      select_lex->leaf_tables.push_back(table_list);
+    }
+  }    
+
   for (table_list= tables;
        table_list;
        table_list= table_list->next_local)
   {
     if (table_list->merge_underlying_list)
     {
-      DBUG_ASSERT(table_list->view &&
-                  table_list->effective_algorithm == VIEW_ALGORITHM_MERGE);
+      DBUG_ASSERT(table_list->is_merged_derived());
       Query_arena *arena= thd->stmt_arena, backup;
       bool res;
       if (arena->is_conventional())
@@ -7988,6 +8065,17 @@ bool setup_tables(THD *thd, Name_resolution_context *context,
       if (res)
         DBUG_RETURN(1);
     }
+
+    if (table_list->jtbm_subselect)
+    {
+      Item *item= table_list->jtbm_subselect->optimizer;
+      if (table_list->jtbm_subselect->optimizer->fix_fields(thd, &item))
+      {
+        my_error(ER_TOO_MANY_TABLES,MYF(0),MAX_TABLES); /* psergey-todo: WHY ER_TOO_MANY_TABLES ???*/
+        DBUG_RETURN(1);
+      }
+      DBUG_ASSERT(item == table_list->jtbm_subselect->optimizer);
+    }
   }
 
   /* Precompute and store the row types of NATURAL/USING joins. */
@@ -8002,7 +8090,7 @@ bool setup_tables(THD *thd, Name_resolution_context *context,
   prepare tables and check access for the view tables
 
   SYNOPSIS
-    setup_tables_and_check_view_access()
+    setup_tables_and_check_access()
     thd		  Thread handler
     context       name resolution contest to setup table list there
     from_clause   Top-level list of table references in the FROM clause
@@ -8012,6 +8100,7 @@ bool setup_tables(THD *thd, Name_resolution_context *context,
     refresh       It is onle refresh for subquery
     select_insert It is SELECT ... INSERT command
     want_access   what access is needed
+    full_table_list a parameter to pass to the make_leaves_list function
 
   NOTE
     a wrapper for check_tables that will also check the resulting
@@ -8025,33 +8114,33 @@ bool setup_tables_and_check_access(THD *thd,
                                    Name_resolution_context *context,
                                    List<TABLE_LIST> *from_clause,
                                    TABLE_LIST *tables,
-                                   TABLE_LIST **leaves,
+                                   List<TABLE_LIST> &leaves,
                                    bool select_insert,
                                    ulong want_access_first,
-                                   ulong want_access)
+                                   ulong want_access,
+                                   bool full_table_list)
 {
-  TABLE_LIST *leaves_tmp= NULL;
   bool first_table= true;
+  DBUG_ENTER("setup_tables_and_check_access");
 
   if (setup_tables(thd, context, from_clause, tables,
-                   &leaves_tmp, select_insert))
-    return TRUE;
-
-  if (leaves)
-    *leaves= leaves_tmp;
+                   leaves, select_insert, full_table_list))
+    DBUG_RETURN(TRUE);
 
-  for (; leaves_tmp; leaves_tmp= leaves_tmp->next_leaf)
+  List_iterator<TABLE_LIST> ti(leaves);
+  TABLE_LIST *table_list;
+  while((table_list= ti++))
   {
-    if (leaves_tmp->belong_to_view && 
+    if (table_list->belong_to_view && !table_list->view && 
         check_single_table_access(thd, first_table ? want_access_first :
-                                  want_access, leaves_tmp, FALSE))
+                                  want_access, table_list, FALSE))
     {
       tables->hide_view_error(thd);
-      return TRUE;
+      DBUG_RETURN(TRUE);
     }
     first_table= 0;
   }
-  return FALSE;
+  DBUG_RETURN(FALSE);
 }
 
 
@@ -8187,8 +8276,10 @@ insert_fields(THD *thd, Name_resolution_context *context, const char *db_name,
        information_schema table, or a nested table reference. See the comment
        for TABLE_LIST.
     */
-    if (!((table && !tables->view && (table->grant.privilege & SELECT_ACL)) ||
-          (tables->view && (tables->grant.privilege & SELECT_ACL))) &&
+    if (!((table && tables->is_non_derived() &&
+          (table->grant.privilege & SELECT_ACL)) ||
+	  ((!tables->is_non_derived() && 
+	    (tables->grant.privilege & SELECT_ACL)))) &&
         !any_privileges)
     {
       field_iterator.set(tables);
@@ -8218,7 +8309,7 @@ insert_fields(THD *thd, Name_resolution_context *context, const char *db_name,
 
       if (!(item= field_iterator.create_item(thd)))
         DBUG_RETURN(TRUE);
-      DBUG_ASSERT(item->fixed);
+//      DBUG_ASSERT(item->fixed);
       /* cache the table for the Item_fields inserted by expanding stars */
       if (item->type() == Item::FIELD_ITEM && tables->cacheable_table)
         ((Item_field *)item)->cached_table= tables;
@@ -8330,6 +8421,29 @@ insert_fields(THD *thd, Name_resolution_context *context, const char *db_name,
 }
 
 
+/**
+  Wrap a lone identifier condition in an Item_direct_ref_to_ident
+
+  @param thd    thread handle; thd->stmt_arena is activated unless conventional
+  @param conds  in/out: FIELD_ITEM or REF_ITEM to wrap; unchanged if new fails
+*/
+
+void wrap_ident(THD *thd, Item **conds)
+{
+  Item_direct_ref_to_ident *wrapper;
+  DBUG_ASSERT((*conds)->type() == Item::FIELD_ITEM || (*conds)->type() == Item::REF_ITEM);
+  Query_arena *arena= thd->stmt_arena, backup;
+  if (arena->is_conventional())
+    arena= 0;                                   /* nothing to restore below */
+  else
+    thd->set_n_backup_active_arena(arena, &backup);
+  if ((wrapper= new Item_direct_ref_to_ident((Item_ident *)(*conds))))
+    (*conds)= (Item*) wrapper;                  /* replace only on success */
+  if (arena)
+    thd->restore_active_arena(arena, &backup);
+}
+
+
 /*
   Fix all conditions and outer join expressions.
 
@@ -8348,12 +8462,13 @@ insert_fields(THD *thd, Name_resolution_context *context, const char *db_name,
     FALSE if all is OK
 */
 
-int setup_conds(THD *thd, TABLE_LIST *tables, TABLE_LIST *leaves,
+int setup_conds(THD *thd, TABLE_LIST *tables, List<TABLE_LIST> &leaves,
                 COND **conds)
 {
   SELECT_LEX *select_lex= thd->lex->current_select;
   TABLE_LIST *table= NULL;	// For HP compilers
   TABLE_LIST *save_emb_on_expr_nest= thd->thd_marker.emb_on_expr_nest;
+  List_iterator<TABLE_LIST> ti(leaves);
   /*
     it_is_update set to TRUE when tables of primary SELECT_LEX (SELECT_LEX
     which belong to LEX, i.e. most up SELECT) will be updated by
@@ -8365,9 +8480,15 @@ int setup_conds(THD *thd, TABLE_LIST *tables, TABLE_LIST *leaves,
   bool it_is_update= (select_lex == &thd->lex->select_lex) &&
     thd->lex->which_check_option_applicable();
   bool save_is_item_list_lookup= select_lex->is_item_list_lookup;
-  select_lex->is_item_list_lookup= 0;
+  TABLE_LIST *derived= select_lex->master_unit()->derived;
   DBUG_ENTER("setup_conds");
 
+  /* Do not fix conditions for the derived tables that have been merged */
+  if (derived && derived->merged)
+    DBUG_RETURN(0);
+
+  select_lex->is_item_list_lookup= 0;
+
   thd->mark_used_columns= MARK_COLUMNS_READ;
   DBUG_PRINT("info", ("thd->mark_used_columns: %d", thd->mark_used_columns));
   select_lex->cond_count= 0;
@@ -8376,11 +8497,14 @@ int setup_conds(THD *thd, TABLE_LIST *tables, TABLE_LIST *leaves,
 
   for (table= tables; table; table= table->next_local)
   {
-    if (table->prepare_where(thd, conds, FALSE))
+    if (select_lex == &thd->lex->select_lex &&
+        select_lex->first_cond_optimization &&
+        table->merged_for_insert &&
+        table->prepare_where(thd, conds, FALSE))
       goto err_no_arena;
   }
 
-  thd->thd_marker.emb_on_expr_nest= (TABLE_LIST*)1;
+  thd->thd_marker.emb_on_expr_nest= NO_JOIN_NEST;
   if (*conds)
   {
     thd->where="where clause";
@@ -8388,6 +8512,12 @@ int setup_conds(THD *thd, TABLE_LIST *tables, TABLE_LIST *leaves,
                  print_where(*conds,
                              "WHERE in setup_conds",
                              QT_ORDINARY););
+    /*
+      Wrap a lone field in the WHERE clause: it may be the outer field of a
+      subquery that needs a persistent pointer to it, yet the optimizer may change conds
+    */
+    if ((*conds)->type() == Item::FIELD_ITEM && !derived)
+      wrap_ident(thd, conds);
     if ((!(*conds)->fixed && (*conds)->fix_fields(thd, conds)) ||
 	(*conds)->check_cols(1))
       goto err_no_arena;
@@ -8398,7 +8528,7 @@ int setup_conds(THD *thd, TABLE_LIST *tables, TABLE_LIST *leaves,
     Apply fix_fields() to all ON clauses at all levels of nesting,
     including the ones inside view definitions.
   */
-  for (table= leaves; table; table= table->next_leaf)
+  while ((table= ti++))
   {
     TABLE_LIST *embedded; /* The table at the current level of nesting. */
     TABLE_LIST *embedding= table; /* The parent nested table reference. */
@@ -8487,11 +8617,9 @@ fill_record(THD * thd, List<Item> &fields, List<Item> &values,
   List_iterator_fast<Item> f(fields),v(values);
   Item *value, *fld;
   Item_field *field;
-  TABLE *table= 0;
-  List<TABLE> tbl_list;
+  TABLE *table= 0, *vcol_table= 0;
   bool abort_on_warning_saved= thd->abort_on_warning;
   DBUG_ENTER("fill_record");
-  tbl_list.empty();
 
   /*
     Reset the table->auto_increment_field_not_null as it is valid for
@@ -8514,7 +8642,7 @@ fill_record(THD * thd, List<Item> &fields, List<Item> &values,
     f.rewind();
   }
   else if (thd->lex->unit.insert_table_with_stored_vcol)
-    tbl_list.push_back(thd->lex->unit.insert_table_with_stored_vcol);
+    vcol_table= thd->lex->unit.insert_table_with_stored_vcol;
   while ((fld= f++))
   {
     if (!(field= fld->filed_for_view_update()))
@@ -8532,43 +8660,27 @@ fill_record(THD * thd, List<Item> &fields, List<Item> &values,
         value->type() != Item::NULL_ITEM &&
         table->s->table_category != TABLE_CATEGORY_TEMPORARY)
     {
-      thd->abort_on_warning= FALSE;
       push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
                           ER_WARNING_NON_DEFAULT_VALUE_FOR_VIRTUAL_COLUMN,
                           ER(ER_WARNING_NON_DEFAULT_VALUE_FOR_VIRTUAL_COLUMN),
                           rfield->field_name, table->s->table_name.str);
-      thd->abort_on_warning= abort_on_warning_saved;
     }
     if ((value->save_in_field(rfield, 0) < 0) && !ignore_errors)
     {
       my_message(ER_UNKNOWN_ERROR, ER(ER_UNKNOWN_ERROR), MYF(0));
       goto err;
     }
-    tbl_list.push_back(table);
+    DBUG_ASSERT(vcol_table == 0 || vcol_table == table);
+    vcol_table= table;
   }
   /* Update virtual fields*/
   thd->abort_on_warning= FALSE;
-  if (tbl_list.head())
+  if (vcol_table)
   {
-    List_iterator_fast<TABLE> it(tbl_list);
-    TABLE *prev_table= 0;
-    while ((table= it++))
+    if (vcol_table->vfield)
     {
-      /*
-        Do simple optimization to prevent unnecessary re-generating 
-        values for virtual fields
-      */
-      if (table != prev_table)
-      {
-        prev_table= table;
-        if (table->vfield)
-        {
-          if (update_virtual_fields(thd, table, TRUE))
-          {
-            goto err;
-          }
-        }
-      }
+      if (update_virtual_fields(thd, vcol_table, TRUE))
+        goto err;
     }
   }
   thd->abort_on_warning= abort_on_warning_saved;
@@ -8667,28 +8779,33 @@ fill_record(THD *thd, Field **ptr, List<Item> &values, bool ignore_errors,
   List<TABLE> tbl_list;
   Item *value;
   TABLE *table= 0;
+  Field *field;
   bool abort_on_warning_saved= thd->abort_on_warning;
   DBUG_ENTER("fill_record");
 
-  Field *field;
-  tbl_list.empty();
+  if (!*ptr)
+  {
+    /* No fields to update, quite strange!*/
+    DBUG_RETURN(0);
+  }
+
+  /*
+    On INSERT or UPDATE fields are checked to be from the same table,
+    thus we safely can take table from the first field.
+  */
+  table= (*ptr)->table;
+
   /*
     Reset the table->auto_increment_field_not_null as it is valid for
     only one row.
   */
-  if (*ptr)
-  {
-    /*
-      On INSERT or UPDATE fields are checked to be from the same table,
-      thus we safely can take table from the first field.
-    */
-    table= (*ptr)->table;
-    table->auto_increment_field_not_null= FALSE;
-  }
+  table->auto_increment_field_not_null= FALSE;
   while ((field = *ptr++) && ! thd->is_error())
   {
+    /* Ensure that all fields are from the same table */
+    DBUG_ASSERT(field->table == table);
+
     value=v++;
-    table= field->table;
     if (field == table->next_number_field)
       table->auto_increment_field_not_null= TRUE;
     if (field->vcol_info && 
@@ -8696,52 +8813,28 @@ fill_record(THD *thd, Field **ptr, List<Item> &values, bool ignore_errors,
         value->type() != Item::NULL_ITEM &&
         table->s->table_category != TABLE_CATEGORY_TEMPORARY)
     {
-      thd->abort_on_warning= FALSE;
       push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
                           ER_WARNING_NON_DEFAULT_VALUE_FOR_VIRTUAL_COLUMN,
                           ER(ER_WARNING_NON_DEFAULT_VALUE_FOR_VIRTUAL_COLUMN),
                           field->field_name, table->s->table_name.str);
-      thd->abort_on_warning= abort_on_warning_saved;
     }
+
     if (use_value)
       value->save_val(field);
     else
       if (value->save_in_field(field, 0) < 0)
         goto err;
-    tbl_list.push_back(table);
   }
   /* Update virtual fields*/
   thd->abort_on_warning= FALSE;
-  if (tbl_list.head())
-  {
-    List_iterator_fast<TABLE> t(tbl_list);
-    TABLE *prev_table= 0;
-    while ((table= t++))
-    {
-      /*
-        Do simple optimization to prevent unnecessary re-generating 
-        values for virtual fields
-      */
-      if (table != prev_table)
-      {
-        prev_table= table;
-        if (table->vfield)
-        {
-          if (update_virtual_fields(thd, table, TRUE))
-          {
-            goto err;
-          }
-        }
-      }
-    }
-  }
+  if (table->vfield && update_virtual_fields(thd, table, TRUE))
+    goto err;
   thd->abort_on_warning= abort_on_warning_saved;
   DBUG_RETURN(thd->is_error());
 
 err:
   thd->abort_on_warning= abort_on_warning_saved;
-  if (table)
-    table->auto_increment_field_not_null= FALSE;
+  table->auto_increment_field_not_null= FALSE;
   DBUG_RETURN(TRUE);
 }
 
@@ -9278,12 +9371,6 @@ close_system_tables(THD *thd, Open_tables_backup *backup)
   held by the connection due to a preceding implicit
   commit.
 
-  This function assumes that there is no
-  statement transaction started for the operation
-  itself, since mysql.* tables are not transactional
-  and when they are used the binlog is off (DDL
-  binlogging is always statement-based.
-
   We need this function since we'd like to not
   just close the system table, but also release
   the metadata lock on it.
@@ -9297,8 +9384,8 @@ close_system_tables(THD *thd, Open_tables_backup *backup)
 void
 close_mysql_tables(THD *thd)
 {
-  /* No need to commit/rollback statement transaction, it's not started. */
-  DBUG_ASSERT(thd->transaction.stmt.is_empty());
+  if (! thd->in_sub_stmt)
+    trans_commit_stmt(thd);
   close_thread_tables(thd);
   thd->mdl_context.release_transactional_locks();
 }
@@ -9392,6 +9479,61 @@ void close_log_table(THD *thd, Open_tables_backup *backup)
   close_system_tables(thd, backup);
 }
 
+
+/**
+  @brief
+  Clear the 'fixed' flag on every item of a list
+
+  @param fields  list of items to un-fix
+
+  @details
+  Walks the 'fields' list and sets the 'fixed' flag of each item to 0.
+  It's needed to force correct marking of views' fields for INSERT/UPDATE
+  statements.
+*/
+
+void unfix_fields(List<Item> &fields)
+{
+  List_iterator<Item> it(fields);
+  Item *cur= it++;
+  for (; cur; cur= it++)
+    cur->fixed= 0;
+}
+
+
+/**
+  Check result of dynamic column function and issue error if it is needed
+
+  @param rc              The result code of dynamic column function
+
+  @return the result code that was passed in as the argument, unchanged
+*/
+
+int dynamic_column_error_message(enum_dyncol_func_result rc)
+{
+  switch (rc) {                                 // no default: other codes raise nothing
+  case ER_DYNCOL_YES:
+  case ER_DYNCOL_OK:
+    break; // it is not an error
+  case ER_DYNCOL_FORMAT:
+    my_error(ER_DYN_COL_WRONG_FORMAT, MYF(0));
+    break;
+  case ER_DYNCOL_LIMIT:
+    my_error(ER_DYN_COL_IMPLEMENTATION_LIMIT, MYF(0));
+    break;
+  case ER_DYNCOL_RESOURCE:
+    my_error(ER_OUT_OF_RESOURCES, MYF(0));
+    break;
+  case ER_DYNCOL_DATA:
+    my_error(ER_DYN_COL_DATA, MYF(0));
+    break;
+  case ER_DYNCOL_UNKNOWN_CHARSET:
+    my_error(ER_DYN_COL_WRONG_CHARSET, MYF(0));
+    break;
+  }
+  return rc;                                    // enum value returned as int
+}
+
 /**
   @} (end of group Data_Dictionary)
 */
diff --git a/sql/sql_base.h b/sql/sql_base.h
index a79c1a28e39..e602aba98a9 100644
--- a/sql/sql_base.h
+++ b/sql/sql_base.h
@@ -127,6 +127,7 @@ TABLE *open_ltable(THD *thd, TABLE_LIST *table_list, thr_lock_type update,
   be open do not acquire global and schema-scope IX locks.
 */
 #define MYSQL_OPEN_SKIP_SCOPED_MDL_LOCK         0x1000
+#define MYSQL_LOCK_NOT_TEMPORARY		0x2000
 
 /** Please refer to the internals manual. */
 #define MYSQL_OPEN_REOPEN  (MYSQL_OPEN_IGNORE_FLUSH |\
@@ -182,11 +183,14 @@ bool fill_record_n_invoke_before_triggers(THD *thd, Field **field,
 bool insert_fields(THD *thd, Name_resolution_context *context,
 		   const char *db_name, const char *table_name,
                    List_iterator<Item> *it, bool any_privileges);
+void make_leaves_list(List<TABLE_LIST> &list, TABLE_LIST *tables,
+                      bool full_table_list, TABLE_LIST *boundary);
 int setup_wild(THD *thd, TABLE_LIST *tables, List<Item> &fields,
 	       List<Item> *sum_func_list, uint wild_num);
 bool setup_fields(THD *thd, Item** ref_pointer_array,
                   List<Item> &item, enum_mark_columns mark_used_columns,
                   List<Item> *sum_func_list, bool allow_sum_func);
+void unfix_fields(List<Item> &items);
 bool fill_record(THD *thd, Field **field, List<Item> &values,
                  bool ignore_errors, bool use_value);
 
@@ -213,15 +217,17 @@ Item ** find_item_in_list(Item *item, List<Item> &items, uint *counter,
                           enum_resolution_type *resolution);
 bool setup_tables(THD *thd, Name_resolution_context *context,
                   List<TABLE_LIST> *from_clause, TABLE_LIST *tables,
-                  TABLE_LIST **leaves, bool select_insert);
+                  List<TABLE_LIST> &leaves, bool select_insert,
+                  bool full_table_list);
 bool setup_tables_and_check_access(THD *thd,
                                    Name_resolution_context *context,
                                    List<TABLE_LIST> *from_clause,
                                    TABLE_LIST *tables,
-                                   TABLE_LIST **leaves,
+                                   List<TABLE_LIST> &leaves, 
                                    bool select_insert,
                                    ulong want_access_first,
-                                   ulong want_access);
+                                   ulong want_access,
+                                   bool full_table_list);
 bool wait_while_table_is_used(THD *thd, TABLE *table,
                               enum ha_extra_function function);
 
@@ -230,8 +236,9 @@ void drop_open_table(THD *thd, TABLE *table, const char *db_name,
 void update_non_unique_table_error(TABLE_LIST *update,
                                    const char *operation,
                                    TABLE_LIST *duplicate);
-int setup_conds(THD *thd, TABLE_LIST *tables, TABLE_LIST *leaves,
+int setup_conds(THD *thd, TABLE_LIST *tables, List<TABLE_LIST> &leaves,
 		COND **conds);
+void wrap_ident(THD *thd, Item **conds);
 int setup_ftfuncs(SELECT_LEX* select);
 int init_ftfuncs(THD *thd, SELECT_LEX* select, bool no_order);
 bool lock_table_names(THD *thd, TABLE_LIST *table_list,
@@ -247,7 +254,8 @@ bool open_and_lock_tables(THD *thd, TABLE_LIST *tables,
 TABLE *open_n_lock_single_table(THD *thd, TABLE_LIST *table_l,
                                 thr_lock_type lock_type, uint flags,
                                 Prelocking_strategy *prelocking_strategy);
-bool open_normal_and_derived_tables(THD *thd, TABLE_LIST *tables, uint flags);
+bool open_normal_and_derived_tables(THD *thd, TABLE_LIST *tables, uint flags,
+                                    uint dt_phases);
 bool lock_tables(THD *thd, TABLE_LIST *tables, uint counter, uint flags);
 int decide_logging_format(THD *thd, TABLE_LIST *tables);
 void free_io_cache(TABLE *entry);
@@ -296,6 +304,7 @@ TABLE *find_table_for_mdl_upgrade(THD *thd, const char *db,
 void mark_tmp_table_for_reuse(TABLE *table);
 bool check_if_table_exists(THD *thd, TABLE_LIST *table, bool *exists);
 int update_virtual_fields(THD *thd, TABLE *table, bool ignore_stored= FALSE);
+int dynamic_column_error_message(enum_dyncol_func_result rc);
 
 extern TABLE *unused_tables;
 extern Item **not_found_item;
@@ -315,7 +324,7 @@ extern HASH table_def_cache;
 inline void setup_table_map(TABLE *table, TABLE_LIST *table_list, uint tablenr)
 {
   table->used_fields= 0;
-  table->const_table= 0;
+  table_list->reset_const_table();
   table->null_row= 0;
   table->status= STATUS_NO_RECORD;
   table->maybe_null= table_list->outer_join;
@@ -331,6 +340,14 @@ inline void setup_table_map(TABLE *table, TABLE_LIST *table_list, uint tablenr)
   table->force_index_order= table->force_index_group= 0;
   table->covering_keys= table->s->keys_for_keyread;
   table->merge_keys.clear_all();
+  TABLE_LIST *orig= table_list->select_lex ?
+    table_list->select_lex->master_unit()->derived : 0;
+  if (!orig || !orig->is_merged_derived())
+  {
+    /* Tables merged from derived were set up already.*/
+    table->covering_keys= table->s->keys_for_keyread;
+    table->merge_keys.clear_all();
+  }
 }
 
 inline TABLE_LIST *find_table_in_global_list(TABLE_LIST *table,
diff --git a/sql/sql_binlog.cc b/sql/sql_binlog.cc
index f08125ca22f..75a1af02bd2 100644
--- a/sql/sql_binlog.cc
+++ b/sql/sql_binlog.cc
@@ -184,7 +184,7 @@ void mysql_client_binlog_statement(THD* thd)
       */
       if (!have_fd_event)
       {
-        int type = bufptr[EVENT_TYPE_OFFSET];
+        int type = (uchar)bufptr[EVENT_TYPE_OFFSET];
         if (type == FORMAT_DESCRIPTION_EVENT || type == START_EVENT_V3)
           have_fd_event= TRUE;
         else
@@ -196,7 +196,8 @@ void mysql_client_binlog_statement(THD* thd)
       }
 
       ev= Log_event::read_log_event(bufptr, event_len, &error,
-                                    rli->relay_log.description_event_for_exec);
+                                    rli->relay_log.description_event_for_exec,
+                                    0);
 
       DBUG_PRINT("info",("binlog base64 err=%s", error));
       if (!ev)
diff --git a/sql/sql_bitmap.h b/sql/sql_bitmap.h
index 0449611e27a..558240ce19d 100644
--- a/sql/sql_bitmap.h
+++ b/sql/sql_bitmap.h
@@ -95,6 +95,10 @@ public:
     DBUG_ASSERT(sizeof(buffer) >= 4);
     return (ulonglong) uint4korr(buffer);
   }
+  uint bits_set()                               /* result of bitmap_bits_set() on map */
+  {
+    return bitmap_bits_set(&map);               // presumably the number of set bits
+  }
 };
 
 /* An iterator to quickly walk over bits in unlonglong bitmap. */
@@ -164,6 +168,17 @@ public:
   public:
     Iterator(Bitmap<64> &bmp) : Table_map_iterator(bmp.map) {}
   };
+  uint bits_set()
+  {
+    //TODO: use my_count_bits()
+    uint count= 0;
+    /*
+      Each round clears the lowest set bit, so this loops once per set bit.
+    */
+    for (ulonglong rest= map; rest; rest&= rest - 1)
+      count++;
+    return count;
+  }
 };
 
 
diff --git a/sql/sql_cache.cc b/sql/sql_cache.cc
index feb9614efe7..256b2a874cb 100644
--- a/sql/sql_cache.cc
+++ b/sql/sql_cache.cc
@@ -345,6 +345,8 @@ TODO list:
 #include "log_slow.h"
 #include "transaction.h"
 
+const uchar *query_state_map;
+
 #ifdef EMBEDDED_LIBRARY
 #include "emb_qcache.h"
 #endif
@@ -430,6 +432,136 @@ struct Query_cache_wait_state
 };
 
 
+/*
+  Check if character is white space: query_state_map marks it MY_LEX_SKIP.
+*/
+
+inline bool is_white_space(char c)
+{
+  return (query_state_map[(uint) ((uchar) c)] == MY_LEX_SKIP);
+}
+
+
+/**
+  Generate a query_string without query comments or duplicated space
+
+  @param new_query          New query without 'fluff' is stored here
+  @param query              Original query; asserted to be 0-terminated and
+                            not to start with white space
+  @param query_length       Length of original query
+  @param additional_length  Extra space for query cache we need to allocate
+                            in new_query buffer.
+
+  Note: if new_query cannot be allocated, new_query is set to refer to the
+  original query instead.
+*/
+
+static void make_base_query(String *new_query,
+                            const char *query, size_t query_length,
+                            size_t additional_length)
+{
+  char *buffer;
+  const char *query_end, *last_space;
+
+  /* The following is guaranteed by the query_cache interface */
+  DBUG_ASSERT(query[query_length] == 0);
+  DBUG_ASSERT(!is_white_space(query[0]));
+
+  if (new_query->realloc(query_length + additional_length))
+  {
+    /*
+      We could not allocate the query.  Use original query for
+      the query cache;  Better than nothing....
+    */
+    new_query->set(query, query_length, system_charset_info);
+    return;
+  }
+
+  buffer= (char*) new_query->ptr();             // Store base query here
+  query_end= query + query_length;
+  last_space= 0;                                // No space found yet
+
+  while (query < query_end)
+  {
+    char current = *(query++);
+    switch (current) {
+    case '\'':
+    case '`':
+    case '"':
+      *(buffer++)= current;                     // copy first quote
+      while (query < query_end)
+      {
+        *(buffer++)= *query;
+        if (*(query++) == current)              // found pair quote
+          break;
+      }
+      continue;                                 // Continue with next symbol
+    case '/':                                   // Start of comment ?
+      /*
+        Comments of format /#!number #/ must not be stripped; copy them.
+        These may include '"' and other comments, but it should
+        be safe to parse the content as a normal string.
+      */
+      if (query[0] != '*' || query[1] == '!')
+        break;
+
+      query++;                                  // skip "*" of the opening "/*"
+      for (; query < query_end; query++)        // test every char, so "/**/" ends
+      {
+        if (query[0] == '*' && query[1] == '/')
+        {
+          query+= 2;
+          goto insert_space;
+        }
+      }
+      continue;                                 // Will end outer loop
+    case '-':
+      if (*query != '-' || !is_white_space(query[1])) // Not a comment
+        break;
+      query++;                 // skip second "-", and go to search of "\n"
+      /* fall through */
+    case '#':
+      while (query < query_end)
+      {
+        if (*(query++) == '\n')
+          goto insert_space;
+      }
+      continue;                                 // Will end outer loop
+    default:
+      if (is_white_space(current))
+        goto insert_space;
+      break;
+    }
+    *(buffer++)= current;
+    continue;
+
+insert_space:
+    if (buffer != last_space)
+    {
+      *(buffer++)= ' ';
+      last_space= buffer;
+    }
+  }
+  if (buffer == last_space)
+    buffer--;                                   // Remove the last space
+  *buffer= 0;
+  new_query->length((size_t) (buffer - new_query->ptr()));
+}
+
+
+/**
+  Set the session query_cache_type to 0 when the global one is 0
+
+  @param thd             thread handle whose variables are updated
+*/
+
+void inline fix_local_query_cache_mode(THD *thd)
+{
+  if (global_system_variables.query_cache_type == 0)
+    thd->variables.query_cache_type= 0;
+}
+
+
 /**
   Serialize access to the query cache.
   If the lock cannot be granted the thread hangs in a conditional wait which
@@ -439,32 +571,42 @@ struct Query_cache_wait_state
   effect by another thread. This enables a quick path in execution to skip waits
   when the outcome is known.
 
-  @param use_timeout TRUE if the lock can abort because of a timeout.
+  @param mode TIMEOUT the lock can abort because of a timeout
+              TRY the lock can abort because it is locked now
+              WAIT wait for lock (default)
 
-  @note use_timeout is optional and default value is FALSE.
+  @note mode is optional and default value is WAIT.
 
   @return
    @retval FALSE An exclusive lock was taken
    @retval TRUE The locking attempt failed
 */
 
-bool Query_cache::try_lock(bool use_timeout)
+bool Query_cache::try_lock(THD *thd, Cache_try_lock_mode mode)
 {
-  bool interrupt= FALSE;
-  THD *thd= current_thd;
+  bool interrupt= TRUE;
   Query_cache_wait_state wait_state(thd, __func__, __FILE__, __LINE__);
   DBUG_ENTER("Query_cache::try_lock");
 
   mysql_mutex_lock(&structure_guard_mutex);
+  DBUG_EXECUTE_IF("status_wait_query_cache_mutex_sleep", { sleep(5); });
+  if (m_cache_status == DISABLED)
+  {
+    mysql_mutex_unlock(&structure_guard_mutex);
+    DBUG_RETURN(TRUE);
+  }
+  m_requests_in_progress++;
+  fix_local_query_cache_mode(thd);
+
   while (1)
   {
     if (m_cache_lock_status == Query_cache::UNLOCKED)
     {
       m_cache_lock_status= Query_cache::LOCKED;
 #ifndef DBUG_OFF
-      if (thd)
-        m_cache_lock_thread_id= thd->thread_id;
+      m_cache_lock_thread_id= thd->thread_id;
 #endif
+      interrupt= FALSE;
       break;
     }
     else if (m_cache_lock_status == Query_cache::LOCKED_NO_WAIT)
@@ -473,7 +615,6 @@ bool Query_cache::try_lock(bool use_timeout)
         If query cache is protected by a LOCKED_NO_WAIT lock this thread
         should avoid using the query cache as it is being evicted.
       */
-      interrupt= TRUE;
       break;
     }
     else
@@ -483,24 +624,34 @@ bool Query_cache::try_lock(bool use_timeout)
         To prevent send_result_to_client() and query_cache_insert() from
         blocking execution for too long a timeout is put on the lock.
       */
-      if (use_timeout)
+      if (mode == WAIT)
+      {
+        mysql_cond_wait(&COND_cache_status_changed, &structure_guard_mutex);
+      }
+      else if (mode == TIMEOUT)
       {
         struct timespec waittime;
         set_timespec_nsec(waittime,(ulong)(50000000L));  /* Wait for 50 msec */
         int res= mysql_cond_timedwait(&COND_cache_status_changed,
                                       &structure_guard_mutex, &waittime);
         if (res == ETIMEDOUT)
-        {
-          interrupt= TRUE;
           break;
-        }
       }
       else
       {
-        mysql_cond_wait(&COND_cache_status_changed, &structure_guard_mutex);
+        /**
+          If we are here, then mode is == TRY and there was someone else using
+          the query cache. (m_cache_lock_status != Query_cache::UNLOCKED).
+          Signal that we didn't get a lock.
+        */
+        DBUG_ASSERT(m_requests_in_progress > 1);
+        DBUG_ASSERT(mode == TRY);
+        break;
       }
     }
   }
+  if (interrupt)
+    m_requests_in_progress--;
   mysql_mutex_unlock(&structure_guard_mutex);
 
   DBUG_RETURN(interrupt);
@@ -525,10 +676,12 @@ void Query_cache::lock_and_suspend(void)
   DBUG_ENTER("Query_cache::lock_and_suspend");
 
   mysql_mutex_lock(&structure_guard_mutex);
+  m_requests_in_progress++;
   while (m_cache_lock_status != Query_cache::UNLOCKED)
     mysql_cond_wait(&COND_cache_status_changed, &structure_guard_mutex);
   m_cache_lock_status= Query_cache::LOCKED_NO_WAIT;
 #ifndef DBUG_OFF
+  /* Here thd may not be set during shutdown */
   if (thd)
     m_cache_lock_thread_id= thd->thread_id;
 #endif
@@ -547,19 +700,19 @@ void Query_cache::lock_and_suspend(void)
   It is used by all methods which invalidates one or more tables.
  */
 
-void Query_cache::lock(void)
+void Query_cache::lock(THD *thd)
 {
-  THD *thd= current_thd;
   Query_cache_wait_state wait_state(thd, __func__, __FILE__, __LINE__);
   DBUG_ENTER("Query_cache::lock");
 
   mysql_mutex_lock(&structure_guard_mutex);
+  m_requests_in_progress++;
+  fix_local_query_cache_mode(thd);
   while (m_cache_lock_status != Query_cache::UNLOCKED)
     mysql_cond_wait(&COND_cache_status_changed, &structure_guard_mutex);
   m_cache_lock_status= Query_cache::LOCKED;
 #ifndef DBUG_OFF
-  if (thd)
-    m_cache_lock_thread_id= thd->thread_id;
+  m_cache_lock_thread_id= thd->thread_id;
 #endif
   mysql_mutex_unlock(&structure_guard_mutex);
 
@@ -576,6 +729,7 @@ void Query_cache::unlock(void)
   DBUG_ENTER("Query_cache::unlock");
   mysql_mutex_lock(&structure_guard_mutex);
 #ifndef DBUG_OFF
+  /* Thd may not be set in resize() at mysqld start */
   THD *thd= current_thd;
   if (thd)
     DBUG_ASSERT(m_cache_lock_thread_id == thd->thread_id);
@@ -585,6 +739,14 @@ void Query_cache::unlock(void)
   m_cache_lock_status= Query_cache::UNLOCKED;
   DBUG_PRINT("Query_cache",("Sending signal"));
   mysql_cond_signal(&COND_cache_status_changed);
+  DBUG_ASSERT(m_requests_in_progress > 0);
+  m_requests_in_progress--;
+  if (m_requests_in_progress == 0 && m_cache_status == DISABLE_REQUEST)
+  {
+    /* No clients => just free query cache */
+    free_cache();
+    m_cache_status= DISABLED;
+  }
   mysql_mutex_unlock(&structure_guard_mutex);
   DBUG_VOID_RETURN;
 }
@@ -601,25 +763,24 @@ void Query_cache::unlock(void)
    @retval FALSE No directive found.
 */
  
-static bool has_no_cache_directive(char *sql)
+static bool has_no_cache_directive(const char *sql)
 {
-  int i=0;
-  while (sql[i] == ' ')
-    ++i;
+  while (is_white_space(*sql))
+    sql++;
     
-  if (my_toupper(system_charset_info, sql[i])    == 'S' &&
-      my_toupper(system_charset_info, sql[i+1])  == 'Q' &&
-      my_toupper(system_charset_info, sql[i+2])  == 'L' &&
-      my_toupper(system_charset_info, sql[i+3])  == '_' &&
-      my_toupper(system_charset_info, sql[i+4])  == 'N' &&
-      my_toupper(system_charset_info, sql[i+5])  == 'O' &&
-      my_toupper(system_charset_info, sql[i+6])  == '_' &&
-      my_toupper(system_charset_info, sql[i+7])  == 'C' &&
-      my_toupper(system_charset_info, sql[i+8])  == 'A' &&
-      my_toupper(system_charset_info, sql[i+9])  == 'C' &&
-      my_toupper(system_charset_info, sql[i+10]) == 'H' &&
-      my_toupper(system_charset_info, sql[i+11]) == 'E' &&
-      my_toupper(system_charset_info, sql[i+12]) == ' ')
+  if (my_toupper(system_charset_info, sql[0])  == 'S' &&
+      my_toupper(system_charset_info, sql[1])  == 'Q' &&
+      my_toupper(system_charset_info, sql[2])  == 'L' &&
+      my_toupper(system_charset_info, sql[3])  == '_' &&
+      my_toupper(system_charset_info, sql[4])  == 'N' &&
+      my_toupper(system_charset_info, sql[5])  == 'O' &&
+      my_toupper(system_charset_info, sql[6])  == '_' &&
+      my_toupper(system_charset_info, sql[7])  == 'C' &&
+      my_toupper(system_charset_info, sql[8])  == 'A' &&
+      my_toupper(system_charset_info, sql[9])  == 'C' &&
+      my_toupper(system_charset_info, sql[10]) == 'H' &&
+      my_toupper(system_charset_info, sql[11]) == 'E' &&
+      my_isspace(system_charset_info, sql[12]))
     return TRUE;
   
   return FALSE;       
@@ -893,13 +1054,19 @@ Query_cache::insert(Query_cache_tls *query_cache_tls,
 {
   DBUG_ENTER("Query_cache::insert");
 
-  /* See the comment on double-check locking usage above. */
+  /* First we check if query cache is disable without doing a mutex lock */
   if (is_disabled() || query_cache_tls->first_query_block == NULL)
     DBUG_VOID_RETURN;
 
+  DBUG_ASSERT(current_thd);
+
   QC_DEBUG_SYNC("wait_in_query_cache_insert");
 
-  if (try_lock())
+  /*
+    Lock the cache with try_lock(). try_lock() will fail if
+    cache was disabled between the above test and lock.
+  */
+  if (try_lock(current_thd, Query_cache::WAIT))
     DBUG_VOID_RETURN;
 
   Query_cache_block *query_block = query_cache_tls->first_query_block;
@@ -957,7 +1124,7 @@ Query_cache::abort(Query_cache_tls *query_cache_tls)
   if (is_disabled() || query_cache_tls->first_query_block == NULL)
     DBUG_VOID_RETURN;
 
-  if (try_lock())
+  if (try_lock(current_thd, Query_cache::WAIT))
     DBUG_VOID_RETURN;
 
   /*
@@ -1008,7 +1175,7 @@ void Query_cache::end_of_result(THD *thd)
                      emb_count_querycache_size(thd), 0);
 #endif
 
-  if (try_lock())
+  if (try_lock(thd, Query_cache::WAIT))
     DBUG_VOID_RETURN;
 
   query_block= query_cache_tls->first_query_block;
@@ -1092,7 +1259,8 @@ Query_cache::Query_cache(ulong query_cache_limit_arg,
   :query_cache_size(0),
    query_cache_limit(query_cache_limit_arg),
    queries_in_cache(0), hits(0), inserts(0), refused(0),
-   total_blocks(0), lowmem_prunes(0), m_query_cache_is_disabled(FALSE),
+   total_blocks(0), lowmem_prunes(0),
+   m_cache_status(OK),
    min_allocation_unit(ALIGN_SIZE(min_allocation_unit_arg)),
    min_result_data_size(ALIGN_SIZE(min_result_data_size_arg)),
    def_query_hash_size(ALIGN_SIZE(def_query_hash_size_arg)),
@@ -1116,6 +1284,13 @@ ulong Query_cache::resize(ulong query_cache_size_arg)
 			query_cache_size_arg));
   DBUG_ASSERT(initialized);
 
+  if (global_system_variables.query_cache_type == 0)
+  {
+    if (query_cache_size_arg != 0)
+      my_error(ER_QUERY_CACHE_IS_DISABLED, MYF(0));
+    DBUG_RETURN(0);
+  }
+
   lock_and_suspend();
 
   /*
@@ -1148,8 +1323,17 @@ ulong Query_cache::resize(ulong query_cache_size_arg)
   query_cache_size= query_cache_size_arg;
   new_query_cache_size= init_cache();
 
+  /*
+    m_cache_status is internal query cache switch so switching it on/off
+    will not be reflected on global_system_variables.query_cache_type
+  */
   if (new_query_cache_size)
+  {
     DBUG_EXECUTE("check_querycache",check_integrity(1););
+    m_cache_status= OK;                         // size > 0 => enable cache
+  }
+  else
+    m_cache_status= DISABLED;                   // size 0 means the cache disabled
 
   unlock();
   DBUG_RETURN(new_query_cache_size);
@@ -1168,6 +1352,9 @@ void Query_cache::store_query(THD *thd, TABLE_LIST *tables_used)
 {
   TABLE_COUNTER_TYPE local_tables;
   ulong tot_length;
+  const char *query;
+  size_t query_length;
+  uint8 tables_type;
   DBUG_ENTER("Query_cache::store_query");
   /*
     Testing 'query_cache_size' without a lock here is safe: the thing
@@ -1177,12 +1364,23 @@ void Query_cache::store_query(THD *thd, TABLE_LIST *tables_used)
 
     See also a note on double-check locking usage above.
   */
-  if (thd->locked_tables_mode || query_cache_size == 0)
+  if (!thd->query_cache_is_applicable || query_cache_size == 0)
+  {
+    DBUG_PRINT("qcache", ("Query cache not ready"));
     DBUG_VOID_RETURN;
-  uint8 tables_type= 0;
+  }
+  if (thd->lex->sql_command != SQLCOM_SELECT)
+  {
+    DBUG_PRINT("qcache", ("Ignoring not SELECT command"));
+    DBUG_VOID_RETURN;
+  }
 
-  if ((local_tables= is_cacheable(thd, thd->query_length(),
-				  thd->query(), thd->lex, tables_used,
+  /* The following assert fails if we haven't called send_result_to_client */
+  DBUG_ASSERT(thd->base_query.is_alloced() ||
+              thd->base_query.ptr() == thd->query());
+
+  tables_type= 0;
+  if ((local_tables= is_cacheable(thd, thd->lex, tables_used,
 				  &tables_type)))
   {
     NET *net= &thd->net;
@@ -1261,7 +1459,7 @@ def_week_frmt: %lu, in_trans: %d, autocommit: %d",
       The 'TRUE' parameter indicate that the lock is allowed to timeout
 
     */
-    if (try_lock(TRUE))
+    if (try_lock(thd, Query_cache::TIMEOUT))    // bounded wait; skip caching on timeout
       DBUG_VOID_RETURN;
     if (query_cache_size == 0)
     {
@@ -1277,11 +1475,13 @@ def_week_frmt: %lu, in_trans: %d, autocommit: %d",
       DBUG_VOID_RETURN;
     }
 
+    query=        thd->base_query.ptr();
+    query_length= thd->base_query.length();
+
     /* Key is query + database + flag */
     if (thd->db_length)
     {
-      memcpy(thd->query() + thd->query_length() + 1, thd->db, 
-        thd->db_length);
+      memcpy((char*) (query + query_length + 1), thd->db, thd->db_length);
       DBUG_PRINT("qcache", ("database: %s  length: %u",
 			    thd->db, (unsigned) thd->db_length)); 
     }
@@ -1289,24 +1489,24 @@ def_week_frmt: %lu, in_trans: %d, autocommit: %d",
     {
       DBUG_PRINT("qcache", ("No active database"));
     }
-    tot_length= thd->query_length() + thd->db_length + 1 +
+    tot_length= query_length + thd->db_length + 1 +
       QUERY_CACHE_FLAGS_SIZE;
     /*
       We should only copy structure (don't use it location directly)
       because of alignment issue
     */
-    memcpy((void*) (thd->query() + (tot_length - QUERY_CACHE_FLAGS_SIZE)),
+    memcpy((void*) (query + (tot_length - QUERY_CACHE_FLAGS_SIZE)),
 	   &flags, QUERY_CACHE_FLAGS_SIZE);
 
     /* Check if another thread is processing the same query? */
     Query_cache_block *competitor = (Query_cache_block *)
-      my_hash_search(&queries, (uchar*) thd->query(), tot_length);
+      my_hash_search(&queries, (uchar*) query, tot_length);
     DBUG_PRINT("qcache", ("competitor 0x%lx", (ulong) competitor));
     if (competitor == 0)
     {
       /* Query is not in cache and no one is working with it; Store it */
       Query_cache_block *query_block;
-      query_block= write_block_data(tot_length, (uchar*) thd->query(),
+      query_block= write_block_data(tot_length, (uchar*) query,
 				    ALIGN_SIZE(sizeof(Query_cache_query)),
 				    Query_cache_block::QUERY, local_tables);
       if (query_block != 0)
@@ -1363,7 +1563,7 @@ def_week_frmt: %lu, in_trans: %d, autocommit: %d",
       DBUG_PRINT("qcache", ("Another thread process same query"));
     }
   }
-  else if (thd->lex->sql_command == SQLCOM_SELECT)
+  else
     statistic_increment(refused, &structure_guard_mutex);
 
 end:
@@ -1427,7 +1627,7 @@ send_data_in_chunks(NET *net, const uchar *packet, ulong len)
   to the user.
 
   @param thd Pointer to the thread handler
-  @param sql A pointer to the sql statement *
+  @param org_sql A pointer to the sql statement *
   @param query_length Length of the statement in characters
 
   @return status code
@@ -1442,7 +1642,7 @@ send_data_in_chunks(NET *net, const uchar *packet, ulong len)
 */
 
 int
-Query_cache::send_result_to_client(THD *thd, char *sql, uint query_length)
+Query_cache::send_result_to_client(THD *thd, char *org_sql, uint query_length)
 {
   ulonglong engine_data;
   Query_cache_query *query;
@@ -1453,6 +1653,7 @@ Query_cache::send_result_to_client(THD *thd, char *sql, uint query_length)
   Query_cache_block_table *block_table, *block_table_end;
   ulong tot_length;
   Query_cache_query_flags flags;
+  const char *sql, *sql_end;
   DBUG_ENTER("Query_cache::send_result_to_client");
 
   /*
@@ -1463,50 +1664,95 @@ Query_cache::send_result_to_client(THD *thd, char *sql, uint query_length)
     See also a note on double-check locking usage above.
   */
   if (is_disabled() || thd->locked_tables_mode ||
-      thd->variables.query_cache_type == 0 || query_cache_size == 0)
+      thd->variables.query_cache_type == 0)
     goto err;
+  DBUG_ASSERT(query_cache_size != 0);           // otherwise cache would be disabled
 
-  if (!thd->lex->safe_to_cache_query)
+  thd->query_cache_is_applicable= 1;
+  sql= org_sql; sql_end= sql + query_length;
+
+  /*
+    Skip all comments at start of query. The following tests is false for
+    all normal queries.
+  */
+  if (!my_isalpha(system_charset_info, *sql))
+  {
+    while (sql < sql_end)
+    {
+      char current= *sql;
+      switch (current) {
+      case '/':
+        if (sql[1] != '*')
+          break;
+        sql+= 2;                              // Skip '/*'
+        if (*sql == '!')
+        {
+          /*
+            Found / *!number comment; Skip number to see if sql
+            starts with 'select'
+          */
+          sql++;
+          while (my_isdigit(system_charset_info, *sql))
+            sql++;
+        }
+        else
+        {
+          while (sql++ < sql_end)
+          {
+            if (sql[-1] == '*' && *sql == '/')
+            {
+              sql++;
+              break;
+            }
+          }
+        }
+        continue;
+      case '-':
+        if (sql[1] != '-' || !is_white_space(sql[2])) // Not a comment
+          break;
+        sql++;                               // Skip first '-'
+        /* Fall through */
+      case '#':
+        while (++sql < sql_end)
+        {
+          if (*sql == '\n')
+          {
+            sql++;                            // Skip '\n'
+            break;
+          }
+        }
+        /* Continue with analyzing current symbol */
+        continue;
+      case '\r':
+      case '\n':
+      case '\t':
+      case ' ':
+      case '(':    // To handle (select a from t1) union (select a from t1);
+        sql++;
+        continue;
+      default:
+        break;
+      }
+      /* We only come here when we found the first word of the sql */
+      break;
+    }
+  }
+  if ((my_toupper(system_charset_info, sql[0]) != 'S' ||
+       my_toupper(system_charset_info, sql[1]) != 'E' ||
+       my_toupper(system_charset_info, sql[2]) != 'L'))
   {
-    DBUG_PRINT("qcache", ("SELECT is non-cacheable"));
+    DBUG_PRINT("qcache", ("The statement is not a SELECT; Not cached"));
     goto err;
   }
 
+  if ((sql_end - sql) > 20 && has_no_cache_directive(sql+6))
   {
-    uint i= 0;
-    /*
-      Skip '(' characters in queries like following:
-      (select a from t1) union (select a from t1);
-    */
-    while (sql[i]=='(')
-      i++;
-
     /*
-      Test if the query is a SELECT
-      (pre-space is removed in dispatch_command).
-
-      First '/' looks like comment before command it is not
-      frequently appeared in real life, consequently we can
-      check all such queries, too.
+      We do not increase 'refused' statistics here since it will be done
+      later when the query is parsed.
     */
-    if ((my_toupper(system_charset_info, sql[i])     != 'S' ||
-         my_toupper(system_charset_info, sql[i + 1]) != 'E' ||
-         my_toupper(system_charset_info, sql[i + 2]) != 'L') &&
-        sql[i] != '/')
-    {
-      DBUG_PRINT("qcache", ("The statement is not a SELECT; Not cached"));
-      goto err;
-    }
-    
-    if (query_length > 20 && has_no_cache_directive(&sql[i+6]))
-    {
-      /*
-        We do not increase 'refused' statistics here since it will be done
-        later when the query is parsed.
-      */
-      DBUG_PRINT("qcache", ("The statement has a SQL_NO_CACHE directive"));
-      goto err;
-    }
+    DBUG_PRINT("qcache", ("The statement has a SQL_NO_CACHE directive"));
+    goto err;
   }
 
   /*
@@ -1514,20 +1760,32 @@ Query_cache::send_result_to_client(THD *thd, char *sql, uint query_length)
     disabled or if a full cache flush is in progress, the attempt to
     get the lock is aborted.
 
-    The 'TRUE' parameter indicate that the lock is allowed to timeout
+    The TIMEOUT parameter indicates that the lock is allowed to timeout.
   */
-  if (try_lock(TRUE))
+  if (try_lock(thd, Query_cache::TIMEOUT))
     goto err;
 
   if (query_cache_size == 0)
     goto err_unlock;
 
   Query_cache_block *query_block;
+  if (opt_query_cache_strip_comments)
+  {
+    make_base_query(&thd->base_query, sql, (size_t) (sql_end - sql),
+                    thd->db_length + 1 + QUERY_CACHE_FLAGS_SIZE);
+    sql=          thd->base_query.ptr();
+    query_length= thd->base_query.length();
+  }
+  else
+  {
+    sql= org_sql;
+    thd->base_query.set(sql, query_length, system_charset_info);
+  }
 
   tot_length= query_length + thd->db_length + 1 + QUERY_CACHE_FLAGS_SIZE;
   if (thd->db_length)
   {
-    memcpy(sql+query_length+1, thd->db, thd->db_length);
+    memcpy((char*) (sql+query_length+1), thd->db, thd->db_length);
     DBUG_PRINT("qcache", ("database: '%s'  length: %u",
 			  thd->db, (unsigned)thd->db_length));
   }
@@ -1658,7 +1916,7 @@ def_week_frmt: %lu, in_trans: %d, autocommit: %d",
           temporary tables => assign following variable to make check
           faster.
         */
-        thd->lex->safe_to_cache_query=0;
+        thd->query_cache_is_applicable= 0;      // Query can't be cached
         BLOCK_UNLOCK_RD(query_block);
         DBUG_RETURN(-1);
       }
@@ -1674,7 +1932,7 @@ def_week_frmt: %lu, in_trans: %d, autocommit: %d",
 		 ("probably no SELECT access to %s.%s =>  return to normal processing",
 		  table_list.db, table_list.alias));
       unlock();
-      thd->lex->safe_to_cache_query=0;		// Don't try to cache this
+      thd->query_cache_is_applicable= 0;        // Query can't be cached
       BLOCK_UNLOCK_RD(query_block);
       DBUG_RETURN(-1);				// Privilege error
     }
@@ -1683,7 +1941,7 @@ def_week_frmt: %lu, in_trans: %d, autocommit: %d",
       DBUG_PRINT("qcache", ("Need to check column privileges for %s.%s",
 			    table_list.db, table_list.alias));
       BLOCK_UNLOCK_RD(query_block);
-      thd->lex->safe_to_cache_query= 0;		// Don't try to cache this
+      thd->query_cache_is_applicable= 0;        // Query can't be cached
       goto err_unlock;				// Parse query
     }
 #endif /*!NO_EMBEDDED_ACCESS_CHECKS*/
@@ -1707,7 +1965,7 @@ def_week_frmt: %lu, in_trans: %d, autocommit: %d",
                                   table->key_length());
       }
       else
-        thd->lex->safe_to_cache_query= 0;       // Don't try to cache this
+        thd->query_cache_is_applicable= 0;      // Query can't be cached
       /* End the statement transaction potentially started by engine. */
       trans_rollback_stmt(thd);
       goto err_unlock;				// Parse query
@@ -1769,13 +2027,16 @@ def_week_frmt: %lu, in_trans: %d, autocommit: %d",
 
 err_unlock:
   unlock();
-err:
   MYSQL_QUERY_CACHE_MISS(thd->query());
   /*
     query_plan_flags doesn't have to be changed here as it contains
     QPLAN_QC_NO by default
   */
   DBUG_RETURN(0);				// Query was not cached
+
+err:
+  thd->query_cache_is_applicable= 0;            // Query can't be cached
+  DBUG_RETURN(0);				// Query was not cached
 }
 
 
@@ -1814,13 +2075,12 @@ void Query_cache::invalidate(THD *thd, TABLE_LIST *tables_used,
   DBUG_VOID_RETURN;
 }
 
-void Query_cache::invalidate(CHANGED_TABLE_LIST *tables_used)
+void Query_cache::invalidate(THD *thd, CHANGED_TABLE_LIST *tables_used)
 {
   DBUG_ENTER("Query_cache::invalidate (changed table list)");
   if (is_disabled())
     DBUG_VOID_RETURN;
 
-  THD *thd= current_thd;
   for (; tables_used; tables_used= tables_used->next)
   {
     thd_proc_info(thd, "invalidating query cache entries (table list)");
@@ -1843,13 +2103,13 @@ void Query_cache::invalidate(CHANGED_TABLE_LIST *tables_used)
   NOTE
     can be used only for opened tables
 */
-void Query_cache::invalidate_locked_for_write(TABLE_LIST *tables_used)
+void Query_cache::invalidate_locked_for_write(THD *thd,
+                                              TABLE_LIST *tables_used)
 {
   DBUG_ENTER("Query_cache::invalidate_locked_for_write");
   if (is_disabled())
     DBUG_VOID_RETURN;
 
-  THD *thd= current_thd;
   for (; tables_used; tables_used= tables_used->next_local)
   {
     thd_proc_info(thd, "invalidating query cache entries (table)");
@@ -1905,9 +2165,8 @@ void Query_cache::invalidate(THD *thd, const char *key, uint32  key_length,
    Remove all cached queries that uses the given database.
 */
 
-void Query_cache::invalidate(char *db)
+void Query_cache::invalidate(THD *thd, char *db)
 {
-  
   DBUG_ENTER("Query_cache::invalidate (db)");
   if (is_disabled())
     DBUG_VOID_RETURN;
@@ -1917,9 +2176,7 @@ void Query_cache::invalidate(char *db)
     Lock the query cache and queue all invalidation attempts to avoid
     the risk of a race between invalidation, cache inserts and flushes.
   */
-  lock();
-
-  THD *thd= current_thd;
+  lock(thd);
 
   if (query_cache_size > 0)
   {
@@ -2026,7 +2283,7 @@ void Query_cache::flush()
 
 */
 
-void Query_cache::pack(ulong join_limit, uint iteration_limit)
+void Query_cache::pack(THD *thd, ulong join_limit, uint iteration_limit)
 {
   DBUG_ENTER("Query_cache::pack");
 
@@ -2037,7 +2294,7 @@ void Query_cache::pack(ulong join_limit, uint iteration_limit)
     If the entire qc is being invalidated we can bail out early
     instead of waiting for the lock.
   */
-  if (try_lock())
+  if (try_lock(thd, Query_cache::WAIT))
     DBUG_VOID_RETURN;
 
   if (query_cache_size == 0)
@@ -2074,11 +2331,25 @@ void Query_cache::destroy()
     mysql_cond_destroy(&COND_cache_status_changed);
     mysql_mutex_destroy(&structure_guard_mutex);
     initialized = 0;
+    DBUG_ASSERT(m_requests_in_progress == 0);
   }
   DBUG_VOID_RETURN;
 }
 
 
+void Query_cache::disable_query_cache(THD *thd)
+{
+  m_cache_status= DISABLE_REQUEST;
+  /*
+    If there are no requests in progress, try to free the buffer.
+    try_lock(TRY) will exit immediately if there is a lock.
+    unlock() should free the block.
+  */
+  if (m_requests_in_progress == 0 && !try_lock(thd, TRY))
+    unlock();
+}
+
+
 /*****************************************************************************
   init/destroy
 *****************************************************************************/
@@ -2091,16 +2362,21 @@ void Query_cache::init()
   mysql_cond_init(key_COND_cache_status_changed,
                   &COND_cache_status_changed, NULL);
   m_cache_lock_status= Query_cache::UNLOCKED;
+  m_cache_status= Query_cache::OK;
+  m_requests_in_progress= 0;
   initialized = 1;
+  query_state_map= default_charset_info->state_map;
   /*
-    If we explicitly turn off query cache from the command line query cache will
-    be disabled for the reminder of the server life time. This is because we
-    want to avoid locking the QC specific mutex if query cache isn't going to
-    be used.
+    If we explicitly turn off the query cache from the command line, the
+    query cache will be disabled for the remainder of the server
+    lifetime. This is because we want to avoid locking the QC specific
+    mutex if the query cache isn't going to be used.
   */
   if (global_system_variables.query_cache_type == 0)
-    query_cache.disable_query_cache();
-
+  {
+    free_cache();
+    m_cache_status= DISABLED;
+  }
   DBUG_VOID_RETURN;
 }
 
@@ -2313,6 +2589,18 @@ void Query_cache::free_cache()
 {
   DBUG_ENTER("Query_cache::free_cache");
 
+  /* Destroy locks */
+  Query_cache_block *block= queries_blocks;
+  if (block)
+  {
+    do
+    {
+      Query_cache_query *query= block->query();
+      mysql_rwlock_destroy(&query->lock);
+      block= block->next;
+    } while (block != queries_blocks);
+  }
+
   my_free(cache);
   make_disabled();
   my_hash_free(&queries);
@@ -2792,7 +3080,7 @@ void Query_cache::invalidate_table(THD *thd, uchar * key, uint32  key_length)
     Lock the query cache and queue all invalidation attempts to avoid
     the risk of a race between invalidation, cache inserts and flushes.
   */
-  lock();
+  lock(thd);
 
   DEBUG_SYNC(thd, "wait_in_query_cache_invalidate2");
 
@@ -3570,7 +3858,7 @@ Query_cache::process_and_count_tables(THD *thd, TABLE_LIST *tables_used,
     {
       DBUG_PRINT("qcache", ("Don't cache statement as it refers to "
                             "tables with column privileges."));
-      thd->lex->safe_to_cache_query= 0;
+      thd->query_cache_is_applicable= 0;        // Query can't be cached
       DBUG_RETURN(0);
     }
 #endif
@@ -3583,16 +3871,17 @@ Query_cache::process_and_count_tables(THD *thd, TABLE_LIST *tables_used,
     }
     else
     {
-      DBUG_PRINT("qcache", ("table: %s  db:  %s  type: %u",
-                            tables_used->table->s->table_name.str,
-                            tables_used->table->s->db.str,
-                            tables_used->table->s->db_type()->db_type));
       if (tables_used->derived)
       {
+        DBUG_PRINT("qcache", ("table: %s", tables_used->alias));
         table_count--;
         DBUG_PRINT("qcache", ("derived table skipped"));
         continue;
       }
+      DBUG_PRINT("qcache", ("table: %s  db:  %s  type: %u",
+                            tables_used->table->s->table_name.str,
+                            tables_used->table->s->db.str,
+                            tables_used->table->s->db_type()->db_type));
       *tables_type|= tables_used->table->file->table_cache_type();
 
       /*
@@ -3635,14 +3924,13 @@ Query_cache::process_and_count_tables(THD *thd, TABLE_LIST *tables_used,
 */
 
 TABLE_COUNTER_TYPE
-Query_cache::is_cacheable(THD *thd, size_t query_len, const char *query,
-                          LEX *lex,
+Query_cache::is_cacheable(THD *thd, LEX *lex,
                           TABLE_LIST *tables_used, uint8 *tables_type)
 {
   TABLE_COUNTER_TYPE table_count;
   DBUG_ENTER("Query_cache::is_cacheable");
 
-  if (query_cache_is_cacheable_query(lex) &&
+  if (thd->lex->safe_to_cache_query &&
       (thd->variables.query_cache_type == 1 ||
        (thd->variables.query_cache_type == 2 && (lex->select_lex.options &
 						 OPTION_TO_QUERY_CACHE))))
@@ -3707,7 +3995,7 @@ my_bool Query_cache::ask_handler_allowance(THD *thd,
     {
       DBUG_PRINT("qcache", ("Handler does not allow caching for %s.%s",
 			    tables_used->db, tables_used->alias));
-      thd->lex->safe_to_cache_query= 0;          // Don't try to cache this
+      thd->query_cache_is_applicable= 0;        // Query can't be cached
       DBUG_RETURN(1);
     }
   }
diff --git a/sql/sql_cache.h b/sql/sql_cache.h
index 5a6c8e25c77..13efcfd85e9 100644
--- a/sql/sql_cache.h
+++ b/sql/sql_cache.h
@@ -293,14 +293,14 @@ private:
   my_thread_id m_cache_lock_thread_id;
 #endif
   mysql_cond_t COND_cache_status_changed;
+  uint m_requests_in_progress;
   enum Cache_lock_status { UNLOCKED, LOCKED_NO_WAIT, LOCKED };
   Cache_lock_status m_cache_lock_status;
-
-  bool m_query_cache_is_disabled;
+  enum Cache_staus {OK, DISABLE_REQUEST, DISABLED};
+  Cache_staus m_cache_status;
 
   void free_query_internal(Query_cache_block *point);
   void invalidate_table_internal(THD *thd, uchar *key, uint32 key_length);
-  void disable_query_cache(void) { m_query_cache_is_disabled= TRUE; }
 
 protected:
   /*
@@ -312,7 +312,7 @@ protected:
       2. query block (for operation inside query (query block/results))
 
     Thread doing cache flush releases the mutex once it sets
-    m_cache_status flag, so other threads may bypass the cache as
+    m_cache_lock_status flag, so other threads may bypass the cache as
     if it is disabled, not waiting for reset to finish.  The exception
     is other threads that were going to do cache flush---they'll wait
     till the end of a flush operation.
@@ -427,8 +427,7 @@ protected:
     If query is cacheable return number tables in query
     (query without tables not cached)
   */
-  TABLE_COUNTER_TYPE is_cacheable(THD *thd, size_t query_len,
-                                  const char *query,
+  TABLE_COUNTER_TYPE is_cacheable(THD *thd,
                                   LEX *lex, TABLE_LIST *tables_used,
                                   uint8 *tables_type);
   TABLE_COUNTER_TYPE process_and_count_tables(THD *thd,
@@ -444,7 +443,9 @@ protected:
 	      uint def_query_hash_size = QUERY_CACHE_DEF_QUERY_HASH_SIZE,
 	      uint def_table_hash_size = QUERY_CACHE_DEF_TABLE_HASH_SIZE);
 
-  bool is_disabled(void) { return m_query_cache_is_disabled; }
+  bool is_disabled(void) { return m_cache_status != OK; }
+  bool is_disable_in_progress(void)
+  { return m_cache_status == DISABLE_REQUEST; }
 
   /* initialize cache (mutex) */
   void init();
@@ -465,22 +466,23 @@ protected:
   int send_result_to_client(THD *thd, char *query, uint query_length);
 
   /* Remove all queries that uses any of the listed following tables */
-  void invalidate(THD* thd, TABLE_LIST *tables_used,
+  void invalidate(THD *thd, TABLE_LIST *tables_used,
 		  my_bool using_transactions);
-  void invalidate(CHANGED_TABLE_LIST *tables_used);
-  void invalidate_locked_for_write(TABLE_LIST *tables_used);
-  void invalidate(THD* thd, TABLE *table, my_bool using_transactions);
+  void invalidate(THD *thd, CHANGED_TABLE_LIST *tables_used);
+  void invalidate_locked_for_write(THD *thd, TABLE_LIST *tables_used);
+  void invalidate(THD *thd, TABLE *table, my_bool using_transactions);
   void invalidate(THD *thd, const char *key, uint32  key_length,
 		  my_bool using_transactions);
 
   /* Remove all queries that uses any of the tables in following database */
-  void invalidate(char *db);
+  void invalidate(THD *thd, char *db);
 
   /* Remove all queries that uses any of the listed following table */
   void invalidate_by_MyISAM_filename(const char *filename);
 
   void flush();
-  void pack(ulong join_limit = QUERY_CACHE_PACK_LIMIT,
+  void pack(THD *thd,
+            ulong join_limit = QUERY_CACHE_PACK_LIMIT,
 	    uint iteration_limit = QUERY_CACHE_PACK_ITERATION);
 
   void destroy();
@@ -511,10 +513,13 @@ protected:
 			const char *name);
   my_bool in_blocks(Query_cache_block * point);
 
-  bool try_lock(bool use_timeout= FALSE);
-  void lock(void);
+  enum Cache_try_lock_mode {WAIT, TIMEOUT, TRY};
+  bool try_lock(THD *thd, Cache_try_lock_mode mode= WAIT);
+  void lock(THD *thd);
   void lock_and_suspend(void);
   void unlock(void);
+
+  void disable_query_cache(THD *thd);
 };
 
 #ifdef HAVE_QUERY_CACHE
@@ -550,7 +555,7 @@ struct Query_cache_query_flags
 #define query_cache_resize(A) query_cache.resize(A)
 #define query_cache_set_min_res_unit(A) query_cache.set_min_res_unit(A)
 #define query_cache_invalidate3(A, B, C) query_cache.invalidate(A, B, C)
-#define query_cache_invalidate1(A) query_cache.invalidate(A)
+#define query_cache_invalidate1(A, B) query_cache.invalidate(A, B)
 #define query_cache_send_result_to_client(A, B, C) \
   query_cache.send_result_to_client(A, B, C)
 #define query_cache_invalidate_by_MyISAM_filename_ref \
@@ -562,20 +567,19 @@ struct Query_cache_query_flags
   (((L)->sql_command == SQLCOM_SELECT) && (L)->safe_to_cache_query)
 #else
 #define QUERY_CACHE_FLAGS_SIZE 0
-#define query_cache_store_query(A, B)
-#define query_cache_destroy()
-#define query_cache_result_size_limit(A)
-#define query_cache_init()
-#define query_cache_resize(A)
-#define query_cache_set_min_res_unit(A)
-#define query_cache_invalidate3(A, B, C)
-#define query_cache_invalidate1(A)
+#define query_cache_store_query(A, B)     do { } while(0)
+#define query_cache_destroy()             do { } while(0)
+#define query_cache_result_size_limit(A)  do { } while(0)
+#define query_cache_init()                do { } while(0)
+#define query_cache_resize(A)             do { } while(0)
+#define query_cache_set_min_res_unit(A)   do { } while(0)
+#define query_cache_invalidate3(A, B, C)  do { } while(0)
+#define query_cache_invalidate1(A,B)      do { } while(0)
 #define query_cache_send_result_to_client(A, B, C) 0
 #define query_cache_invalidate_by_MyISAM_filename_ref NULL
 
-#define query_cache_abort(A)
-#define query_cache_end_of_result(A)
-#define query_cache_invalidate_by_MyISAM_filename_ref NULL
+#define query_cache_abort(A)              do { } while(0)
+#define query_cache_end_of_result(A)      do { } while(0)
 #define query_cache_maybe_disabled(T) 1
 #define query_cache_is_cacheable_query(L) 0
 #endif /*HAVE_QUERY_CACHE*/
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index 3a4c98410e1..2f15a3176f9 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -350,26 +350,6 @@ THD *thd_get_current_thd()
 }
 
 /**
-  Set up various THD data for a new connection
-
-  thd_new_connection_setup
-
-  @param              thd            THD object
-  @param              stack_start    Start of stack for connection
-*/
-void thd_new_connection_setup(THD *thd, char *stack_start)
-{
-  thd->set_time();
-  thd->prior_thr_create_utime= thd->thr_create_utime= thd->start_utime=
-    my_micro_time();
-  threads.append(thd);
-  thd_unlock_thread_count(thd);
-  DBUG_PRINT("info", ("init new connection. thd: 0x%lx fd: %d",
-          (ulong)thd, thd->net.vio->sd));
-  thd_set_thread_stack(thd, stack_start);
-}
-
-/**
   Lock data that needs protection in THD object
 
   @param thd                   THD object
@@ -510,13 +490,11 @@ int thd_tablespace_op(const THD *thd)
 
 
 extern "C"
-const char *set_thd_proc_info(void *thd_arg, const char *info,
+const char *set_thd_proc_info(THD *thd, const char *info,
                               const char *calling_function,
                               const char *calling_file,
                               const unsigned int calling_line)
 {
-  THD *thd= (THD *) thd_arg;
-
   if (!thd)
     thd= current_thd;
 
@@ -746,7 +724,7 @@ THD::THD()
    :Statement(&main_lex, &main_mem_root, STMT_CONVENTIONAL_EXECUTION,
               /* statement id */ 0),
    rli_fake(0),
-   user_time(0), in_sub_stmt(0),
+   in_sub_stmt(0),
    binlog_unsafe_warning_flags(0),
    binlog_table_maps(0),
    table_map_for_update(0),
@@ -758,6 +736,7 @@ THD::THD()
    examined_row_count(0),
    warning_info(&main_warning_info),
    stmt_da(&main_da),
+   global_disable_checkpoint(0),
    is_fatal_error(0),
    transaction_rollback_request(0),
    is_fatal_sub_stmt_error(0),
@@ -791,7 +770,7 @@ THD::THD()
   security_ctx= &main_security_ctx;
   no_errors= 0;
   password= 0;
-  query_start_used= 0;
+  query_start_used= query_start_sec_part_used= 0;
   count_cuted_fields= CHECK_FIELD_IGNORE;
   killed= NOT_KILLED;
   col_access=0;
@@ -806,9 +785,11 @@ THD::THD()
   statement_id_counter= 0UL;
   // Must be reset to handle error with THD's created for init of mysqld
   lex->current_select= 0;
-  start_time=(time_t) 0;
+  user_time.val= start_time= start_time_sec_part= 0;
   start_utime= prior_thr_create_utime= 0L;
   utime_after_lock= 0L;
+  progress.report_to_client= 0;
+  progress.max_counter= 0;
   current_linfo =  0;
   slave_thread = 0;
   bzero(&variables, sizeof(variables));
@@ -845,6 +826,8 @@ THD::THD()
   active_vio = 0;
 #endif
   mysql_mutex_init(key_LOCK_thd_data, &LOCK_thd_data, MY_MUTEX_INIT_FAST);
+  mysql_mutex_init(key_LOCK_wakeup_ready, &LOCK_wakeup_ready, MY_MUTEX_INIT_FAST);
+  mysql_cond_init(key_COND_wakeup_ready, &COND_wakeup_ready, 0);
 
   /* Variables with default values */
   proc_info="login";
@@ -892,6 +875,9 @@ THD::THD()
   arena_for_cached_items= 0;
   memset(&invoker_user, 0, sizeof(invoker_user));
   memset(&invoker_host, 0, sizeof(invoker_host));
+  prepare_derived_at_open= FALSE;
+  create_tmp_table_for_derived= FALSE;
+  save_prep_leaf_list= FALSE;
 }
 
 
@@ -1242,17 +1228,21 @@ void THD::update_stats(void)
 
 void THD::update_all_stats()
 {
-  time_t save_time;
   ulonglong end_cpu_time, end_utime;
   double busy_time, cpu_time;
 
+  /* Reset status variables used by information_schema.processlist */
+  progress.max_counter= 0;
+  progress.max_stage= 0;
+  progress.report= 0;
+
   /* This is set at start of query if opt_userstat_running was set */
   if (!userstat_running)
     return;
 
   end_cpu_time= my_getcputime();
-  end_utime=    my_micro_time_and_time(&save_time);
-  busy_time= (end_utime - start_utime)  / 1000000.0;
+  end_utime=    microsecond_interval_timer();
+  busy_time= (end_utime - start_utime) / 1000000.0;
   cpu_time=  (end_cpu_time - start_cpu_time) / 10000000.0;
   /* In case there are bad values, 2629743 is the #seconds in a month. */
   if (cpu_time > 2629743.0)
@@ -1260,7 +1250,8 @@ void THD::update_all_stats()
   status_var_add(status_var.cpu_time, cpu_time);
   status_var_add(status_var.busy_time, busy_time);
 
-  update_global_user_stats(this, TRUE, save_time);
+  update_global_user_stats(this, TRUE, my_time(0));
+  // Has to be updated after update_global_user_stats()
   userstat_running= 0;
 }
 
@@ -1354,6 +1345,11 @@ void THD::cleanup(void)
 
   /* All metadata locks must have been released by now. */
   DBUG_ASSERT(!mdl_context.has_locks());
+  if (user_connect)
+  {
+    decrease_user_connections(user_connect);
+    user_connect= 0;                            // Safety
+  }
   wt_thd_destroy(&transaction.wt);
 
 #if defined(ENABLED_DEBUG_SYNC)
@@ -1412,6 +1408,8 @@ THD::~THD()
   my_free(db);
   db= NULL;
   free_root(&transaction.mem_root,MYF(0));
+  mysql_cond_destroy(&COND_wakeup_ready);
+  mysql_mutex_destroy(&LOCK_wakeup_ready);
   mysql_mutex_destroy(&LOCK_thd_data);
 #ifndef DBUG_OFF
   dbug_sentry= THD_SENTRY_GONE;
@@ -1455,9 +1453,12 @@ void add_to_status(STATUS_VAR *to_var, STATUS_VAR *from_var)
     *(to++)+= *(from++);
 
   /* Handle the not ulong variables. See end of system_status_var */
-  to_var->bytes_received=       from_var->bytes_received;
+  to_var->bytes_received+=      from_var->bytes_received;
   to_var->bytes_sent+=          from_var->bytes_sent;
-  to_var->binlog_bytes_written= from_var->binlog_bytes_written;
+  to_var->rows_read+=           from_var->rows_read;
+  to_var->rows_sent+=           from_var->rows_sent;
+  to_var->rows_tmp_read+=       from_var->rows_tmp_read;
+  to_var->binlog_bytes_written+= from_var->binlog_bytes_written;
   to_var->cpu_time+=            from_var->cpu_time;
   to_var->busy_time+=           from_var->busy_time;
 }
@@ -1490,6 +1491,9 @@ void add_diff_to_status(STATUS_VAR *to_var, STATUS_VAR *from_var,
   to_var->bytes_received+=       from_var->bytes_received -
                                  dec_var->bytes_received;
   to_var->bytes_sent+=           from_var->bytes_sent - dec_var->bytes_sent;
+  to_var->rows_read+=            from_var->rows_read - dec_var->rows_read;
+  to_var->rows_sent+=            from_var->rows_sent - dec_var->rows_sent;
+  to_var->rows_tmp_read+=        from_var->rows_tmp_read - dec_var->rows_tmp_read;
   to_var->binlog_bytes_written+= from_var->binlog_bytes_written -
                                  dec_var->binlog_bytes_written;
   to_var->cpu_time+=             from_var->cpu_time - dec_var->cpu_time;
@@ -1597,12 +1601,35 @@ void THD::awake(THD::killed_state state_to_set)
       enter_cond(). This should make the signaling as safe as possible.
       However, there is still a small chance of failure on platforms with
       instruction or memory write reordering.
+
+      We have to do the loop with trylock, because if we used
+      pthread_mutex_lock(), we could cause a deadlock, as we are here locking
+      the mysys_var->mutex and mysys_var->current_mutex in a different order
+      than in the thread we are trying to kill.
+      We only sleep for 2 seconds as we don't want to have LOCK_thd_data
+      locked for too long.
+
+      There is a small chance we may not succeed in aborting a thread that
+      is not yet waiting for a mutex, but as this happens only for a
+      thread that was doing something else when the kill was issued and
+      which should detect the kill flag before it starts to wait, this
+      should be good enough.
     */
     if (mysys_var->current_cond && mysys_var->current_mutex)
     {
-      mysql_mutex_lock(mysys_var->current_mutex);
-      mysql_cond_broadcast(mysys_var->current_cond);
-      mysql_mutex_unlock(mysys_var->current_mutex);
+      uint i;
+      for (i= 0; i < WAIT_FOR_KILL_TRY_TIMES * SECONDS_TO_WAIT_FOR_KILL; i++)
+      {
+        int ret= mysql_mutex_trylock(mysys_var->current_mutex);
+        mysql_cond_broadcast(mysys_var->current_cond);
+        if (!ret)
+        {
+          /* Signal is sure to get through */
+          mysql_mutex_unlock(mysys_var->current_mutex);
+          break;
+        }
+        my_sleep(1000000L / WAIT_FOR_KILL_TRY_TIMES);  // busy: back off, retry
+      }
     }
     mysql_mutex_unlock(&mysys_var->mutex);
   }
@@ -1724,6 +1751,9 @@ void THD::reset_globals()
 void THD::cleanup_after_query()
 {
   DBUG_ENTER("THD::cleanup_after_query");
+
+  thd_progress_end(this);
+
   /*
     Reset rand_used so that detection of calls to rand() will save random 
     seeds if needed by the slave.
@@ -2071,6 +2101,36 @@ void THD::nocheck_register_item_tree_change(Item **place, Item *old_value,
   change_list.append(change);
 }
 
+/**
+  Check and register item change if needed
+
+  @param place           place where we should assign new value
+  @param new_value       place of the new value
+
+  @details
+    Let C be a reference to an item that replaced the reference A
+    at the location (occurrence) L1, and let this change be registered.
+    If C is substituted for reference A at another location (occurrence) L2
+    that is to be registered as well, then this change has to be
+    consistent with the first change, so that the procedure that rolls back
+    changes restores the same reference at both locations L1 and L2.
+*/
+
+void THD::check_and_register_item_tree_change(Item **place, Item **new_value,
+                                              MEM_ROOT *runtime_memroot)
+{
+  Item_change_record *change;
+  I_List_iterator<Item_change_record> it(change_list);
+  while ((change= it++))
+  {
+    if (change->place == new_value)
+      break; // we need only very first value
+  }
+  if (change)                     // no record for new_value: nothing to do
+    nocheck_register_item_tree_change(place, change->old_value,
+                                      runtime_memroot);
+}
+
 
 void THD::rollback_item_tree_changes()
 {
@@ -2179,7 +2239,7 @@ void select_send::cleanup()
 
 /* Send data to client. Returns 0 if ok */
 
-bool select_send::send_data(List<Item> &items)
+int select_send::send_data(List<Item> &items)
 {
   Protocol *protocol= thd->protocol;
   DBUG_ENTER("select_send::send_data");
@@ -2466,7 +2526,7 @@ select_export::prepare(List<Item> &list, SELECT_LEX_UNIT *u)
                           (int) (uchar) (x) == line_sep_char  || \
                           !(x))
 
-bool select_export::send_data(List<Item> &items)
+int select_export::send_data(List<Item> &items)
 {
 
   DBUG_ENTER("select_export::send_data");
@@ -2672,7 +2732,6 @@ bool select_export::send_data(List<Item> &items)
     {						// Fill with space
       if (item->max_length > used_length)
       {
-	/* QQ:  Fix by adding a my_b_fill() function */
 	if (!space_inited)
 	{
 	  space_inited=1;
@@ -2724,7 +2783,7 @@ select_dump::prepare(List<Item> &list __attribute__((unused)),
 }
 
 
-bool select_dump::send_data(List<Item> &items)
+int select_dump::send_data(List<Item> &items)
 {
   List_iterator_fast<Item> li(items);
   char buff[MAX_FIELD_WIDTH];
@@ -2769,7 +2828,7 @@ select_subselect::select_subselect(Item_subselect *item_arg)
 }
 
 
-bool select_singlerow_subselect::send_data(List<Item> &items)
+int select_singlerow_subselect::send_data(List<Item> &items)
 {
   DBUG_ENTER("select_singlerow_subselect::send_data");
   Item_singlerow_subselect *it= (Item_singlerow_subselect *)item;
@@ -2800,7 +2859,7 @@ void select_max_min_finder_subselect::cleanup()
 }
 
 
-bool select_max_min_finder_subselect::send_data(List<Item> &items)
+int select_max_min_finder_subselect::send_data(List<Item> &items)
 {
   DBUG_ENTER("select_max_min_finder_subselect::send_data");
   Item_maxmin_subselect *it= (Item_maxmin_subselect *)item;
@@ -2832,6 +2891,8 @@ bool select_max_min_finder_subselect::send_data(List<Item> &items)
         op= &select_max_min_finder_subselect::cmp_decimal;
         break;
       case ROW_RESULT:
+      case TIME_RESULT:
+      case IMPOSSIBLE_RESULT:
         // This case should never be choosen
 	DBUG_ASSERT(0);
 	op= 0;
@@ -2903,7 +2964,7 @@ bool select_max_min_finder_subselect::cmp_str()
      sortcmp(val1, val2, cache->collation.collation) < 0);
 }
 
-bool select_exists_subselect::send_data(List<Item> &items)
+int select_exists_subselect::send_data(List<Item> &items)
 {
   DBUG_ENTER("select_exists_subselect::send_data");
   Item_exists_subselect *it= (Item_exists_subselect *)item;
@@ -2964,6 +3025,7 @@ void Query_arena::free_items()
   for (; free_list; free_list= next)
   {
     next= free_list->next;
+    DBUG_ASSERT(free_list != next);
     free_list->delete_self();
   }
   /* Postcondition: free_list is 0 */
@@ -3244,7 +3306,7 @@ Statement_map::~Statement_map()
   my_hash_free(&st_hash);
 }
 
-bool select_dumpvar::send_data(List<Item> &items)
+int select_dumpvar::send_data(List<Item> &items)
 {
   List_iterator_fast<my_var> var_li(var_list);
   List_iterator<Item> it(items);
@@ -3303,7 +3365,8 @@ bool
 select_materialize_with_stats::
 create_result_table(THD *thd_arg, List<Item> *column_types,
                     bool is_union_distinct, ulonglong options,
-                    const char *table_alias, bool bit_fields_as_long)
+                    const char *table_alias, bool bit_fields_as_long,
+                    bool create_table)
 {
   DBUG_ASSERT(table == 0);
   tmp_table_param.field_count= column_types->elements;
@@ -3349,18 +3412,26 @@ void select_materialize_with_stats::cleanup()
   @return FALSE on success
 */
 
-bool select_materialize_with_stats::send_data(List<Item> &items)
+int select_materialize_with_stats::send_data(List<Item> &items)
 {
   List_iterator_fast<Item> item_it(items);
   Item *cur_item;
   Column_statistics *cur_col_stat= col_stat;
   uint nulls_in_row= 0;
+  int res;
+
+  if ((res= select_union::send_data(items)))
+    return res;
+  /* Skip duplicate rows. */
+  if (write_err == HA_ERR_FOUND_DUPP_KEY ||
+      write_err == HA_ERR_FOUND_DUPP_UNIQUE)
+    return 0;
 
   ++count_rows;
 
   while ((cur_item= item_it++))
   {
-    if (cur_item->is_null())
+    if (cur_item->is_null_result())
     {
       ++cur_col_stat->null_count;
       cur_col_stat->max_null_row= count_rows;
@@ -3373,7 +3444,7 @@ bool select_materialize_with_stats::send_data(List<Item> &items)
   if (nulls_in_row > max_nulls_in_row)
     max_nulls_in_row= nulls_in_row;
 
-  return select_union::send_data(items);
+  return 0;
 }
 
 
@@ -3391,6 +3462,7 @@ void TMP_TABLE_PARAM::init()
   table_charset= 0;
   precomputed_group_by= 0;
   bit_fields_as_long= 0;
+  materialized_subquery= 0;
   skip_create_table= 0;
   DBUG_VOID_RETURN;
 }
@@ -3623,11 +3695,123 @@ void THD::restore_backup_open_tables_state(Open_tables_backup *backup)
   @retval 0 the user thread is active
   @retval 1 the user thread has been killed
 */
+
 extern "C" int thd_killed(const MYSQL_THD thd)
 {
   return(thd->killed);
 }
 
+
+/**
+   Send an out-of-band progress report to the client
+
+   The report is sent every 'thd->...progress_report_time' seconds,
+   however not more often than global.progress_report_time.
+   If global.progress_report_time is 0, then don't send progress reports, but
+   check every second if the value has changed
+*/
+
+static void thd_send_progress(THD *thd)
+{
+  /* Check if we should send the client a progress report */
+  ulonglong report_time= my_interval_timer();
+  if (report_time > thd->progress.next_report_time)
+  {
+    uint seconds_to_next= max(thd->variables.progress_report_time,
+                              global_system_variables.progress_report_time);
+    if (seconds_to_next == 0)             // Turned off
+      seconds_to_next= 1;                 // Check again after 1 second
+
+    thd->progress.next_report_time= (report_time +  // presumably ns units
+                                     seconds_to_next * 1000000000ULL);
+    if (global_system_variables.progress_report_time &&
+        thd->variables.progress_report_time)       // both must be non-zero
+      net_send_progress_packet(thd);
+  }
+}
+
+
+/** Initialize progress report handling **/
+
+extern "C" void thd_progress_init(MYSQL_THD thd, uint max_stage)
+{
+  /*
+    Send progress reports to clients that support it, if the command
+    is a high level command (like ALTER TABLE) and we are not in a
+    stored procedure
+  */
+  thd->progress.report= ((thd->client_capabilities & CLIENT_PROGRESS) &&
+                         thd->progress.report_to_client &&
+                         !thd->in_sub_stmt);
+  thd->progress.next_report_time= 0;
+  thd->progress.stage= 0;
+  thd->progress.counter= thd->progress.max_counter= 0;
+  thd->progress.max_stage= max_stage;
+}
+
+
+/* Inform processlist and the client that some progress has been made */
+
+extern "C" void thd_progress_report(MYSQL_THD thd,
+                                    ulonglong progress, ulonglong max_progress)
+{
+  if (thd->progress.max_counter != max_progress)  // Lock only if max changes
+  {
+    mysql_mutex_lock(&thd->LOCK_thd_data);
+    thd->progress.counter= progress;
+    thd->progress.max_counter= max_progress;
+    mysql_mutex_unlock(&thd->LOCK_thd_data);
+  }
+  else
+    thd->progress.counter= progress;      // max unchanged: no lock taken
+
+  if (thd->progress.report)
+    thd_send_progress(thd);
+}
+
+/**
+  Move to next stage in process list handling
+
+  This will reset the timer to ensure the progress is sent to the client
+  if client progress reports are activated.
+*/
+
+extern "C" void thd_progress_next_stage(MYSQL_THD thd)
+{
+  mysql_mutex_lock(&thd->LOCK_thd_data);
+  thd->progress.stage++;
+  thd->progress.counter= 0;
+  DBUG_ASSERT(thd->progress.stage < thd->progress.max_stage);
+  mysql_mutex_unlock(&thd->LOCK_thd_data);
+  if (thd->progress.report)
+  {
+    thd->progress.next_report_time= 0;          // Send new stage info
+    thd_send_progress(thd);
+  }
+}
+
+/**
+  Disable reporting of progress in process list.
+
+  @note
+  This function is safe to call even if one has not called thd_progress_init.
+
+  This function should be called by all parts that do progress
+  reporting to ensure that the process list doesn't contain 100 % done
+  forever.
+*/
+
+
+extern "C" void thd_progress_end(MYSQL_THD thd)
+{
+  /*
+    It's enough to reset max_counter to disable the progress indicator
+    in processlist.
+  */
+  thd->progress.max_counter= 0;
+}
+
+
 /**
   Return the thread id of a user thread
   @param thd user thread
@@ -4990,8 +5174,8 @@ int THD::binlog_query(THD::enum_binlog_query_type qtype, char const *query_arg,
                       bool suppress_use, int errcode)
 {
   DBUG_ENTER("THD::binlog_query");
-  DBUG_PRINT("enter", ("qtype: %s  query: '%s'",
-                       show_query_type(qtype), query_arg));
+  DBUG_PRINT("enter", ("qtype: %s  query: '%-.*s'",
+                       show_query_type(qtype), (int) query_len, query_arg));
   DBUG_ASSERT(query_arg && mysql_bin_log.is_open());
 
   /*
@@ -5034,7 +5218,6 @@ int THD::binlog_query(THD::enum_binlog_query_type qtype, char const *query_arg,
       spcont == NULL && !binlog_evt_union.do_union)
     issue_unsafe_warnings();
 
-
   switch (qtype) {
     /*
       ROW_QUERY_TYPE means that the statement may be logged either in
@@ -5087,6 +5270,25 @@ int THD::binlog_query(THD::enum_binlog_query_type qtype, char const *query_arg,
   DBUG_RETURN(0);
 }
 
+void
+THD::wait_for_wakeup_ready()
+{
+  mysql_mutex_lock(&LOCK_wakeup_ready);
+  while (!wakeup_ready)
+    mysql_cond_wait(&COND_wakeup_ready, &LOCK_wakeup_ready);
+  mysql_mutex_unlock(&LOCK_wakeup_ready);
+}
+
+void
+THD::signal_wakeup_ready()
+{
+  mysql_mutex_lock(&LOCK_wakeup_ready);
+  wakeup_ready= true;
+  mysql_mutex_unlock(&LOCK_wakeup_ready);
+  mysql_cond_signal(&COND_wakeup_ready);
+}
+
+
 bool Discrete_intervals_list::append(ulonglong start, ulonglong val,
                                  ulonglong incr)
 {
diff --git a/sql/sql_class.h b/sql/sql_class.h
index 3f25f7cff9d..abaa7d4d9cb 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -1,4 +1,5 @@
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2010, Oracle and/or its affiliates.
+   2009-2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -166,7 +167,7 @@ typedef struct st_user_var_events
 
 #define RP_LOCK_LOG_IS_ALREADY_LOCKED 1
 #define RP_FORCE_ROTATE               2
-
+#define RP_BINLOG_CHECKSUM_ALG_CHANGE 4
 /*
   The COPY_INFO structure is used by INSERT/REPLACE code.
   The schema of the row counting by the INSERT/INSERT ... ON DUPLICATE KEY
@@ -433,6 +434,9 @@ typedef struct system_variables
   ulonglong optimizer_switch;
   ulonglong sql_mode; ///< which non-standard SQL behaviour should be enabled
   ulonglong option_bits; ///< OPTION_xxx constants, e.g. OPTION_PROFILING
+  ulonglong join_buff_space_limit;
+  ulonglong log_slow_filter; 
+  ulonglong log_slow_verbosity; 
   ha_rows select_limit;
   ha_rows max_join_size;
   ulong auto_increment_increment, auto_increment_offset;
@@ -456,13 +460,6 @@ typedef struct system_variables
   ulong net_write_timeout;
   ulong optimizer_prune_level;
   ulong optimizer_search_depth;
-  /*
-    Controls use of Engine-MRR:
-      0 - auto, based on cost
-      1 - force MRR when the storage engine is capable of doing it
-      2 - disable MRR.
-  */
-  ulong optimizer_use_mrr;
   ulong preload_buff_size;
   ulong profiling_history_size;
   ulong read_buff_size;
@@ -484,9 +481,9 @@ typedef struct system_variables
   ulong group_concat_max_len;
   /* Flags for slow log filtering */
   ulong log_slow_rate_limit; 
-  ulong log_slow_filter; 
-  ulong log_slow_verbosity; 
   ulong binlog_format; ///< binlog format for this thd (see enum_binlog_format)
+  ulong progress_report_time;
+  my_bool binlog_annotate_rows_events;
   my_bool binlog_direct_non_trans_update;
   my_bool sql_log_bin;
   ulong completion_type;
@@ -501,11 +498,11 @@ typedef struct system_variables
   my_thread_id pseudo_thread_id;
 
   my_bool low_priority_updates;
-  my_bool new_mode;
   my_bool query_cache_wlock_invalidate;
   my_bool engine_condition_pushdown;
   my_bool keep_files_on_create;
 
+  my_bool old_mode;
   my_bool old_alter_table;
   my_bool old_passwords;
   my_bool big_tables;
@@ -569,6 +566,9 @@ typedef struct system_status_var
   ulong ha_rollback_count;
   ulong ha_update_count;
   ulong ha_write_count;
+  /* The following are for internal temporary tables */
+  ulong ha_tmp_update_count;
+  ulong ha_tmp_write_count;
   ulong ha_prepare_count;
   ulong ha_discover_count;
   ulong ha_savepoint_count;
@@ -593,8 +593,6 @@ typedef struct system_status_var
   ulong select_range_count;
   ulong select_range_check_count;
   ulong select_scan_count;
-  ulong rows_read;
-  ulong rows_sent;
   ulong long_query_count;
   ulong filesort_merge_passes;
   ulong filesort_range_count;
@@ -624,6 +622,9 @@ typedef struct system_status_var
   */
   ulonglong bytes_received;
   ulonglong bytes_sent;
+  ulonglong rows_read;
+  ulonglong rows_sent;
+  ulonglong rows_tmp_read;
   ulonglong binlog_bytes_written;
   double last_query_cost;
   double cpu_time, busy_time;
@@ -702,6 +703,8 @@ public:
   { return (int)state < (int)STMT_PREPARED; }
   inline bool is_stmt_prepare_or_first_stmt_execute() const
   { return (int)state <= (int)STMT_PREPARED; }
+  inline bool is_stmt_execute() const
+  { return state == STMT_PREPARED || state == STMT_EXECUTED; }
   inline bool is_conventional() const
   { return state == STMT_CONVENTIONAL_EXECUTION; }
 
@@ -796,6 +799,12 @@ public:
     ENGINE INNODB STATUS.
   */
   CSET_STRING query_string;
+  /*
+    If opt_query_cache_strip_comments is set, this contains query without
+    comments. If not set, it contains pointer to query_string.
+  */
+  String base_query;
+
 
   inline char *query() const { return query_string.str(); }
   inline uint32 query_length() const { return query_string.length(); }
@@ -829,7 +838,8 @@ public:
   char *db;
   size_t db_length;
 
-public:
+  /* This is set to 1 if the last call to send_result_to_client() was ok */
+  my_bool query_cache_is_applicable;
 
   /* This constructor is called for backup statements */
   Statement() {}
@@ -1620,11 +1630,32 @@ public:
   uint32     file_id;			// for LOAD DATA INFILE
   /* remote (peer) port */
   uint16     peer_port;
-  time_t     start_time, user_time;
+  my_time_t  start_time;             // start_time and its sec_part 
+  ulong      start_time_sec_part;    // are almost always used separately
+  my_hrtime_t user_time;
   // track down slow pthread_create
   ulonglong  prior_thr_create_utime, thr_create_utime;
   ulonglong  start_utime, utime_after_lock;
 
+  // Progress indicator
+  struct {
+    /*
+      true, if the currently running command can send progress report
+      packets to a client. Set by mysql_execute_command() for safe commands
+      See CF_REPORT_PROGRESS
+    */
+    bool       report_to_client;
+    /*
+      true, if we will send progress report packets to a client
+      (client has requested them, see CLIENT_PROGRESS; report_to_client
+      is true; not in sub-statement)
+    */
+    bool       report;
+    uint       stage, max_stage;
+    ulonglong  counter, max_counter;
+    ulonglong  next_report_time;
+  } progress;
+
   thr_lock_type update_lock_default;
   Delayed_insert *di;
 
@@ -1644,6 +1675,17 @@ public:
     */
     TABLE_LIST *emb_on_expr_nest;
   } thd_marker;
+
+  bool prepare_derived_at_open;
+
+  /* 
+    Signals that the tmp table about to be created is for a materialized
+    derived table or a view.
+  */ 
+  bool create_tmp_table_for_derived;
+
+  bool save_prep_leaf_list;
+
 #ifndef MYSQL_CLIENT
   int binlog_setup_trx_data();
 
@@ -1652,7 +1694,8 @@ public:
   */
   void binlog_start_trans_and_stmt();
   void binlog_set_stmt_begin();
-  int binlog_write_table_map(TABLE *table, bool is_transactional);
+  int binlog_write_table_map(TABLE *table, bool is_transactional,
+                             my_bool *with_annotate= 0);
   int binlog_write_row(TABLE* table, bool is_transactional,
                        MY_BITMAP const* cols, size_t colcnt,
                        const uchar *buf);
@@ -1790,7 +1833,7 @@ public:
   /*
     This is to track items changed during execution of a prepared
     statement/stored procedure. It's created by
-    register_item_tree_change() in memory root of THD, and freed in
+    nocheck_register_item_tree_change() in memory root of THD, and freed in
     rollback_item_tree_changes(). For conventional execution it's always
     empty.
   */
@@ -2051,7 +2094,7 @@ public:
   ulong      query_plan_fsort_passes; 
   pthread_t  real_id;                           /* For debugging */
   my_thread_id  thread_id;
-  uint	     tmp_table;
+  uint	     tmp_table, global_disable_checkpoint;
   uint	     server_status,open_options;
   enum enum_thread_type system_thread;
   uint       select_number;             //number of select (used for EXPLAIN)
@@ -2137,6 +2180,7 @@ public:
   */
   bool       is_fatal_sub_stmt_error;
   bool	     query_start_used, rand_used, time_zone_used;
+  bool       query_start_sec_part_used;
   /* for IS NULL => = last_insert_id() fix in remove_eq_conds() */
   bool       substitute_null_with_insert_id;
   bool	     in_lock_tables;
@@ -2200,6 +2244,7 @@ public:
     long      long_value;
     ulong     ulong_value;
     ulonglong ulonglong_value;
+    double    double_value;
   } sys_var_tmp;
 
   struct {
@@ -2348,33 +2393,45 @@ public:
     mysql_mutex_unlock(&mysys_var->mutex);
     return;
   }
-  inline time_t query_start() { query_start_used=1; return start_time; }
-  inline void set_time()
+  inline my_time_t query_start() { query_start_used=1; return start_time; }
+  inline ulong query_start_sec_part()
+  { query_start_sec_part_used=1; return start_time_sec_part; }
+  inline void set_current_time()
   {
-    if (user_time)
+    my_hrtime_t hrtime= my_hrtime();
+    start_time= hrtime_to_my_time(hrtime);
+    start_time_sec_part= hrtime_sec_part(hrtime);
+  }
+  inline void set_start_time()
+  {
+    if (user_time.val)
     {
-      start_time= user_time;
-      start_utime= utime_after_lock= my_micro_time();
+      start_time= hrtime_to_my_time(user_time);
+      start_time_sec_part= hrtime_sec_part(user_time);
     }
     else
-      start_utime= utime_after_lock= my_micro_time_and_time(&start_time);
+      set_current_time();
   }
-  inline void	set_current_time()    { start_time= my_time(MY_WME); }
-  inline void	set_time(time_t t)
+  inline void set_time()
   {
-    start_time= user_time= t;
-    start_utime= utime_after_lock= my_micro_time();
+    set_start_time();
+    start_utime= utime_after_lock= microsecond_interval_timer();
   }
-  /*TODO: this will be obsolete when we have support for 64 bit my_time_t */
-  inline bool	is_valid_time() 
-  { 
-    return (IS_TIME_T_VALID_FOR_TIMESTAMP(start_time));
+  inline void	set_time(my_hrtime_t t)
+  {
+    user_time= t;
+    set_time();
+  }
+  inline void	set_time(my_time_t t, ulong sec_part)
+  {
+    my_hrtime_t hrtime= { hrtime_from_time(t) + sec_part };
+    set_time(hrtime);
   }
-  void set_time_after_lock()  { utime_after_lock= my_micro_time(); }
-  ulonglong current_utime()  { return my_micro_time(); }
+  void set_time_after_lock()  { utime_after_lock= microsecond_interval_timer(); }
+  ulonglong current_utime()  { return microsecond_interval_timer(); }
+
   /**
    Update server status after execution of a top level statement.
-
    Currently only checks if a query was slow, and assigns
    the status accordingly.
    Evaluate the current time, and if it exceeds the long-query-time
@@ -2562,8 +2619,30 @@ public:
       nocheck_register_item_tree_change(place, *place, mem_root);
     *place= new_value;
   }
+  /**
+    Make change in item tree after checking whether it needs registering
+
+
+    @param place         place where we should assign new value
+    @param new_value     place of the new value
+
+    @details
+    see check_and_register_item_tree_change details
+  */
+  void check_and_register_item_tree(Item **place, Item **new_value)
+  {
+    if (!stmt_arena->is_conventional())
+      check_and_register_item_tree_change(place, new_value, mem_root);
+    /*
+      We have to use memcpy instead of *place= *new_value to
+      avoid problems with strict aliasing.
+    */
+    memcpy((char*) place, new_value, sizeof(*new_value));
+  }
   void nocheck_register_item_tree_change(Item **place, Item *old_value,
                                          MEM_ROOT *runtime_memroot);
+  void check_and_register_item_tree_change(Item **place, Item **new_value,
+                                           MEM_ROOT *runtime_memroot);
   void rollback_item_tree_changes();
 
   /*
@@ -2912,6 +2991,14 @@ public:
     return backup;
   }
 
+  void clear_wakeup_ready() { wakeup_ready= false; }
+  /*
+    Sleep waiting for others to wake us up with signal_wakeup_ready().
+    Must call clear_wakeup_ready() before waiting.
+  */
+  void wait_for_wakeup_ready();
+  /* Wake this thread up from wait_for_wakeup_ready(). */
+  void signal_wakeup_ready();
 private:
 
   /** The current internal error handler for this thread, or NULL. */
@@ -2941,7 +3028,7 @@ private:
     statements or default definer is set in CREATE/ALTER SP, SF, Event,
     TRIGGER or VIEW statements.
 
-    Current user will be binlogged into Query_log_event if current_user_used
+    Current user will be binlogged into Query_log_event if m_binlog_invoker
     is TRUE; It will be stored into invoker_host and invoker_user by SQL thread.
    */
   bool m_binlog_invoker;
@@ -2954,6 +3041,16 @@ private:
    */
   LEX_STRING invoker_user;
   LEX_STRING invoker_host;
+  /*
+    Flag, mutex and condition for a thread to wait for a signal from another
+    thread.
+
+    Currently used to wait for group commit to complete, can also be used for
+    other purposes.
+  */
+  bool wakeup_ready;
+  mysql_mutex_t LOCK_wakeup_ready;
+  mysql_cond_t COND_wakeup_ready;
 };
 
 
@@ -2985,6 +3082,27 @@ my_eof(THD *thd)
 
 
 /*
+  These functions exist so that strict checking can easily be added
+  later for all date handling.
+*/
+
+const my_bool strict_date_checking= 0;
+
+inline ulong sql_mode_for_dates(THD *thd)
+{
+  if (strict_date_checking)
+    return (thd->variables.sql_mode &
+            (MODE_NO_ZERO_DATE | MODE_NO_ZERO_IN_DATE |
+             MODE_INVALID_DATES));
+  return (thd->variables.sql_mode & MODE_INVALID_DATES);
+}
+
+inline ulong sql_mode_for_dates()
+{
+  return sql_mode_for_dates(current_thd);
+}
+
+/*
   Used to hold information about file and file structure in exchange
   via non-DB file (...INTO OUTFILE..., ...LOAD DATA...)
   XXX: We never call destructor for objects of this class.
@@ -3032,7 +3150,11 @@ public:
   virtual uint field_count(List<Item> &fields) const
   { return fields.elements; }
   virtual bool send_result_set_metadata(List<Item> &list, uint flags)=0;
-  virtual bool send_data(List<Item> &items)=0;
+  /*
+    send_data returns 0 on ok, 1 on error and -1 if data was ignored, for
+    example for a duplicate row entry written to a temp table.
+  */
+  virtual int send_data(List<Item> &items)=0;
   virtual bool initialize_tables (JOIN *join=0) { return 0; }
   virtual void send_error(uint errcode,const char *err);
   virtual bool send_eof()=0;
@@ -3068,7 +3190,12 @@ public:
 class select_result_interceptor: public select_result
 {
 public:
-  select_result_interceptor() {}              /* Remove gcc warning */
+  select_result_interceptor()
+  {
+    DBUG_ENTER("select_result_interceptor::select_result_interceptor");
+    DBUG_PRINT("enter", ("this 0x%lx", (ulong) this));
+    DBUG_VOID_RETURN;
+  }              /* Remove gcc warning */
   uint field_count(List<Item> &fields) const { return 0; }
   bool send_result_set_metadata(List<Item> &fields, uint flag) { return FALSE; }
 };
@@ -3084,7 +3211,7 @@ class select_send :public select_result {
 public:
   select_send() :is_result_set_started(FALSE) {}
   bool send_result_set_metadata(List<Item> &list, uint flags);
-  bool send_data(List<Item> &items);
+  int send_data(List<Item> &items);
   bool send_eof();
   virtual bool check_simple_select() const { return FALSE; }
   void abort_result_set();
@@ -3147,7 +3274,7 @@ public:
   select_export(sql_exchange *ex) :select_to_file(ex) {}
   ~select_export();
   int prepare(List<Item> &list, SELECT_LEX_UNIT *u);
-  bool send_data(List<Item> &items);
+  int send_data(List<Item> &items);
 };
 
 
@@ -3155,7 +3282,7 @@ class select_dump :public select_to_file {
 public:
   select_dump(sql_exchange *ex) :select_to_file(ex) {}
   int prepare(List<Item> &list, SELECT_LEX_UNIT *u);
-  bool send_data(List<Item> &items);
+  int send_data(List<Item> &items);
 };
 
 
@@ -3174,7 +3301,7 @@ class select_insert :public select_result_interceptor {
   ~select_insert();
   int prepare(List<Item> &list, SELECT_LEX_UNIT *u);
   virtual int prepare2(void);
-  bool send_data(List<Item> &items);
+  virtual int send_data(List<Item> &items);
   virtual void store_values(List<Item> &values);
   virtual bool can_rollback_data() { return 0; }
   void send_error(uint errcode,const char *err);
@@ -3293,6 +3420,8 @@ public:
   uint  convert_blob_length;
   CHARSET_INFO *table_charset;
   bool schema_table;
+  /* TRUE if the temp table is created for subquery materialization. */
+  bool materialized_subquery;
   /*
     True if GROUP BY and its aggregate functions are already computed
     by a table access method (e.g. by loose index scan). In this case
@@ -3316,8 +3445,8 @@ public:
   TMP_TABLE_PARAM()
     :copy_field(0), group_parts(0),
      group_length(0), group_null_parts(0), convert_blob_length(0),
-     schema_table(0), precomputed_group_by(0), force_copy_fields(0),
-     bit_fields_as_long(0), skip_create_table(0)
+    schema_table(0), materialized_subquery(0), precomputed_group_by(0),
+    force_copy_fields(0), bit_fields_as_long(0), skip_create_table(0)
   {}
   ~TMP_TABLE_PARAM()
   {
@@ -3336,20 +3465,25 @@ public:
 
 class select_union :public select_result_interceptor
 {
-protected:
+public:
   TMP_TABLE_PARAM tmp_table_param;
+  int write_err; /* Error code from the last send_data->ha_write_row call. */
 public:
   TABLE *table;
+  ha_rows records;
 
-  select_union() :table(0) { tmp_table_param.init(); }
+  select_union() :write_err(0), table(0), records(0) { tmp_table_param.init(); }
   int prepare(List<Item> &list, SELECT_LEX_UNIT *u);
-  bool send_data(List<Item> &items);
+  int send_data(List<Item> &items);
   bool send_eof();
   bool flush();
   void cleanup();
   virtual bool create_result_table(THD *thd, List<Item> *column_types,
                                    bool is_distinct, ulonglong options,
-                                   const char *alias, bool bit_fields_as_long);
+                                   const char *alias, 
+                                   bool bit_fields_as_long,
+                                   bool create_table);
+  TMP_TABLE_PARAM *get_tmp_table_param() { return &tmp_table_param; }
 };
 
 /* Base subselect interface class */
@@ -3359,7 +3493,7 @@ protected:
   Item_subselect *item;
 public:
   select_subselect(Item_subselect *item);
-  bool send_data(List<Item> &items)=0;
+  int send_data(List<Item> &items)=0;
   bool send_eof() { return 0; };
 };
 
@@ -3370,7 +3504,7 @@ public:
   select_singlerow_subselect(Item_subselect *item_arg)
     :select_subselect(item_arg)
   {}
-  bool send_data(List<Item> &items);
+  int send_data(List<Item> &items);
 };
 
 
@@ -3401,7 +3535,7 @@ protected:
     The number of columns in the biggest sub-row that consists of only
     NULL values.
   */
-  ha_rows max_nulls_in_row;
+  uint max_nulls_in_row;
   /*
     Count of rows writtent to the temp table. This is redundant as it is
     already stored in handler::stats.records, however that one is relatively
@@ -3413,12 +3547,14 @@ protected:
   void reset();
 
 public:
-  select_materialize_with_stats() {}
-  virtual bool create_result_table(THD *thd, List<Item> *column_types,
-                                   bool is_distinct, ulonglong options,
-                                   const char *alias, bool bit_fields_as_long);
+  select_materialize_with_stats() { tmp_table_param.init(); }
+  bool create_result_table(THD *thd, List<Item> *column_types,
+                           bool is_distinct, ulonglong options,
+                           const char *alias, 
+                           bool bit_fields_as_long,
+                           bool create_table);
   bool init_result_table(ulonglong select_options);
-  bool send_data(List<Item> &items);
+  int send_data(List<Item> &items);
   void cleanup();
   ha_rows get_null_count_of_col(uint idx)
   {
@@ -3435,7 +3571,7 @@ public:
     DBUG_ASSERT(idx < table->s->fields);
     return col_stat[idx].min_null_row;
   }
-  ha_rows get_max_nulls_in_row() { return max_nulls_in_row; }
+  uint get_max_nulls_in_row() { return max_nulls_in_row; }
 };
 
 
@@ -3450,7 +3586,7 @@ public:
     :select_subselect(item_arg), cache(0), fmax(mx)
   {}
   void cleanup();
-  bool send_data(List<Item> &items);
+  int send_data(List<Item> &items);
   bool cmp_real();
   bool cmp_int();
   bool cmp_decimal();
@@ -3463,7 +3599,7 @@ class select_exists_subselect :public select_subselect
 public:
   select_exists_subselect(Item_subselect *item_arg)
     :select_subselect(item_arg){}
-  bool send_data(List<Item> &items);
+  int send_data(List<Item> &items);
 };
 
 
@@ -3610,6 +3746,7 @@ class user_var_entry
   DTCollation collation;
 };
 
+
 /*
    Unique -- class for unique (removing of duplicates).
    Puts all values to the TREE. If the tree becomes too big,
@@ -3626,28 +3763,44 @@ class Unique :public Sql_alloc
   IO_CACHE file;
   TREE tree;
   uchar *record_pointers;
+  ulong filtered_out_elems;
   bool flush();
   uint size;
+  uint full_size;
+  uint min_dupl_count;   /* always 0 for unions, > 0 for intersections */
 
 public:
   ulong elements;
   Unique(qsort_cmp2 comp_func, void *comp_func_fixed_arg,
-	 uint size_arg, ulonglong max_in_memory_size_arg);
+	 uint size_arg, ulonglong max_in_memory_size_arg,
+         uint min_dupl_count_arg= 0);
   ~Unique();
   ulong elements_in_tree() { return tree.elements_in_tree; }
   inline bool unique_add(void *ptr)
   {
     DBUG_ENTER("unique_add");
     DBUG_PRINT("info", ("tree %u - %lu", tree.elements_in_tree, max_elements));
-    if (tree.elements_in_tree > max_elements && flush())
+    if (!(tree.flag & TREE_ONLY_DUPS) && 
+        tree.elements_in_tree >= max_elements && flush())
       DBUG_RETURN(1);
     DBUG_RETURN(!tree_insert(&tree, ptr, 0, tree.custom_arg));
   }
 
+  bool is_in_memory() { return (my_b_tell(&file) == 0); }
+  void close_for_expansion() { tree.flag= TREE_ONLY_DUPS; }
+
   bool get(TABLE *table);
-  static double get_use_cost(uint *buffer, uint nkeys, uint key_size,
-                             ulonglong max_in_memory_size);
-  inline static int get_cost_calc_buff_size(ulong nkeys, uint key_size,
+  
+  /* Cost of searching for an element in the tree */
+  inline static double get_search_cost(ulonglong tree_elems, uint compare_factor)
+  {
+    return log((double) tree_elems) / (compare_factor * M_LN2);
+  }  
+
+  static double get_use_cost(uint *buffer, size_t nkeys, uint key_size,
+                             ulonglong max_in_memory_size, uint compare_factor,
+                             bool intersect_fl, bool *in_memory);
+  inline static int get_cost_calc_buff_size(size_t nkeys, uint key_size,
                                             ulonglong max_in_memory_size)
   {
     register ulonglong max_elems_in_tree=
@@ -3663,6 +3816,11 @@ public:
 
   friend int unique_write_to_file(uchar* key, element_count count, Unique *unique);
   friend int unique_write_to_ptrs(uchar* key, element_count count, Unique *unique);
+
+  friend int unique_write_to_file_with_count(uchar* key, element_count count,
+                                             Unique *unique);
+  friend int unique_intersect_write_to_ptrs(uchar* key, element_count count, 
+				            Unique *unique);
 };
 
 
@@ -3689,7 +3847,7 @@ public:
   multi_delete(TABLE_LIST *dt, uint num_of_tables);
   ~multi_delete();
   int prepare(List<Item> &list, SELECT_LEX_UNIT *u);
-  bool send_data(List<Item> &items);
+  int send_data(List<Item> &items);
   bool initialize_tables (JOIN *join);
   void send_error(uint errcode,const char *err);
   int do_deletes();
@@ -3706,7 +3864,7 @@ public:
 class multi_update :public select_result_interceptor
 {
   TABLE_LIST *all_tables; /* query/update command tables */
-  TABLE_LIST *leaves;     /* list of leves of join table tree */
+  List<TABLE_LIST> *leaves;     /* list of leves of join table tree */
   TABLE_LIST *update_tables, *table_being_updated;
   TABLE **tmp_tables, *main_table, *table_to_update;
   TMP_TABLE_PARAM *tmp_table_param;
@@ -3732,12 +3890,12 @@ class multi_update :public select_result_interceptor
   bool error_handled;
 
 public:
-  multi_update(TABLE_LIST *ut, TABLE_LIST *leaves_list,
+  multi_update(TABLE_LIST *ut, List<TABLE_LIST> *leaves_list,
 	       List<Item> *fields, List<Item> *values,
 	       enum_duplicates handle_duplicates, bool ignore);
   ~multi_update();
   int prepare(List<Item> &list, SELECT_LEX_UNIT *u);
-  bool send_data(List<Item> &items);
+  int send_data(List<Item> &items);
   bool initialize_tables (JOIN *join);
   void send_error(uint errcode,const char *err);
   int  do_updates();
@@ -3779,7 +3937,7 @@ public:
   select_dumpvar()  { var_list.empty(); row_count= 0;}
   ~select_dumpvar() {}
   int prepare(List<Item> &list, SELECT_LEX_UNIT *u);
-  bool send_data(List<Item> &items);
+  int send_data(List<Item> &items);
   bool send_eof();
   virtual bool check_simple_select() const;
   void cleanup();
@@ -3788,10 +3946,11 @@ public:
 /* Bits in sql_command_flags */
 
 #define CF_CHANGES_DATA           (1U << 0)
-/* The 2nd bit is unused -- it used to be CF_HAS_ROW_COUNT. */
+#define CF_REPORT_PROGRESS        (1U << 1)
 #define CF_STATUS_COMMAND         (1U << 2)
 #define CF_SHOW_TABLE_COMMAND     (1U << 3)
 #define CF_WRITE_LOGS_COMMAND     (1U << 4)
+
 /**
   Must be set for SQL statements that may contain
   Item expressions and/or use joins and tables.
@@ -3932,17 +4091,25 @@ inline int handler::ha_index_read_map(uchar * buf, const uchar * key,
   return error;
 }
 
+
+/*
+  @note: Other index lookup/navigation functions require prior
+  handler->index_init() call. This function is different, it requires
+  that the scan is not initialized, and accepts "uint index" as an argument.
+*/
+
 inline int handler::ha_index_read_idx_map(uchar * buf, uint index,
                                           const uchar * key,
                                           key_part_map keypart_map,
                                           enum ha_rkey_function find_flag)
 {
+  DBUG_ASSERT(inited==NONE);
   MYSQL_INDEX_READ_ROW_START(table_share->db.str, table_share->table_name.str);
   increment_statistics(&SSV::ha_read_key_count);
   int error= index_read_idx_map(buf, index, key, keypart_map, find_flag);
   if (!error)
   {
-    rows_read++;
+    update_rows_read();
     index_rows_read[index]++;
   }
   table->status=error ? STATUS_NOT_FOUND: 0;
@@ -4020,7 +4187,8 @@ inline int handler::ha_ft_read(uchar *buf)
 {
   int error= ft_read(buf);
   if (!error)
-    rows_read++;
+    update_rows_read();
+
   table->status=error ? STATUS_NOT_FOUND: 0;
   return error;
 }
@@ -4031,7 +4199,7 @@ inline int handler::ha_rnd_next(uchar *buf)
   increment_statistics(&SSV::ha_read_rnd_next_count);
   int error= rnd_next(buf);
   if (!error)
-    rows_read++;
+    update_rows_read();
   table->status=error ? STATUS_NOT_FOUND: 0;
   MYSQL_READ_ROW_DONE(error);
   return error;
@@ -4043,7 +4211,7 @@ inline int handler::ha_rnd_pos(uchar *buf, uchar *pos)
   increment_statistics(&SSV::ha_read_rnd_count);
   int error= rnd_pos(buf, pos);
   if (!error)
-    rows_read++;
+    update_rows_read();
   table->status=error ? STATUS_NOT_FOUND: 0;
   MYSQL_READ_ROW_DONE(error);
   return error;
@@ -4053,7 +4221,7 @@ inline int handler::ha_rnd_pos_by_record(uchar *buf)
 {
   int error= rnd_pos_by_record(buf);
   if (!error)
-    rows_read++;
+    update_rows_read();
   table->status=error ? STATUS_NOT_FOUND: 0;
   return error;
 }
@@ -4062,11 +4230,29 @@ inline int handler::ha_read_first_row(uchar *buf, uint primary_key)
 {
   int error= read_first_row(buf, primary_key);
   if (!error)
-    rows_read++;
+    update_rows_read();
   table->status=error ? STATUS_NOT_FOUND: 0;
   return error;
 }
 
+inline int handler::ha_write_tmp_row(uchar *buf)
+{
+  MYSQL_INSERT_ROW_START(table_share->db.str, table_share->table_name.str);
+  increment_statistics(&SSV::ha_tmp_write_count);
+  int error= write_row(buf);
+  MYSQL_INSERT_ROW_DONE(error);
+  return error;
+}
+
+inline int handler::ha_update_tmp_row(const uchar *old_data, uchar *new_data)
+{
+  MYSQL_UPDATE_ROW_START(table_share->db.str, table_share->table_name.str);
+  increment_statistics(&SSV::ha_tmp_update_count);
+  int error= update_row(old_data, new_data);
+  MYSQL_UPDATE_ROW_DONE(error);
+  return error;
+}
+
 #endif /* MYSQL_SERVER */
 
 #endif /* SQL_CLASS_INCLUDED */
diff --git a/sql/sql_connect.cc b/sql/sql_connect.cc
index 3e7c7344a10..b2e37758d5b 100644
--- a/sql/sql_connect.cc
+++ b/sql/sql_connect.cc
@@ -60,6 +60,7 @@ int get_or_create_user_conn(THD *thd, const char *user,
 
   DBUG_ASSERT(user != 0);
   DBUG_ASSERT(host != 0);
+  DBUG_ASSERT(thd->user_connect == 0);
 
   user_len= strlen(user);
   temp_len= (strmov(strmov(temp_user, user)+1, host) - temp_user)+1;
@@ -119,7 +120,7 @@ end:
 
 int check_for_max_user_connections(THD *thd, USER_CONN *uc)
 {
-  int error=0;
+  int error= 1;
   DBUG_ENTER("check_for_max_user_connections");
 
   mysql_mutex_lock(&LOCK_user_conn);
@@ -128,7 +129,6 @@ int check_for_max_user_connections(THD *thd, USER_CONN *uc)
       global_system_variables.max_user_connections < (uint) uc->connections)
   {
     my_error(ER_TOO_MANY_USER_CONNECTIONS, MYF(0), uc->user);
-    error=1;
     goto end;
   }
   time_out_user_resource_limits(thd, uc);
@@ -138,7 +138,6 @@ int check_for_max_user_connections(THD *thd, USER_CONN *uc)
     my_error(ER_USER_LIMIT_REACHED, MYF(0), uc->user,
              "max_user_connections",
              (long) uc->user_resources.user_conn);
-    error= 1;
     goto end;
   }
   if (uc->user_resources.conn_per_hour &&
@@ -147,10 +146,10 @@ int check_for_max_user_connections(THD *thd, USER_CONN *uc)
     my_error(ER_USER_LIMIT_REACHED, MYF(0), uc->user,
              "max_connections_per_hour",
              (long) uc->user_resources.conn_per_hour);
-    error=1;
     goto end;
   }
   uc->conn_per_hour++;
+  error= 0;
 
 end:
   if (error)
@@ -709,6 +708,7 @@ static void update_global_user_stats_with_user(THD *thd,
   user_stats->binlog_bytes_written+=
     (thd->status_var.binlog_bytes_written -
      thd->org_status_var.binlog_bytes_written);
+  /* We are not counting rows in internal temporary tables here ! */
   user_stats->rows_read+=      (thd->status_var.rows_read -
                                 thd->org_status_var.rows_read);
   user_stats->rows_sent+=      (thd->status_var.rows_sent -
@@ -1044,8 +1044,14 @@ void end_connection(THD *thd)
 {
   NET *net= &thd->net;
   plugin_thdvar_cleanup(thd);
+
   if (thd->user_connect)
   {
+    /*
+      We decrease this variable early to make it easy to log in again quickly.
+      This code is not critical as we will in any case do this test
+      again in thd->cleanup()
+    */
     decrease_user_connections(thd->user_connect);
     /*
       The thread may returned back to the pool and assigned to a user
@@ -1175,7 +1181,7 @@ void do_handle_one_connection(THD *thd_arg)
 {
   THD *thd= thd_arg;
 
-  thd->thr_create_utime= my_micro_time();
+  thd->thr_create_utime= microsecond_interval_timer();
 
   if (MYSQL_CALLBACK_ELSE(thread_scheduler, init_new_connection_thread, (), 0))
   {
diff --git a/sql/sql_connect.h b/sql/sql_connect.h
index 6faf595bd17..32d6af72a92 100644
--- a/sql/sql_connect.h
+++ b/sql/sql_connect.h
@@ -42,7 +42,11 @@ bool init_new_connection_handler_thread();
 void reset_mqh(LEX_USER *lu, bool get_them);
 bool check_mqh(THD *thd, uint check_command);
 void time_out_user_resource_limits(THD *thd, USER_CONN *uc);
+#ifndef NO_EMBEDDED_ACCESS_CHECKS
 void decrease_user_connections(USER_CONN *uc);
+#else
+#define decrease_user_connections(X) do { } while(0)       /* nothing */
+#endif
 bool thd_init_client_charset(THD *thd, uint cs_number);
 bool setup_connection_thread_globals(THD *thd);
 bool thd_prepare_connection(THD *thd);
diff --git a/sql/sql_const.h b/sql/sql_const.h
index d08a8f18308..1b2580a2680 100644
--- a/sql/sql_const.h
+++ b/sql/sql_const.h
@@ -51,10 +51,13 @@
 #define MAX_BIT_FIELD_LENGTH    64      /* Max length in bits for bit fields */
 
 #define MAX_DATE_WIDTH		10	/* YYYY-MM-DD */
-#define MAX_TIME_WIDTH		23	/* -DDDDDD HH:MM:SS.###### */
+#define MIN_TIME_WIDTH          10      /* -HHH:MM:SS */
+#define MAX_TIME_WIDTH          16      /* -DDDDDD HH:MM:SS */
+#define MAX_TIME_FULL_WIDTH     23      /* -DDDDDD HH:MM:SS.###### */
 #define MAX_DATETIME_FULL_WIDTH 29	/* YYYY-MM-DD HH:MM:SS.###### AM */
 #define MAX_DATETIME_WIDTH	19	/* YYYY-MM-DD HH:MM:SS */
 #define MAX_DATETIME_COMPRESSED_WIDTH 14  /* YYYYMMDDHHMMSS */
+#define MAX_DATETIME_PRECISION  6
 
 #define MAX_TABLES	(sizeof(table_map)*8-3)	/* Max tables in join */
 #define PARAM_TABLE_BIT	(((table_map) 1) << (sizeof(table_map)*8-3))
@@ -68,7 +71,7 @@
 #define MAX_SELECT_NESTING (sizeof(nesting_map)*8-1)
 
 #define MAX_SORT_MEMORY 2048*1024
-#define MIN_SORT_MEMORY 32*1024
+#define MIN_SORT_MEMORY 1024
 
 /* Some portable defines */
 
@@ -167,7 +170,10 @@
   Number of comparisons of table rowids equivalent to reading one row from a 
   table.
 */
-#define TIME_FOR_COMPARE_ROWID  (TIME_FOR_COMPARE*2)
+#define TIME_FOR_COMPARE_ROWID  (TIME_FOR_COMPARE*100)
+
+/* cost1 is better than cost2 only if cost1 + COST_EPS < cost2 */
+#define COST_EPS  0.001
 
 /*
   For sequential disk seeks the cost formula is:
@@ -225,7 +231,6 @@
 #define DELAYED_LIMIT		100		/**< pause after xxx inserts */
 #define DELAYED_QUEUE_SIZE	1000
 #define DELAYED_WAIT_TIMEOUT	5*60		/**< Wait for delayed insert */
-#define FLUSH_TIME		0		/**< Don't flush tables */
 #define MAX_CONNECT_ERRORS	10		///< errors before disabling host
 
 #define LONG_TIMEOUT ((ulong) 3600L*24L*365L)
@@ -237,8 +242,6 @@
 #define MAX_TIME_ZONE_NAME_LENGTH       (NAME_LEN + 1)
 
 #if defined(__WIN__)
-#undef	FLUSH_TIME
-#define FLUSH_TIME	1800			/**< Flush every half hour */
 
 #define INTERRUPT_PRIOR -2
 #define CONNECT_PRIOR	-1
diff --git a/sql/sql_cursor.cc b/sql/sql_cursor.cc
index 96a2d50538b..7350618adc0 100644
--- a/sql/sql_cursor.cc
+++ b/sql/sql_cursor.cc
@@ -329,7 +329,7 @@ void Materialized_cursor::fetch(ulong num_rows)
       If network write failed (i.e. due to a closed socked),
       the error has already been set. Just return.
     */
-    if (result->send_data(item_list))
+    if (result->send_data(item_list) > 0)
       return;
   }
 
@@ -386,7 +386,7 @@ bool Select_materialize::send_result_set_metadata(List<Item> &list, uint flags)
   if (create_result_table(unit->thd, unit->get_unit_column_types(),
                           FALSE,
                           thd->variables.option_bits | TMP_TABLE_ALL_COLUMNS,
-                          "", FALSE))
+                          "", FALSE, TRUE))
     return TRUE;
 
   materialized_cursor= new (&table->mem_root)
diff --git a/sql/sql_db.cc b/sql/sql_db.cc
index 56dd266acb9..96ca4c10390 100644
--- a/sql/sql_db.cc
+++ b/sql/sql_db.cc
@@ -33,6 +33,7 @@
 #include <mysys_err.h>
 #include "sp.h"
 #include "events.h"
+#include "sql_handler.h"
 #include <my_dir.h>
 #include <m_ctype.h>
 #include "log.h"
@@ -858,7 +859,7 @@ bool mysql_rm_db(THD *thd,char *db,bool if_exists, bool silent)
 
     ha_drop_database(path);
     tmp_disable_binlog(thd);
-    query_cache_invalidate1(db);
+    query_cache_invalidate1(thd, db);
     (void) sp_drop_db_routines(thd, db); /* @todo Do not ignore errors */
 #ifdef HAVE_EVENT_SCHEDULER
     Events::drop_schema_events(thd, db);
diff --git a/sql/sql_delete.cc b/sql/sql_delete.cc
index 3a7cc38b097..9cc63f7b33d 100644
--- a/sql/sql_delete.cc
+++ b/sql/sql_delete.cc
@@ -36,8 +36,8 @@
 #include "sql_trigger.h"
 #include "transaction.h"
 #include "records.h"                            // init_read_record,
+#include "sql_derived.h"                        // mysql_handle_list_of_derived
                                                 // end_read_record
-
 /**
   Implement DELETE SQL word.
 
@@ -69,10 +69,21 @@ bool mysql_delete(THD *thd, TABLE_LIST *table_list, COND *conds,
 
   if (open_and_lock_tables(thd, table_list, TRUE, 0))
     DBUG_RETURN(TRUE);
-  if (!(table= table_list->table))
+
+  if (mysql_handle_list_of_derived(thd->lex, table_list, DT_MERGE_FOR_INSERT))
+    DBUG_RETURN(TRUE);
+  if (mysql_handle_list_of_derived(thd->lex, table_list, DT_PREPARE))
+    DBUG_RETURN(TRUE);
+
+  if (!table_list->updatable)
+  {
+     my_error(ER_NON_UPDATABLE_TABLE, MYF(0), table_list->alias, "DELETE");
+     DBUG_RETURN(TRUE);
+  }
+  if (!(table= table_list->table) || !table->created)
   {
-    my_error(ER_VIEW_DELETE_MERGE_VIEW, MYF(0),
-	     table_list->view_db.str, table_list->view_name.str);
+      my_error(ER_VIEW_DELETE_MERGE_VIEW, MYF(0),
+	       table_list->view_db.str, table_list->view_name.str);
     DBUG_RETURN(TRUE);
   }
   thd_proc_info(thd, "init");
@@ -81,6 +92,11 @@ bool mysql_delete(THD *thd, TABLE_LIST *table_list, COND *conds,
   if (mysql_prepare_delete(thd, table_list, &conds))
     DBUG_RETURN(TRUE);
 
+  if (thd->lex->current_select->first_cond_optimization)
+  {
+    thd->lex->current_select->save_leaf_tables(thd);
+    thd->lex->current_select->first_cond_optimization= 0;
+  }
   /* check ORDER BY even if it can be ignored */
   if (order)
   {
@@ -102,6 +118,10 @@ bool mysql_delete(THD *thd, TABLE_LIST *table_list, COND *conds,
     }
   }
 
+  /* Apply the IN=>EXISTS transformation to all subqueries and optimize them. */
+  if (select_lex->optimize_unflattened_subqueries())
+    DBUG_RETURN(TRUE);
+
   const_cond= (!conds || conds->const_item());
   safe_update=test(thd->variables.option_bits & OPTION_SAFE_UPDATES);
   if (safe_update && const_cond)
@@ -382,6 +402,12 @@ cleanup:
     query_cache_invalidate3(thd, table_list, 1);
   }
 
+  if (thd->lex->current_select->first_cond_optimization)
+  {
+    thd->lex->current_select->save_leaf_tables(thd);
+    thd->lex->current_select->first_cond_optimization= 0;
+  }
+
   delete select;
   transactional_table= table->file->has_transactions();
 
@@ -453,8 +479,8 @@ int mysql_prepare_delete(THD *thd, TABLE_LIST *table_list, Item **conds)
   if (setup_tables_and_check_access(thd, &thd->lex->select_lex.context,
                                     &thd->lex->select_lex.top_join_list,
                                     table_list, 
-                                    &select_lex->leaf_tables, FALSE, 
-                                    DELETE_ACL, SELECT_ACL) ||
+                                    select_lex->leaf_tables, FALSE, 
+                                    DELETE_ACL, SELECT_ACL, TRUE) ||
       setup_conds(thd, table_list, select_lex->leaf_tables, conds) ||
       setup_ftfuncs(select_lex))
     DBUG_RETURN(TRUE);
@@ -476,7 +502,7 @@ int mysql_prepare_delete(THD *thd, TABLE_LIST *table_list, Item **conds)
     fix_inner_refs(thd, all_fields, select_lex, select_lex->ref_pointer_array))
     DBUG_RETURN(TRUE);
 
-  select_lex->fix_prepare_information(thd, conds, &fake_conds);
+  select_lex->fix_prepare_information(thd, conds, &fake_conds); 
   DBUG_RETURN(FALSE);
 }
 
@@ -512,6 +538,12 @@ int mysql_multi_delete_prepare(THD *thd)
   TABLE_LIST *target_tbl;
   DBUG_ENTER("mysql_multi_delete_prepare");
 
+  if (mysql_handle_derived(lex, DT_INIT))
+    DBUG_RETURN(TRUE);
+  if (mysql_handle_derived(lex, DT_MERGE_FOR_INSERT))
+    DBUG_RETURN(TRUE);
+  if (mysql_handle_derived(lex, DT_PREPARE))
+    DBUG_RETURN(TRUE);
   /*
     setup_tables() need for VIEWs. JOIN::prepare() will not do it second
     time.
@@ -521,10 +553,12 @@ int mysql_multi_delete_prepare(THD *thd)
   if (setup_tables_and_check_access(thd, &thd->lex->select_lex.context,
                                     &thd->lex->select_lex.top_join_list,
                                     lex->query_tables,
-                                    &lex->select_lex.leaf_tables, FALSE, 
-                                    DELETE_ACL, SELECT_ACL))
+                                    lex->select_lex.leaf_tables, FALSE, 
+                                    DELETE_ACL, SELECT_ACL, FALSE))
     DBUG_RETURN(TRUE);
 
+  if (lex->select_lex.handle_derived(thd->lex, DT_MERGE))  
+    DBUG_RETURN(TRUE);
 
   /*
     Multi-delete can't be constructed over-union => we always have
@@ -536,14 +570,14 @@ int mysql_multi_delete_prepare(THD *thd)
        target_tbl;
        target_tbl= target_tbl->next_local)
   {
-    if (!(target_tbl->table= target_tbl->correspondent_table->table))
+
+    target_tbl->table= target_tbl->correspondent_table->table;
+    if (target_tbl->correspondent_table->is_multitable())
     {
-      DBUG_ASSERT(target_tbl->correspondent_table->view &&
-                  target_tbl->correspondent_table->multitable_view);
-      my_error(ER_VIEW_DELETE_MERGE_VIEW, MYF(0),
-               target_tbl->correspondent_table->view_db.str,
-               target_tbl->correspondent_table->view_name.str);
-      DBUG_RETURN(TRUE);
+       my_error(ER_VIEW_DELETE_MERGE_VIEW, MYF(0),
+                target_tbl->correspondent_table->view_db.str,
+                target_tbl->correspondent_table->view_name.str);
+       DBUG_RETURN(TRUE);
     }
 
     if (!target_tbl->correspondent_table->updatable ||
@@ -573,6 +607,10 @@ int mysql_multi_delete_prepare(THD *thd)
     with further calls to unique_table
   */
   lex->select_lex.exclude_from_table_unique_test= FALSE;
+  
+  if (lex->select_lex.save_prep_leaf_tables(thd))
+    DBUG_RETURN(TRUE);
+  
   DBUG_RETURN(FALSE);
 }
 
@@ -593,6 +631,12 @@ multi_delete::prepare(List<Item> &values, SELECT_LEX_UNIT *u)
   unit= u;
   do_delete= 1;
   thd_proc_info(thd, "deleting from main table");
+  SELECT_LEX *select_lex= u->first_select();
+  if (select_lex->first_cond_optimization)
+  {
+    if (select_lex->handle_derived(thd->lex, DT_MERGE))
+      DBUG_RETURN(TRUE);
+  }
   DBUG_RETURN(0);
 }
 
@@ -626,9 +670,10 @@ multi_delete::initialize_tables(JOIN *join)
 
 
   walk= delete_tables;
-  for (JOIN_TAB *tab=join->join_tab, *end=join->join_tab+join->tables;
-       tab < end;
-       tab++)
+
+  for (JOIN_TAB *tab= first_linear_tab(join, WITH_CONST_TABLES); 
+       tab; 
+       tab= next_linear_tab(join, tab, WITH_BUSH_ROOTS))
   {
     if (tab->table->map & tables_to_delete_from)
     {
@@ -707,7 +752,7 @@ multi_delete::~multi_delete()
 }
 
 
-bool multi_delete::send_data(List<Item> &values)
+int multi_delete::send_data(List<Item> &values)
 {
   int secure_counter= delete_while_scanning ? -1 : 0;
   TABLE_LIST *del_table;
diff --git a/sql/sql_derived.cc b/sql/sql_derived.cc
index 0c7ffb48935..03cbf38d4ef 100644
--- a/sql/sql_derived.cc
+++ b/sql/sql_derived.cc
@@ -25,40 +25,85 @@
 #include "unireg.h"
 #include "sql_derived.h"
 #include "sql_select.h"
+#include "sql_base.h"
 #include "sql_view.h"                         // check_duplicate_names
 #include "sql_acl.h"                          // SELECT_ACL
 
+typedef bool (*dt_processor)(THD *thd, LEX *lex, TABLE_LIST *derived);
+
+bool mysql_derived_init(THD *thd, LEX *lex, TABLE_LIST *derived);
+bool mysql_derived_prepare(THD *thd, LEX *lex, TABLE_LIST *derived);
+bool mysql_derived_optimize(THD *thd, LEX *lex, TABLE_LIST *derived);
+bool mysql_derived_merge(THD *thd, LEX *lex, TABLE_LIST *derived);
+bool mysql_derived_create(THD *thd, LEX *lex, TABLE_LIST *derived);
+bool mysql_derived_fill(THD *thd, LEX *lex, TABLE_LIST *derived);
+bool mysql_derived_reinit(THD *thd, LEX *lex, TABLE_LIST *derived);
+bool mysql_derived_merge_for_insert(THD *thd, LEX *lex, TABLE_LIST *derived);
+
+
+dt_processor processors[]=
+{
+  &mysql_derived_init,
+  &mysql_derived_prepare,
+  &mysql_derived_optimize,
+  &mysql_derived_merge,
+  &mysql_derived_merge_for_insert,
+  &mysql_derived_create,
+  &mysql_derived_fill,
+  &mysql_derived_reinit,
+};
 
 /*
-  Call given derived table processor (preparing or filling tables)
+  Run specified phases on all derived tables/views in given LEX.
 
-  SYNOPSIS
-    mysql_handle_derived()
-    lex                 LEX for this thread
-    processor           procedure of derived table processing
+  @param lex              LEX for this thread
+  @param phases           phases to run derived tables/views through
 
-  RETURN
-    FALSE  OK
-    TRUE   Error
+  @return FALSE  OK
+  @return TRUE   Error
 */
-
 bool
-mysql_handle_derived(LEX *lex, bool (*processor)(THD*, LEX*, TABLE_LIST*))
+mysql_handle_derived(LEX *lex, uint phases)
 {
   bool res= FALSE;
-  if (lex->derived_tables)
+  THD *thd= lex->thd;
+  if (!lex->derived_tables)
+    return FALSE;
+
+  lex->thd->derived_tables_processing= TRUE;
+
+  for (uint phase= 0; phase < DT_PHASES && !res; phase++)
   {
-    lex->thd->derived_tables_processing= TRUE;
+    uint phase_flag= DT_INIT << phase;
+    if (phase_flag > phases)
+      break;
+    if (!(phases & phase_flag))
+      continue;
+    if (phase_flag >= DT_CREATE && !thd->fill_derived_tables())
+      break;
+
     for (SELECT_LEX *sl= lex->all_selects_list;
-	 sl;
+	 sl && !res;
 	 sl= sl->next_select_in_list())
     {
       for (TABLE_LIST *cursor= sl->get_table_list();
-	   cursor;
+	   cursor && !res;
 	   cursor= cursor->next_local)
       {
-	if ((res= (*processor)(lex->thd, lex, cursor)))
-	  goto out;
+        if (!cursor->is_view_or_derived() && phases == DT_MERGE_FOR_INSERT)
+          continue;
+        uint8 allowed_phases= (cursor->is_merged_derived() ? DT_PHASES_MERGE :
+                               DT_PHASES_MATERIALIZE | DT_MERGE_FOR_INSERT);
+        /*
+          Skip derived tables to which the phase isn't applicable.
+          TODO: mark derived at the parse time, later set its type
+          (merged or materialized)
+        */
+        if ((phase_flag != DT_PREPARE && !(allowed_phases & phase_flag)) ||
+            (cursor->merged_for_insert && phase_flag != DT_REINIT &&
+             phase_flag != DT_PREPARE))
+          continue;
+	res= (*processors[phase])(lex->thd, lex, cursor);
       }
       if (lex->describe)
       {
@@ -71,30 +116,435 @@ mysql_handle_derived(LEX *lex, bool (*processor)(THD*, LEX*, TABLE_LIST*))
       }
     }
   }
-out:
+  lex->thd->derived_tables_processing= FALSE;
+  return res;
+}
+
+/*
+  Run through phases for the given derived table/view.
+
+  @param lex             LEX for this thread
+  @param derived         the derived table to handle
+  @param phases          phases to process tables/views through
+
+  @details
+
+  This function processes the derived table (view) 'derived' to perform all
+  actions that are to be done on the table at the phases specified by
+  'phases'. The processing is carried out starting from the actions
+  performed at the earlier phases (those having smaller ordinal numbers).
+
+  @note
+  This function runs specified phases of the derived tables handling on the
+  given derived table/view. This function is used in the chain of calls:
+    SELECT_LEX::handle_derived ->
+      TABLE_LIST::handle_derived ->
+        mysql_handle_single_derived
+  This chain of calls implements the bottom-up handling of the derived tables:
+  i.e. most inner derived tables/views are handled first. This order is
+  required for all phases except the merge and the create steps.
+  For the sake of code simplicity this order is kept for all phases.
+
+  @return FALSE ok
+  @return TRUE  error
+*/
+
+bool
+mysql_handle_single_derived(LEX *lex, TABLE_LIST *derived, uint phases)
+{
+  bool res= FALSE;
+  THD *thd= lex->thd;
+  uint8 allowed_phases= (derived->is_merged_derived() ? DT_PHASES_MERGE :
+                         DT_PHASES_MATERIALIZE);
+  if (!lex->derived_tables)
+    return FALSE;
+
+  lex->thd->derived_tables_processing= TRUE;
+
+  for (uint phase= 0; phase < DT_PHASES; phase++)
+  {
+    uint phase_flag= DT_INIT << phase;
+    if (phase_flag > phases)
+      break;
+    if (!(phases & phase_flag))
+      continue;
+    /* Skip derived tables to which the phase isn't applicable.  */
+    if (phase_flag != DT_PREPARE &&
+        !(allowed_phases & phase_flag))
+      continue;
+    if (phase_flag >= DT_CREATE && !thd->fill_derived_tables())
+      break;
+
+    if ((res= (*processors[phase])(lex->thd, lex, derived)))
+      break;
+  }
   lex->thd->derived_tables_processing= FALSE;
   return res;
 }
 
 
 /**
-  @brief Create temporary table structure (but do not fill it).
+  Run specified phases for derived tables/views in the given list
 
-  @param thd Thread handle
-  @param lex LEX for this thread
-  @param orig_table_list TABLE_LIST for the upper SELECT
+  @param lex        LEX for this thread
+  @param table_list list of derived tables/views to handle
+  @param phases     phases to process tables/views through
 
-  @details 
+  @details
+  This function runs the phases specified by 'phases' on derived
+  tables/views found in 'table_list' with the help of the
+  TABLE_LIST::handle_derived function.
+  'lex' is passed as an argument to the TABLE_LIST::handle_derived.
 
-  This function is called before any command containing derived tables is
-  executed. Currently the function is used for derived tables, i.e.
+  @return FALSE ok
+  @return TRUE  error
+*/
 
-  - Anonymous derived tables, or 
-  - Named derived tables (aka views) with the @c TEMPTABLE algorithm.
-   
-  The table reference, contained in @c orig_table_list, is updated with the
-  fields of a new temporary table.
+bool
+mysql_handle_list_of_derived(LEX *lex, TABLE_LIST *table_list, uint phases)
+{
+  for (TABLE_LIST *tl= table_list; tl; tl= tl->next_local)
+  {
+    if (tl->is_view_or_derived() &&
+        tl->handle_derived(lex, phases))
+      return TRUE;
+  }
+  return FALSE;
+}
+
+
+/**
+  Merge a derived table/view into the embedding select
+
+  @param thd     thread handle
+  @param lex     LEX of the embedding query.
+  @param derived reference to the derived table.
+
+  @details
+  This function merges the given derived table / view into the parent select
+  construction. Any derived table/reference to a view occurring in the FROM
+  clause of the embedding select is represented by a TABLE_LIST structure, a
+  pointer to which is passed to the function in the parameter 'derived'.
+  This structure contains  the number/map, alias, a link to SELECT_LEX of the
+  derived table and other info. If the 'derived' table is used in a nested join
+  then additionally the structure contains a reference to the ON expression
+  for this join.
+
+  The merge process results in elimination of the derived table (or the
+  reference to a view) such that:
+    - the FROM list of the derived table/view is wrapped into a nested join
+      after which the nest is added to the FROM list of the embedding select
+    - the WHERE condition of the derived table (view) is ANDed with the ON
+      condition attached to the table.
+
+  @note
+  Tables are merged into the leaf_tables list, original derived table is removed
+  from this list also. SELECT_LEX::table_list list is left untouched.
+  Where expression is merged with derived table's on_expr and can be found after
+  the merge through the SELECT_LEX::table_list.
+
+  Examples of the derived table/view merge:
+
+  Schema:
+  Tables: t1(f1), t2(f2), t3(f3)
+  View v1: SELECT f1 FROM t1 WHERE f1 < 1
+
+  Example with a view:
+    Before merge:
+
+    The query (Q1): SELECT f1,f2 FROM t2 LEFT JOIN v1 ON f1 = f2
+
+       (LEX of the main query)
+                 |
+           (select_lex)
+                 |
+         (FROM table list)
+                 |
+            (join list)= t2, v1
+                             / \
+                            /  (on_expr)= (f1 = f2)
+                            |
+                    (LEX of the v1 view)
+                            |
+                       (select_lex)= SELECT f1 FROM t1 WHERE f1 < 1
+
+
+    After merge:
+
+    The rewritten query Q1 (Q1'):
+      SELECT f1,f2 FROM t2 LEFT JOIN (t1) ON ((f1 = f2) and (f1 < 1))
+
+        (LEX of the main query)
+                   |
+             (select_lex)
+                   |
+           (FROM table list)
+                   |
+               (join list)= t2, (t1)
+                                    \
+                                   (on_expr)= (f1 = f2) and (f1 < 1)
+
+    In this example table numbers are assigned as follows:
+      (outer select): t2 - 1, v1 - 2
+      (inner select): t1 - 1
+    After the merge table numbers will be:
+      (outer select): t2 - 1, t1 - 2
+
+  Example with a derived table:
+    The query Q2:
+      SELECT f1,f2
+       FROM (SELECT f1 FROM t1, t3 WHERE f1=f3 and f1 < 1) tt, t2
+       WHERE f1 = f2
+
+    Before merge:
+              (LEX of the main query)
+                        |
+                  (select_lex)
+                  /           \
+       (FROM table list)   (WHERE clause)= (f1 = f2)
+                  |
+           (join list)= tt, t2
+                       / \
+                      /  (on_expr)= (empty)
+                     /
+           (select_lex)= SELECT f1 FROM t1, t3 WHERE f1 = f3 and f1 < 1
+
+    After merge:
+
+    The rewritten query Q2 (Q2'):
+      SELECT f1,f2
+       FROM (t1, t3) JOIN t2 ON (f1 = f3 and f1 < 1)
+       WHERE f1 = f2
+
+              (LEX of the main query)
+                        |
+                  (select_lex)
+                  /           \
+       (FROM table list)   (WHERE clause)= (f1 = f2)
+                 |
+          (join list)= t2, (t1, t3)
+                                   \
+                                 (on_expr)= (f1 = f3 and f1 < 1)
+
+  In this example table numbers are assigned as follows:
+    (outer select): tt - 1, t2 - 2
+    (inner select): t1 - 1, t3 - 2
+  After the merge table numbers will be:
+    (outer select): t1 - 1, t2 - 2, t3 - 3
+
+  @return FALSE if derived table/view were successfully merged.
+  @return TRUE if an error occurs.
+*/
+
+bool mysql_derived_merge(THD *thd, LEX *lex, TABLE_LIST *derived)
+{
+  bool res= FALSE;
+  SELECT_LEX *dt_select= derived->get_single_select();
+  table_map map;
+  uint tablenr;
+  SELECT_LEX *parent_lex= derived->select_lex;
+  Query_arena *arena, backup;
+
+  if (derived->merged)
+    return FALSE;
+
+ if (thd->lex->sql_command == SQLCOM_UPDATE_MULTI ||
+     thd->lex->sql_command == SQLCOM_DELETE_MULTI)
+   thd->save_prep_leaf_list= TRUE;
+
+  arena= thd->activate_stmt_arena_if_needed(&backup);  // For easier test
+  derived->merged= TRUE;
+
+  if (!derived->merged_for_insert || 
+      (derived->is_multitable() && 
+       (thd->lex->sql_command == SQLCOM_UPDATE_MULTI ||
+        thd->lex->sql_command == SQLCOM_DELETE_MULTI)))
+  {
+    /*
+      Check whether there are enough free bits in table map to merge subquery.
+      If not - materialize it. This check isn't cached so when there is a big
+      and small subqueries, and the bigger one can't be merged it wouldn't
+      block the smaller one.
+    */
+    if (parent_lex->get_free_table_map(&map, &tablenr))
+    {
+      /* There are not enough table bits, fall back to materialization. */
+      derived->change_refs_to_fields();
+      derived->set_materialized_derived();
+      goto exit_merge;
+    }
+
+    if (dt_select->leaf_tables.elements + tablenr > MAX_TABLES)
+    {
+      /* There are not enough table bits, fall back to materialization. */
+      derived->change_refs_to_fields();
+      derived->set_materialized_derived();
+      goto exit_merge;
+    }
+
+    if (dt_select->options & OPTION_SCHEMA_TABLE)
+      parent_lex->options |= OPTION_SCHEMA_TABLE;
+
+    parent_lex->cond_count+= dt_select->cond_count;
+
+    if (!derived->get_unit()->prepared)
+    {
+      dt_select->leaf_tables.empty();
+      make_leaves_list(dt_select->leaf_tables, derived, TRUE, 0);
+    } 
+
+    derived->nested_join= (NESTED_JOIN*) thd->calloc(sizeof(NESTED_JOIN));
+    if (!derived->nested_join)
+    {
+      res= TRUE;
+      goto exit_merge;
+    }
+
+    /* Merge derived table's subquery in the parent select. */
+    if (parent_lex->merge_subquery(thd, derived, dt_select, tablenr, map))
+    {
+      res= TRUE;
+      goto exit_merge;
+    }
+
+    /*
+      exclude select lex so it doesn't show up in explain.
+      do this only for derived table as for views this is already done.
+
+      From sql_view.cc
+        Add subqueries units to SELECT into which we merging current view.
+        unit(->next)* chain starts with subqueries that are used by this
+        view and continues with subqueries that are used by other views.
+        We must not add any subquery twice (otherwise we'll form a loop),
+        to do this we remember in end_unit the first subquery that has
+        been already added.
+    */
+    derived->get_unit()->exclude_level();
+    if (parent_lex->join) 
+      parent_lex->join->table_count+= dt_select->join->table_count - 1;
+  }
+  if (derived->get_unit()->prepared)
+  {
+    Item *expr= derived->on_expr;
+    expr= and_conds(expr, dt_select->join ? dt_select->join->conds : 0);
+    if (expr && (derived->prep_on_expr || expr != derived->on_expr))
+    {
+      derived->on_expr= expr;
+      derived->prep_on_expr= expr->copy_andor_structure(thd);
+    }
+    if (derived->on_expr &&
+        ((!derived->on_expr->fixed &&
+          derived->on_expr->fix_fields(thd, &derived->on_expr)) ||
+          derived->on_expr->check_cols(1)))
+    {
+      res= TRUE; /* purecov: inspected */
+      goto exit_merge;
+    }
+    // Update used tables cache according to new table map
+    if (derived->on_expr)
+    {
+      derived->on_expr->fix_after_pullout(parent_lex, &derived->on_expr);
+      fix_list_after_tbl_changes(parent_lex, &derived->nested_join->join_list);
+    }
+  }
+
+exit_merge:
+  if (arena)
+    thd->restore_active_arena(arena, &backup);
+  return res;
+}
+
+
+/**
+  Merge a view for the embedding INSERT/UPDATE/DELETE
+
+  @param thd     thread handle
+  @param lex     LEX of the embedding query.
+  @param derived reference to the derived table.
+
+  @details
+  This function substitutes the derived table for the first table from
+  the query of the derived table thus making it a correct target table for the
+  INSERT/UPDATE/DELETE statements. As this operation is correct for
+  single table views only, for multi table views this function does nothing.
+  The derived parameter isn't checked to be a view as derived tables aren't
+  allowed for INSERT/UPDATE/DELETE statements.
+
+  @return FALSE if derived table/view were successfully merged.
+  @return TRUE if an error occurs.
+*/
+
+bool mysql_derived_merge_for_insert(THD *thd, LEX *lex, TABLE_LIST *derived)
+{
+  if (derived->merged_for_insert)
+    return FALSE;
+  if (derived->is_materialized_derived())
+    return mysql_derived_prepare(thd, lex, derived);
+  if (!derived->is_multitable())
+  {
+    if (!derived->updatable)
+      return derived->create_field_translation(thd);
+    if (derived->merge_underlying_list)
+    {
+      derived->table= derived->merge_underlying_list->table;
+      derived->schema_table= derived->merge_underlying_list->schema_table;
+      derived->merged_for_insert= TRUE;
+    }
+  }  
+  return FALSE;
+}
+
+
+/*
+  Initialize a derived table/view
+
+  @param thd	     Thread handle
+  @param lex         LEX of the embedding query.
+  @param derived     reference to the derived table.
+
+  @detail
+  Fill info about derived table/view without preparing an
+  underlying select. Such as: create a field translation for views, mark it as
+  a multitable if it is and so on.
+
+  @return
+    false  OK
+    true   Error
+*/
 
+
+bool mysql_derived_init(THD *thd, LEX *lex, TABLE_LIST *derived)
+{
+  SELECT_LEX_UNIT *unit= derived->get_unit();
+  DBUG_ENTER("mysql_derived_init");
+
+  // Skip already prepared views/DT
+  if (!unit || unit->prepared)
+    DBUG_RETURN(FALSE);
+
+  DBUG_RETURN(derived->init_derived(thd, TRUE));
+}
+
+
+/*
+  Create temporary table structure (but do not fill it)
+
+  @param thd	     Thread handle
+  @param lex         LEX of the embedding query.
+  @param derived     reference to the derived table.
+
+  @detail
+  Prepare underlying select for a derived table/view. To properly resolve
+  names in the embedding query the TABLE structure is created. Actual table
+  is created later by the mysql_derived_create function.
+
+  This function is called before any command containing derived table
+  is executed. All types of derived tables are handled by this function:
+  - Anonymous derived tables, or
+  - Named derived tables (aka views).
+
+  The table reference, contained in @c derived, is updated with the
+  fields of a new temporary table.
   Derived tables are stored in @c thd->derived_tables and closed by
   close_thread_tables().
 
@@ -118,211 +568,362 @@ out:
   the state of privilege checking (GRANT_INFO struct) is copied as-is to the
   temporary table.
 
-  This function implements a signature called "derived table processor", and
-  is passed as a function pointer to mysql_handle_derived().
+  Only the TABLE structure is created here; the actual table is created by
+  the mysql_derived_create function.
 
   @note This function sets @c SELECT_ACL for @c TEMPTABLE views as well as
   anonymous derived tables, but this is ok since later access checking will
   distinguish between them.
 
-  @see mysql_handle_derived(), mysql_derived_filling(), GRANT_INFO
+  @see mysql_handle_derived(), mysql_derived_fill(), GRANT_INFO
 
   @return
     false  OK
     true   Error
 */
 
-bool mysql_derived_prepare(THD *thd, LEX *lex, TABLE_LIST *orig_table_list)
+bool mysql_derived_prepare(THD *thd, LEX *lex, TABLE_LIST *derived)
 {
-  SELECT_LEX_UNIT *unit= orig_table_list->derived;
-  ulonglong create_options;
+  SELECT_LEX_UNIT *unit= derived->get_unit();
   DBUG_ENTER("mysql_derived_prepare");
   bool res= FALSE;
-  if (unit)
+
+  // Skip already prepared views/DT
+  if (!unit || unit->prepared ||
+      (derived->merged_for_insert && 
+       !(derived->is_multitable() &&
+         (thd->lex->sql_command == SQLCOM_UPDATE_MULTI ||
+          thd->lex->sql_command == SQLCOM_DELETE_MULTI))))
+    DBUG_RETURN(FALSE);
+
+  Query_arena *arena= thd->stmt_arena, backup;
+  if (arena->is_conventional())
+    arena= 0;                                   // For easier test
+  else
+    thd->set_n_backup_active_arena(arena, &backup);
+
+  SELECT_LEX *first_select= unit->first_select();
+
+  /* prevent name resolving out of derived table */
+  for (SELECT_LEX *sl= first_select; sl; sl= sl->next_select())
   {
-    SELECT_LEX *first_select= unit->first_select();
-    TABLE *table= 0;
-    select_union *derived_result;
-
-    /* prevent name resolving out of derived table */
-    for (SELECT_LEX *sl= first_select; sl; sl= sl->next_select())
-      sl->context.outer_context= 0;
-
-    if (!(derived_result= new select_union))
-      DBUG_RETURN(TRUE); // out of memory
-
-    lex->context_analysis_only|= CONTEXT_ANALYSIS_ONLY_DERIVED;
-    // st_select_lex_unit::prepare correctly work for single select
-    if ((res= unit->prepare(thd, derived_result, 0)))
-      goto exit;
-    lex->context_analysis_only&= ~CONTEXT_ANALYSIS_ONLY_DERIVED;
-    if ((res= check_duplicate_names(unit->types, 0)))
-      goto exit;
-
-    create_options= (first_select->options | thd->variables.option_bits |
-                     TMP_TABLE_ALL_COLUMNS);
-    /*
-      Temp table is created so that it hounours if UNION without ALL is to be 
-      processed
-
-      As 'distinct' parameter we always pass FALSE (0), because underlying
-      query will control distinct condition by itself. Correct test of
-      distinct underlying query will be is_union &&
-      !unit->union_distinct->next_select() (i.e. it is union and last distinct
-      SELECT is last SELECT of UNION).
-    */
-    if ((res= derived_result->create_result_table(thd, &unit->types, FALSE,
-                                                 create_options,
-                                                 orig_table_list->alias,
-                                                 FALSE)))
-      goto exit;
+    sl->context.outer_context= 0;
+    // Prepare underlying views/DT first.
+    sl->handle_derived(lex, DT_PREPARE);
+  }
 
-    table= derived_result->table;
+  unit->derived= derived;
+
+  if (!(derived->derived_result= new select_union))
+    DBUG_RETURN(TRUE); // out of memory
+
+  lex->context_analysis_only|= CONTEXT_ANALYSIS_ONLY_DERIVED;
+  // st_select_lex_unit::prepare works correctly for a single select
+  if ((res= unit->prepare(thd, derived->derived_result, 0)))
+    goto exit;
+  lex->context_analysis_only&= ~CONTEXT_ANALYSIS_ONLY_DERIVED;
+  if ((res= check_duplicate_names(unit->types, 0)))
+    goto exit;
+
+  /*
+    Check whether we can merge this derived table into main select.
+    Depending on the result field translation will or will not
+    be created.
+  */
+  if (derived->init_derived(thd, FALSE))
+    goto exit;
+
+  /*
+    Temp table is created so that it honours UNION without ALL, if that is to
+    be processed
+
+    As 'distinct' parameter we always pass FALSE (0), because underlying
+    query will control distinct condition by itself. Correct test of
+    distinct underlying query will be is_union &&
+    !unit->union_distinct->next_select() (i.e. it is union and last distinct
+    SELECT is last SELECT of UNION).
+  */
+  thd->create_tmp_table_for_derived= TRUE;
+  if (derived->derived_result->create_result_table(thd, &unit->types, FALSE,
+                                                (first_select->options |
+                                                 thd->variables.option_bits |
+                                                 TMP_TABLE_ALL_COLUMNS),
+                                                derived->alias,
+                                                FALSE, FALSE))
+  { 
+    thd->create_tmp_table_for_derived= FALSE;
+    goto exit;
+  }
+  thd->create_tmp_table_for_derived= FALSE;
+
+  derived->table= derived->derived_result->table;
+  if (derived->is_derived() && derived->is_merged_derived())
+    first_select->mark_as_belong_to_derived(derived);
 
 exit:
-    /* Hide "Unknown column" or "Unknown function" error */
-    if (orig_table_list->view)
+  /* Hide "Unknown column" or "Unknown function" error */
+  if (derived->view)
+  {
+    if (thd->is_error() &&
+        (thd->stmt_da->sql_errno() == ER_BAD_FIELD_ERROR ||
+        thd->stmt_da->sql_errno() == ER_FUNC_INEXISTENT_NAME_COLLISION ||
+        thd->stmt_da->sql_errno() == ER_SP_DOES_NOT_EXIST))
     {
-      if (thd->is_error() &&
-          (thd->stmt_da->sql_errno() == ER_BAD_FIELD_ERROR ||
-          thd->stmt_da->sql_errno() == ER_FUNC_INEXISTENT_NAME_COLLISION ||
-          thd->stmt_da->sql_errno() == ER_SP_DOES_NOT_EXIST))
-      {
-        thd->clear_error();
-        my_error(ER_VIEW_INVALID, MYF(0), orig_table_list->db,
-                 orig_table_list->table_name);
-      }
+      thd->clear_error();
+      my_error(ER_VIEW_INVALID, MYF(0), derived->db,
+               derived->table_name);
     }
+  }
 
-    /*
-      if it is preparation PS only or commands that need only VIEW structure
-      then we do not need real data and we can skip execution (and parameters
-      is not defined, too)
-    */
-    if (res)
-    {
-      if (table)
-	free_tmp_table(thd, table);
-      delete derived_result;
-    }
+  /*
+    If this is only the preparation of a PS, or a command that needs only the
+    VIEW structure, then we do not need real data and can skip execution (and
+    the parameters are not defined yet, either)
+  */
+  if (res)
+  {
+    if (derived->table)
+      free_tmp_table(thd, derived->table);
+    delete derived->derived_result;
+  }
+  else
+  {
+    TABLE *table= derived->table;
+    table->derived_select_number= first_select->select_number;
+    table->s->tmp_table= NON_TRANSACTIONAL_TMP_TABLE;
+#ifndef NO_EMBEDDED_ACCESS_CHECKS
+    if (derived->referencing_view)
+      table->grant= derived->grant;
     else
     {
-      if (!thd->fill_derived_tables())
-      {
-	delete derived_result;
-	derived_result= NULL;
-      }
-      orig_table_list->derived_result= derived_result;
-      orig_table_list->table= table;
-      orig_table_list->table_name=        table->s->table_name.str;
-      orig_table_list->table_name_length= table->s->table_name.length;
-      table->derived_select_number= first_select->select_number;
-      table->s->tmp_table= NON_TRANSACTIONAL_TMP_TABLE;
-#ifndef NO_EMBEDDED_ACCESS_CHECKS
-      if (orig_table_list->referencing_view)
-        table->grant= orig_table_list->grant;
-      else
-        table->grant.privilege= SELECT_ACL;
-#endif
-      orig_table_list->db= (char *)"";
-      orig_table_list->db_length= 0;
-      // Force read of table stats in the optimizer
-      table->file->info(HA_STATUS_VARIABLE);
-      /* Add new temporary table to list of open derived tables */
-      table->next= thd->derived_tables;
-      thd->derived_tables= table;
+      table->grant.privilege= SELECT_ACL;
+      if (derived->is_derived())
+        derived->grant.privilege= SELECT_ACL;
     }
+#endif
+    /* Add new temporary table to list of open derived tables */
+    table->next= thd->derived_tables;
+    thd->derived_tables= table;
   }
-  else if (orig_table_list->merge_underlying_list)
-    orig_table_list->set_underlying_merge();
+  if (arena)
+    thd->restore_active_arena(arena, &backup);
   DBUG_RETURN(res);
 }
 
 
-/*
-  fill derived table
-
-  SYNOPSIS
-    mysql_derived_filling()
-    thd			Thread handle
-    lex                 LEX for this thread
-    unit                node that contains all SELECT's for derived tables
-    orig_table_list     TABLE_LIST for the upper SELECT
-
-  IMPLEMENTATION
-    Derived table is resolved with temporary table. It is created based on the
-    queries defined. After temporary table is filled, if this is not EXPLAIN,
-    then the entire unit / node is deleted. unit is deleted if UNION is used
-    for derived table and node is deleted is it is a  simple SELECT.
-    If you use this function, make sure it's not called at prepare.
-    Due to evaluation of LIMIT clause it can not be used at prepared stage.
-
-  RETURN
-    FALSE  OK
-    TRUE   Error
+/**
+  Runs optimize phase for a derived table/view.
+
+  @param thd     thread handle
+  @param lex     LEX of the embedding query.
+  @param derived reference to the derived table.
+
+  @details
+  Runs optimize phase for given 'derived' derived table/view.
+  If the optimizer finds out that it is of the type "SELECT a_constant", then
+  this function also materializes it.
+
+  @return FALSE ok.
+  @return TRUE if an error occurs.
 */
 
-bool mysql_derived_filling(THD *thd, LEX *lex, TABLE_LIST *orig_table_list)
+bool mysql_derived_optimize(THD *thd, LEX *lex, TABLE_LIST *derived)
 {
-  TABLE *table= orig_table_list->table;
-  SELECT_LEX_UNIT *unit= orig_table_list->derived;
+  SELECT_LEX_UNIT *unit= derived->get_unit();
+  SELECT_LEX *first_select= unit->first_select();
+  SELECT_LEX *save_current_select= lex->current_select;
+
   bool res= FALSE;
 
-  /*check that table creation pass without problem and it is derived table */
-  if (table && unit)
+  if (unit->optimized)
+    return FALSE;
+  lex->current_select= first_select;
+
+  if (unit->is_union())
   {
-    SELECT_LEX *first_select= unit->first_select();
-    select_union *derived_result= orig_table_list->derived_result;
-    SELECT_LEX *save_current_select= lex->current_select;
-    if (unit->is_union())
-    {
-      // execute union without clean up
-      res= unit->exec();
-    }
-    else
+    // optimize union without execution
+    res= unit->optimize();
+  }
+  else if (unit->derived)
+  {
+    if (!derived->is_merged_derived())
     {
-      unit->set_limit(first_select);
-      if (unit->select_limit_cnt == HA_POS_ERROR)
-	first_select->options&= ~OPTION_FOUND_ROWS;
-
-      lex->current_select= first_select;
-      res= mysql_select(thd, &first_select->ref_pointer_array,
-			first_select->table_list.first,
-			first_select->with_wild,
-			first_select->item_list, first_select->where,
-			(first_select->order_list.elements+
-			 first_select->group_list.elements),
-			first_select->order_list.first,
-			first_select->group_list.first,
-			first_select->having, (ORDER*) NULL,
-			(first_select->options | thd->variables.option_bits |
-			 SELECT_NO_UNLOCK),
-			derived_result, unit, first_select);
+      JOIN *join= first_select->join;
+      unit->optimized= TRUE;
+      if ((res= join->optimize()))
+        goto err;
+      if (join->table_count == join->const_tables)
+        derived->fill_me= TRUE;
     }
-
-    if (!res)
+  }
+  /*
+    Materialize derived tables/views of the "SELECT a_constant" type.
+    Such tables should be materialized at the optimization phase for
+    correct constant evaluation.
+  */
+  if (!res && derived->fill_me && !derived->merged_for_insert)
+  {
+    if (derived->is_merged_derived())
     {
-      /*
-        Here we entirely fix both TABLE_LIST and list of SELECT's as
-        there were no derived tables
-      */
-      if (derived_result->flush())
-        res= TRUE;
+      derived->change_refs_to_fields();
+      derived->set_materialized_derived();
     }
-    lex->current_select= save_current_select;
+    if ((res= mysql_derived_create(thd, lex, derived)))
+      goto err;
+    if ((res= mysql_derived_fill(thd, lex, derived)))
+      goto err;
   }
+err:
+  lex->current_select= save_current_select;
   return res;
 }
 
 
 /**
-   Cleans up the SELECT_LEX_UNIT for the derived table (if any).
+  Actually create result table for a materialized derived table/view.
+
+  @param thd     thread handle
+  @param lex     LEX of the embedding query.
+  @param derived reference to the derived table.
+
+  @details
+  This function actually creates the result table for given 'derived'
+  table/view, but it doesn't fill it.
+  'lex' is not used by this function; 'thd' supplies the session variables.
+
+  @return FALSE ok.
+  @return TRUE if an error occurs.
+*/
+
+bool mysql_derived_create(THD *thd, LEX *lex, TABLE_LIST *derived)
+{
+  TABLE *table= derived->table;
+  SELECT_LEX_UNIT *unit= derived->get_unit();
+
+  if (table->created)
+    return FALSE;
+  select_union *result= (select_union*)unit->result;
+  if (table->s->db_type() == TMP_ENGINE_HTON)
+  {
+    if (create_internal_tmp_table(table, result->tmp_table_param.keyinfo,
+                                  result->tmp_table_param.start_recinfo,
+                                  &result->tmp_table_param.recinfo,
+                                  (unit->first_select()->options |
+                                   thd->variables.option_bits | TMP_TABLE_ALL_COLUMNS),
+                                  thd->variables.big_tables))
+      return(TRUE);
+  }
+  if (open_tmp_table(table))
+    return TRUE;
+  table->file->extra(HA_EXTRA_WRITE_CACHE);
+  table->file->extra(HA_EXTRA_IGNORE_DUP_KEY);
+  return FALSE;
+}
+
+
+/*
+  Execute subquery of a materialized derived table/view and fill the result
+  table.
+
+  @param thd      Thread handle
+  @param lex      LEX for this thread
+  @param derived  reference to the derived table.
+
+  @details
+  Execute subquery of given 'derived' table/view and fill the result
+  table. After result table is filled, if this is not the EXPLAIN statement,
+  the entire unit / node is deleted. unit is deleted if UNION is used
+  for derived table and node is deleted if it is a simple SELECT.
+  'lex' supplies current_select and describe; 'thd' goes to mysql_select().
+
+  @note
+  If you use this function, make sure it's not called at prepare.
+  Due to evaluation of LIMIT clause it can not be used at prepared stage.
+
+  @return FALSE  OK
+  @return TRUE   Error
 */
 
-bool mysql_derived_cleanup(THD *thd, LEX *lex, TABLE_LIST *derived)
+bool mysql_derived_fill(THD *thd, LEX *lex, TABLE_LIST *derived)
 {
-  SELECT_LEX_UNIT *unit= derived->derived;
-  if (unit)
+  SELECT_LEX_UNIT *unit= derived->get_unit();
+  bool res= FALSE;
+
+  if (unit->executed && !unit->uncacheable && !unit->describe)
+    return FALSE;
+  /*check that table creation passed without problems. */
+  DBUG_ASSERT(derived->table && derived->table->created);
+  SELECT_LEX *first_select= unit->first_select();
+  select_union *derived_result= derived->derived_result;
+  SELECT_LEX *save_current_select= lex->current_select;
+  if (unit->is_union())
+  {
+    // execute union without clean up
+    res= unit->exec();
+  }
+  else
+  {
+    unit->set_limit(first_select);
+    if (unit->select_limit_cnt == HA_POS_ERROR)
+      first_select->options&= ~OPTION_FOUND_ROWS;
+
+    lex->current_select= first_select;
+    res= mysql_select(thd, &first_select->ref_pointer_array,
+                      first_select->table_list.first,
+                      first_select->with_wild,
+                      first_select->item_list, first_select->where,
+                      (first_select->order_list.elements+
+                       first_select->group_list.elements),
+                      first_select->order_list.first,
+                      first_select->group_list.first,
+                      first_select->having, (ORDER*) NULL,
+                      (first_select->options |thd->variables.option_bits |
+                       SELECT_NO_UNLOCK),
+                      derived_result, unit, first_select);
+  }
+
+  if (!res)
+  {
+    if (derived_result->flush())
+      res= TRUE;
+    unit->executed= TRUE;
+  }
+  if (res || !lex->describe) 
     unit->cleanup();
-  return false;
+  lex->current_select= save_current_select;
+
+  return res;
 }
+
+
+/**
+  Re-initialize given derived table/view for the next execution.
+
+  @param  thd         thread handle
+  @param  lex         LEX for this thread
+  @param  derived     reference to the derived table.
+
+  @details
+  Re-initialize given 'derived' table/view for the next execution.
+  All underlying views/derived tables are recursively reinitialized prior
+  to re-initialization of given derived table.
+  'thd' is passed to unit->set_thd(); 'lex' is not used by this function.
+
+  @return FALSE  OK
+  @return TRUE   Error
+*/
+
+bool mysql_derived_reinit(THD *thd, LEX *lex, TABLE_LIST *derived)
+{
+  st_select_lex_unit *unit= derived->get_unit();
+
+  if (derived->table)
+    derived->merged_for_insert= FALSE;
+  unit->unclean();
+  unit->types.empty();
+  /* for derived tables & PS (which can't be reset by Item_subquery) */
+  unit->reinit_exec_mechanism();
+  unit->set_thd(thd);
+  return FALSE;
+}
+
diff --git a/sql/sql_derived.h b/sql/sql_derived.h
index 11a6fd4105e..e96c363d5f5 100644
--- a/sql/sql_derived.h
+++ b/sql/sql_derived.h
@@ -20,11 +20,9 @@ struct TABLE_LIST;
 class THD;
 struct LEX;
 
-bool mysql_handle_derived(LEX *lex, bool (*processor)(THD *thd,
-                                                      LEX *lex,
-                                                      TABLE_LIST *table));
-bool mysql_derived_prepare(THD *thd, LEX *lex, TABLE_LIST *t);
-bool mysql_derived_filling(THD *thd, LEX *lex, TABLE_LIST *t);
+bool mysql_handle_derived(LEX *lex, uint phases);
+bool mysql_handle_single_derived(LEX *lex, TABLE_LIST *derived, uint phases);
+bool mysql_handle_list_of_derived(LEX *lex, TABLE_LIST *dt_list, uint phases);
 
 /**
    Cleans up the SELECT_LEX_UNIT for the derived table (if any).
diff --git a/sql/sql_error.h b/sql/sql_error.h
index 5b9b5ee639a..00ade934226 100644
--- a/sql/sql_error.h
+++ b/sql/sql_error.h
@@ -504,34 +504,76 @@ private:
 extern char *err_conv(char *buff, uint to_length, const char *from,
                       uint from_length, CHARSET_INFO *from_cs);
 
-class ErrConvString
+class ErrConv
 {
-  char err_buffer[MYSQL_ERRMSG_SIZE];
+protected:
+  mutable char err_buffer[MYSQL_ERRMSG_SIZE];
 public:
+  ErrConv() {}
+  virtual ~ErrConv() {}
+  virtual const char *ptr() const = 0;
+};
+
+class ErrConvString : public ErrConv
+{
+  const char *str;
+  size_t len;
+  CHARSET_INFO *cs;
+public:
+  ErrConvString(const char *str_arg, size_t len_arg, CHARSET_INFO *cs_arg)
+    : ErrConv(), str(str_arg), len(len_arg), cs(cs_arg) {}
+  ErrConvString(String *s)
+    : ErrConv(), str(s->ptr()), len(s->length()), cs(s->charset()) {}
+  const char *ptr() const
+  { return err_conv(err_buffer, sizeof(err_buffer), str, len, cs); }
+};
 
-  ErrConvString(String *str)
+class ErrConvInteger : public ErrConv
+{
+  longlong num;
+public:
+  ErrConvInteger(longlong num_arg) : ErrConv(), num(num_arg) {}
+  const char *ptr() const
+  { return llstr(num, err_buffer); }
+};
+
+class ErrConvDouble: public ErrConv
+{
+  double num;
+public:
+  ErrConvDouble(double num_arg) : ErrConv(), num(num_arg) {}
+  const char *ptr() const
   {
-    (void) err_conv(err_buffer, sizeof(err_buffer), str->ptr(),
-                    str->length(), str->charset());
+    my_gcvt(num, MY_GCVT_ARG_DOUBLE, sizeof(err_buffer), err_buffer, 0);
+    return err_buffer;
   }
+};
 
-  ErrConvString(const char *str, CHARSET_INFO* cs)
+class ErrConvTime : public ErrConv
+{
+  const MYSQL_TIME *ltime;
+public:
+  ErrConvTime(const MYSQL_TIME *ltime_arg) : ErrConv(), ltime(ltime_arg) {}
+  const char *ptr() const
   {
-    (void) err_conv(err_buffer, sizeof(err_buffer),
-                    str, strlen(str), cs);
+    my_TIME_to_str(ltime, err_buffer, AUTO_SEC_PART_DIGITS);
+    return err_buffer;
   }
+};
 
-  ErrConvString(const char *str, uint length, CHARSET_INFO* cs)
+class ErrConvDecimal : public ErrConv
+{
+  const decimal_t *d;
+public:
+  ErrConvDecimal(const decimal_t *d_arg) : ErrConv(), d(d_arg) {}
+  const char *ptr() const
   {
-    (void) err_conv(err_buffer, sizeof(err_buffer),
-                    str, length, cs);
+    int len= sizeof(err_buffer);
+    decimal2string(d, err_buffer, &len, 0, 0, ' ');
+    return err_buffer;
   }
-
-  ~ErrConvString() { };
-  char *ptr() { return err_buffer; }
 };
 
-
 void push_warning(THD *thd, MYSQL_ERROR::enum_warning_level level,
                   uint code, const char *msg);
 void push_warning_printf(THD *thd, MYSQL_ERROR::enum_warning_level level,
diff --git a/sql/sql_expression_cache.cc b/sql/sql_expression_cache.cc
index d91868ca916..3be6dea7df9 100644
--- a/sql/sql_expression_cache.cc
+++ b/sql/sql_expression_cache.cc
@@ -1,8 +1,38 @@
+/* Copyright (C) 2010-2011 Monty Program Ab & Oleksandr Byelkin
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
 
 #include "sql_base.h"
 #include "sql_select.h"
 #include "sql_expression_cache.h"
 
+/**
+  Minimum hit ratio to proceed on disk if the in-memory table overflowed.
+  hit_rate = hit / (miss + hit);
+*/
+#define EXPCACHE_MIN_HIT_RATE_FOR_DISK_TABLE 0.7
+/**
+  Minimum hit ratio to keep in memory table (do not switch cache off)
+  hit_rate = hit / (miss + hit);
+*/
+#define EXPCACHE_MIN_HIT_RATE_FOR_MEM_TABLE  0.2
+/**
+  Number of cache misses after which the hit ratio is checked (bounds the
+  cache's performance impact in the case when the cache is not applicable)
+*/
+#define EXPCACHE_CHECK_HIT_RATIO_AFTER 200
+
 /*
   Expression cache is used only for caching subqueries now, so its statistic
   variables we call subquery_cache*.
@@ -10,10 +40,10 @@
 ulong subquery_cache_miss, subquery_cache_hit;
 
 Expression_cache_tmptable::Expression_cache_tmptable(THD *thd,
-                                                 List<Item*> &dependants,
-                                                 Item *value)
-  :cache_table(NULL), table_thd(thd), list(&dependants), val(value),
-   inited (0)
+                                                     List<Item> &dependants,
+                                                     Item *value)
+  :cache_table(NULL), table_thd(thd), items(dependants), val(value),
+   hit(0), miss(0), inited (0)
 {
   DBUG_ENTER("Expression_cache_tmptable::Expression_cache_tmptable");
   DBUG_VOID_RETURN;
@@ -21,6 +51,18 @@ Expression_cache_tmptable::Expression_cache_tmptable(THD *thd,
 
 
 /**
+  Disable cache
+*/
+
+void Expression_cache_tmptable::disable_cache()
+{
+  cache_table->file->ha_index_end();
+  free_tmp_table(table_thd, cache_table);
+  cache_table= NULL;
+}
+
+
+/**
   Field enumerator for TABLE::add_tmp_key
 
   @param arg             reference variable with current field number
@@ -47,56 +89,38 @@ static uint field_enumerator(uchar *arg)
 
 void Expression_cache_tmptable::init()
 {
-  List_iterator<Item*> li(*list);
-  Item_iterator_ref_list it(li);
-  Item **item;
+  List_iterator<Item> li(items);
+  Item_iterator_list it(li);
   uint field_counter;
   DBUG_ENTER("Expression_cache_tmptable::init");
   DBUG_ASSERT(!inited);
   inited= TRUE;
   cache_table= NULL;
 
-  while ((item= li++))
-  {
-    DBUG_ASSERT(item);
-    if (*item)
-    {
-      DBUG_ASSERT((*item)->fixed);
-      items.push_back((*item));
-    }
-    else
-    {
-      /*
-        This is possible when optimizer already executed this subquery and
-        optimized out the condition predicate.
-      */
-      li.remove();
-    }
-  }
-
-  if (list->elements == 0)
+  if (items.elements == 0)
   {
     DBUG_PRINT("info", ("All parameters were removed by optimizer."));
     DBUG_VOID_RETURN;
   }
 
+  /* add result field */
+  items.push_front(val);
+
   cache_table_param.init();
   /* dependent items and result */
-  cache_table_param.field_count= list->elements + 1;
+  cache_table_param.field_count= items.elements;
   /* postpone table creation to index description */
   cache_table_param.skip_create_table= 1;
-  cache_table= NULL;
-
-  items.push_front(val);
 
   if (!(cache_table= create_tmp_table(table_thd, &cache_table_param,
                                       items, (ORDER*) NULL,
-                                      FALSE, FALSE,
+                                      FALSE, TRUE,
                                       ((table_thd->variables.option_bits |
                                         TMP_TABLE_ALL_COLUMNS) &
                                         ~TMP_TABLE_FORCE_MYISAM),
                                       HA_POS_ERROR,
-                                      (char *)"subquery-cache-table")))
+                                      (char *)"subquery-cache-table",
+                                      TRUE)))
   {
     DBUG_PRINT("error", ("create_tmp_table failed, caching switched off"));
     DBUG_VOID_RETURN;
@@ -108,22 +132,18 @@ void Expression_cache_tmptable::init()
     goto error;
   }
 
-  /* This list do not contain result field */
-  it.open();
-
-  field_counter=1;
+  field_counter= 1;
 
   if (cache_table->alloc_keys(1) ||
       cache_table->add_tmp_key(0, items.elements - 1, &field_enumerator,
                                 (uchar*)&field_counter, TRUE) ||
       ref.tmp_table_index_lookup_init(table_thd, cache_table->key_info, it,
-                                      TRUE))
+                                      TRUE, 1 /* skip result field*/))
   {
     DBUG_PRINT("error", ("creating index failed"));
     goto error;
   }
   cache_table->s->keys= 1;
-  cache_table->s->uniques= 1;
   ref.null_rejecting= 1;
   ref.disable_cache= FALSE;
   ref.has_record= 0;
@@ -145,20 +165,19 @@ void Expression_cache_tmptable::init()
   DBUG_VOID_RETURN;
 
 error:
-  /* switch off cache */
-  free_tmp_table(table_thd, cache_table);
-  cache_table= NULL;
+  disable_cache();
   DBUG_VOID_RETURN;
 }
 
 
 Expression_cache_tmptable::~Expression_cache_tmptable()
 {
+  /* Add accumulated statistics */
+  statistic_add(subquery_cache_miss, miss, &LOCK_status);
+  statistic_add(subquery_cache_hit, hit, &LOCK_status);
+
   if (cache_table)
-  {
-    cache_table->file->ha_index_end();
-    free_tmp_table(table_thd, cache_table);
-  }
+    disable_cache();
 }
 
 
@@ -182,26 +201,28 @@ Expression_cache::result Expression_cache_tmptable::check_value(Item **value)
   int res;
   DBUG_ENTER("Expression_cache_tmptable::check_value");
 
-  /*
-    We defer cache initialization to get item references that are
-    used at the execution phase.
-  */
-  if (!inited)
-    init();
-
   if (cache_table)
   {
     DBUG_PRINT("info", ("status: %u  has_record %u",
                         (uint)cache_table->status, (uint)ref.has_record));
     if ((res= join_read_key2(table_thd, NULL, cache_table, &ref)) == 1)
       DBUG_RETURN(ERROR);
+
     if (res)
     {
-      subquery_cache_miss++;
+      if (((++miss) == EXPCACHE_CHECK_HIT_RATIO_AFTER) &&
+          ((double)hit / ((double)hit + miss)) <
+          EXPCACHE_MIN_HIT_RATE_FOR_MEM_TABLE)
+      {
+        DBUG_PRINT("info",
+                   ("Early check: hit rate is not so good to keep the cache"));
+        disable_cache();
+      }
+
       DBUG_RETURN(MISS);
     }
 
-    subquery_cache_hit++;
+    hit++;
     *value= cached_result;
     DBUG_RETURN(Expression_cache::HIT);
   }
@@ -239,15 +260,37 @@ my_bool Expression_cache_tmptable::put_value(Item *value)
   if (table_thd->is_error())
     goto err;;
 
-  if ((error= cache_table->file->ha_write_row(cache_table->record[0])))
+  if ((error= cache_table->file->ha_write_tmp_row(cache_table->record[0])))
   {
     /* create_myisam_from_heap will generate error if needed */
-    if (cache_table->file->is_fatal_error(error, HA_CHECK_DUP) &&
-        create_internal_tmp_table_from_heap(table_thd, cache_table,
-                                            cache_table_param.start_recinfo,
-                                            &cache_table_param.recinfo,
-                                            error, 1))
+    if (cache_table->file->is_fatal_error(error, HA_CHECK_DUP))
       goto err;
+    else
+    {
+      double hit_rate= ((double)hit / ((double)hit + miss));
+      DBUG_ASSERT(miss > 0);
+      if (hit_rate < EXPCACHE_MIN_HIT_RATE_FOR_MEM_TABLE)
+      {
+        DBUG_PRINT("info", ("hit rate is not so good to keep the cache"));
+        disable_cache();
+        DBUG_RETURN(FALSE);
+      }
+      else if (hit_rate < EXPCACHE_MIN_HIT_RATE_FOR_DISK_TABLE)
+      {
+        DBUG_PRINT("info", ("hit rate is not so good to go to disk"));
+        if (cache_table->file->ha_delete_all_rows() ||
+            cache_table->file->ha_write_tmp_row(cache_table->record[0]))
+          goto err;
+      }
+      else
+      {
+        if (create_internal_tmp_table_from_heap(table_thd, cache_table,
+                                                cache_table_param.start_recinfo,
+                                                &cache_table_param.recinfo,
+                                                error, 1))
+          goto err;
+      }
+    }
   }
   cache_table->status= 0; /* cache_table->record contains an existed record */
   ref.has_record= TRUE; /* the same as above */
@@ -256,25 +299,24 @@ my_bool Expression_cache_tmptable::put_value(Item *value)
   DBUG_RETURN(FALSE);
 
 err:
-  cache_table->file->ha_index_end();
-  free_tmp_table(table_thd, cache_table);
-  cache_table= NULL;
+  disable_cache();
   DBUG_RETURN(TRUE);
 }
 
 
 void Expression_cache_tmptable::print(String *str, enum_query_type query_type)
 {
-  List_iterator<Item*> li(*list);
-  Item **item;
+  List_iterator<Item> li(items);
+  Item *item;
   bool is_first= TRUE;
 
   str->append('<');
+  li++;  // skip result field
   while ((item= li++))
   {
     if (!is_first)
       str->append(',');
-    (*item)->print(str, query_type);
+    item->print(str, query_type);
     is_first= FALSE;
   }
   str->append('>');
diff --git a/sql/sql_expression_cache.h b/sql/sql_expression_cache.h
index 88f71e0cf32..32aecc61dc9 100644
--- a/sql/sql_expression_cache.h
+++ b/sql/sql_expression_cache.h
@@ -37,6 +37,15 @@ public:
     Print cache parameters
   */
   virtual void print(String *str, enum_query_type query_type)= 0;
+
+  /**
+    Is this cache initialized
+  */
+  virtual bool is_inited()= 0;
+  /**
+    Initialize this cache
+  */
+  virtual void init()= 0;
 };
 
 struct st_table_ref;
@@ -51,15 +60,17 @@ class Item_field;
 class Expression_cache_tmptable :public Expression_cache
 {
 public:
-  Expression_cache_tmptable(THD *thd, List<Item*> &dependants, Item *value);
+  Expression_cache_tmptable(THD *thd, List<Item> &dependants, Item *value);
   virtual ~Expression_cache_tmptable();
   virtual result check_value(Item **value);
   virtual my_bool put_value(Item *value);
 
   void print(String *str, enum_query_type query_type);
+  bool is_inited() { return inited; };
+  void init();
 
 private:
-  void init();
+  void disable_cache();
 
   /* tmp table parameters */
   TMP_TABLE_PARAM cache_table_param;
@@ -71,12 +82,12 @@ private:
   struct st_table_ref ref;
   /* Cached result */
   Item_field *cached_result;
-  /* List of references to the parameters of the expression */
-  List<Item*> *list;
-  /* List of items */
-  List<Item> items;
+  /* List of parameter items */
+  List<Item> &items;
   /* Value Item example */
   Item *val;
+  /* hit/miss counters */
+  uint hit, miss;
   /* Set on if the object has been succesfully initialized with init() */
   bool inited;
 };
diff --git a/sql/sql_handler.cc b/sql/sql_handler.cc
index 3ea378ef19c..460d2f07cfd 100644
--- a/sql/sql_handler.cc
+++ b/sql/sql_handler.cc
@@ -1,4 +1,5 @@
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2011, Oracle and/or its affiliates.
+   Copyright (c) 2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -61,12 +62,38 @@
 #include "sql_select.h"
 #include "transaction.h"
 
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation				// gcc: Class implementation
+#endif
+
 #define HANDLER_TABLES_HASH_SIZE 120
 
 static enum enum_ha_read_modes rkey_to_rnext[]=
 { RNEXT_SAME, RNEXT, RPREV, RNEXT, RPREV, RNEXT, RPREV, RPREV };
 
 /*
+  Set handler to state after create, but keep base information about
+  which table is used
+*/
+
+void SQL_HANDLER::reset()
+{
+  fields.empty();
+  arena.free_items();
+  free_root(&mem_root, MYF(0));
+  my_free(lock);
+  init();
+}  
+  
+/* Free all allocated data */
+
+SQL_HANDLER::~SQL_HANDLER()
+{
+  reset();
+  my_free(base_data);
+}
+
+/*
   Get hash key and hash key length.
 
   SYNOPSIS
@@ -85,11 +112,11 @@ static enum enum_ha_read_modes rkey_to_rnext[]=
     Pointer to the TABLE_LIST struct.
 */
 
-static char *mysql_ha_hash_get_key(TABLE_LIST *tables, size_t *key_len_p,
+static char *mysql_ha_hash_get_key(SQL_HANDLER *table, size_t *key_len,
                                    my_bool first __attribute__((unused)))
 {
-  *key_len_p= strlen(tables->alias) + 1 ; /* include '\0' in comparisons */
-  return tables->alias;
+  *key_len= table->handler_name.length + 1 ; /* include '\0' in comparisons */
+  return table->handler_name.str;
 }
 
 
@@ -107,9 +134,9 @@ static char *mysql_ha_hash_get_key(TABLE_LIST *tables, size_t *key_len_p,
     Nothing
 */
 
-static void mysql_ha_hash_free(TABLE_LIST *tables)
+static void mysql_ha_hash_free(SQL_HANDLER *table)
 {
-  my_free(tables);
+  delete table;
 }
 
 /**
@@ -120,34 +147,43 @@ static void mysql_ha_hash_free(TABLE_LIST *tables)
 
   @note Though this function takes a list of tables, only the first list entry
   will be closed.
+  @note handler_object is not deleted!
   @note Broadcasts refresh if it closed a table with old version.
 */
 
-static void mysql_ha_close_table(THD *thd, TABLE_LIST *tables)
+static void mysql_ha_close_table(SQL_HANDLER *handler)
 {
+  THD *thd= handler->thd;
+  TABLE *table= handler->table;
 
-  if (tables->table && !tables->table->s->tmp_table)
+  /* check if table was already closed */
+  if (!table)
+    return;
+
+  if (!table->s->tmp_table)
   {
     /* Non temporary table. */
-    tables->table->file->ha_index_or_rnd_end();
-    tables->table->open_by_handler= 0;
-    (void) close_thread_table(thd, &tables->table);
-    thd->mdl_context.release_lock(tables->mdl_request.ticket);
+    if (handler->lock)
+    {
+      // Mark it unlocked, like in reset_lock_data()
+      reset_lock_data(handler->lock, 1);
+    }
+
+    table->file->ha_index_or_rnd_end();
+    table->open_by_handler= 0;
+    (void) close_thread_table(thd, &table);
+    thd->mdl_context.release_lock(handler->mdl_request.ticket);
   }
-  else if (tables->table)
+  else
   {
     /* Must be a temporary table */
-    TABLE *table= tables->table;
     table->file->ha_index_or_rnd_end();
     table->query_id= thd->query_id;
     table->open_by_handler= 0;
     mark_tmp_table_for_reuse(table);
   }
-
-  /* Mark table as closed, ready for re-open if necessary. */
-  tables->table= NULL;
-  /* Safety, cleanup the pointer to satisfy MDL assertions. */
-  tables->mdl_request.ticket= NULL;
+  my_free(handler->lock);
+  handler->init();
 }
 
 /*
@@ -163,7 +199,7 @@ static void mysql_ha_close_table(THD *thd, TABLE_LIST *tables)
     Though this function takes a list of tables, only the first list entry
     will be opened.
     'reopen' is set when a handler table is to be re-opened. In this case,
-    'tables' is the pointer to the hashed TABLE_LIST object which has been
+    'tables' is the pointer to the hashed SQL_HANDLER object which has been
     saved on the original open.
     'reopen' is also used to suppress the sending of an 'ok' message.
 
@@ -172,18 +208,18 @@ static void mysql_ha_close_table(THD *thd, TABLE_LIST *tables)
     TRUE  Error
 */
 
-bool mysql_ha_open(THD *thd, TABLE_LIST *tables, bool reopen)
+bool mysql_ha_open(THD *thd, TABLE_LIST *tables, SQL_HANDLER *reopen)
 {
-  TABLE_LIST    *hash_tables = NULL;
-  char          *db, *name, *alias;
-  uint          dblen, namelen, aliaslen, counter;
+  SQL_HANDLER   *sql_handler= 0;
+  uint          counter;
   bool          error;
-  TABLE         *backup_open_tables;
+  TABLE         *table, *backup_open_tables;
   MDL_savepoint mdl_savepoint;
+  Query_arena backup_arena;
   DBUG_ENTER("mysql_ha_open");
   DBUG_PRINT("enter",("'%s'.'%s' as '%s'  reopen: %d",
                       tables->db, tables->table_name, tables->alias,
-                      (int) reopen));
+                      reopen != 0));
 
   if (thd->locked_tables_mode)
   {
@@ -201,7 +237,7 @@ bool mysql_ha_open(THD *thd, TABLE_LIST *tables, bool reopen)
   if (! my_hash_inited(&thd->handler_tables_hash))
   {
     /*
-      HASH entries are of type TABLE_LIST.
+      HASH entries are of type SQL_HANDLER
     */
     if (my_hash_init(&thd->handler_tables_hash, &my_charset_latin1,
                      HANDLER_TABLES_HASH_SIZE, 0, 0,
@@ -224,51 +260,6 @@ bool mysql_ha_open(THD *thd, TABLE_LIST *tables, bool reopen)
     }
   }
 
-  if (! reopen)
-  {
-    /* copy the TABLE_LIST struct */
-    dblen= strlen(tables->db) + 1;
-    namelen= strlen(tables->table_name) + 1;
-    aliaslen= strlen(tables->alias) + 1;
-    if (!(my_multi_malloc(MYF(MY_WME),
-                          &hash_tables, (uint) sizeof(*hash_tables),
-                          &db, (uint) dblen,
-                          &name, (uint) namelen,
-                          &alias, (uint) aliaslen,
-                          NullS)))
-    {
-      DBUG_PRINT("exit",("ERROR"));
-      DBUG_RETURN(TRUE);
-    }
-    /* structure copy */
-    *hash_tables= *tables;
-    hash_tables->db= db;
-    hash_tables->table_name= name;
-    hash_tables->alias= alias;
-    memcpy(hash_tables->db, tables->db, dblen);
-    memcpy(hash_tables->table_name, tables->table_name, namelen);
-    memcpy(hash_tables->alias, tables->alias, aliaslen);
-    /*
-      We can't request lock with explicit duration for this table
-      right from the start as open_tables() can't handle properly
-      back-off for such locks.
-    */
-    hash_tables->mdl_request.init(MDL_key::TABLE, db, name, MDL_SHARED,
-                                  MDL_TRANSACTION);
-    /* for now HANDLER can be used only for real TABLES */
-    hash_tables->required_type= FRMTYPE_TABLE;
-
-    /* add to hash */
-    if (my_hash_insert(&thd->handler_tables_hash, (uchar*) hash_tables))
-    {
-      my_free(hash_tables);
-      DBUG_PRINT("exit",("ERROR"));
-      DBUG_RETURN(TRUE);
-    }
-  }
-  else
-    hash_tables= tables;
-
   /*
     Save and reset the open_tables list so that open_tables() won't
     be able to access (or know about) the previous list. And on return
@@ -279,62 +270,115 @@ bool mysql_ha_open(THD *thd, TABLE_LIST *tables, bool reopen)
   */
   backup_open_tables= thd->open_tables;
   thd->set_open_tables(NULL);
-  mdl_savepoint= thd->mdl_context.mdl_savepoint();
 
   /*
-    open_tables() will set 'hash_tables->table' if successful.
+    open_tables() will set 'tables->table' if successful.
     It must be NULL for a real open when calling open_tables().
   */
-  DBUG_ASSERT(! hash_tables->table);
+  DBUG_ASSERT(! tables->table);
+
+  /*
+    We can't request lock with explicit duration for this table
+    right from the start as open_tables() can't handle properly
+    back-off for such locks.
+  */
+  tables->mdl_request.init(MDL_key::TABLE, tables->db, tables->table_name,
+                           MDL_SHARED, MDL_TRANSACTION);
+  mdl_savepoint= thd->mdl_context.mdl_savepoint();
+
+  /* for now HANDLER can be used only for real TABLES */
+  tables->required_type= FRMTYPE_TABLE;
 
   /*
     We use open_tables() here, rather than, say,
     open_ltable() or open_table() because we would like to be able
     to open a temporary table.
   */
-  error= open_tables(thd, &hash_tables, &counter, 0);
+  error= open_tables(thd, &tables, &counter, 0);
+
+  if (error)
+    goto err;
 
-  if (! error &&
-      ! (hash_tables->table->file->ha_table_flags() & HA_CAN_SQL_HANDLER))
+  table= tables->table;
+
+  /* There can be only one table in '*tables'. */
+  if (! (table->file->ha_table_flags() & HA_CAN_SQL_HANDLER))
   {
     my_error(ER_ILLEGAL_HA, MYF(0), tables->alias);
-    error= TRUE;
+    goto err;
   }
-  if (!error &&
-      hash_tables->mdl_request.ticket &&
-      thd->mdl_context.has_lock(mdl_savepoint,
-                                hash_tables->mdl_request.ticket))
+
+  if (tables->mdl_request.ticket &&
+      thd->mdl_context.has_lock(mdl_savepoint, tables->mdl_request.ticket))
   {
     /* The ticket returned is within a savepoint. Make a copy.  */
-    error= thd->mdl_context.clone_ticket(&hash_tables->mdl_request);
-    hash_tables->table->mdl_ticket= hash_tables->mdl_request.ticket;
+    error= thd->mdl_context.clone_ticket(&tables->mdl_request);
+    tables->table->mdl_ticket= tables->mdl_request.ticket;
+    if (error)
+      goto err;
   }
-  if (error)
+
+  if (! reopen)
   {
-    /*
-      No need to rollback statement transaction, it's not started.
-      If called with reopen flag, no need to rollback either,
-      it will be done at statement end.
-    */
-    DBUG_ASSERT(thd->transaction.stmt.is_empty());
-    close_thread_tables(thd);
-    thd->mdl_context.rollback_to_savepoint(mdl_savepoint);
-    thd->set_open_tables(backup_open_tables);
-    if (!reopen)
-      my_hash_delete(&thd->handler_tables_hash, (uchar*) hash_tables);
-    else
-    {
-      hash_tables->table= NULL;
-      /* Safety, cleanup the pointer to satisfy MDL assertions. */
-      hash_tables->mdl_request.ticket= NULL;
-    }
-    DBUG_PRINT("exit",("ERROR"));
-    DBUG_RETURN(TRUE);
+    /* copy data to sql_handler */
+    if (!(sql_handler= new SQL_HANDLER(thd)))
+      goto err;
+    init_alloc_root(&sql_handler->mem_root, 1024, 0);
+
+    sql_handler->db.length= strlen(tables->db);
+    sql_handler->table_name.length= strlen(tables->table_name);
+    sql_handler->handler_name.length= strlen(tables->alias);
+
+    if (!(my_multi_malloc(MY_WME,
+                          &sql_handler->db.str,
+                          (uint) sql_handler->db.length + 1,
+                          &sql_handler->table_name.str,
+                          (uint) sql_handler->table_name.length + 1,
+                          &sql_handler->handler_name.str,
+                          (uint) sql_handler->handler_name.length + 1,
+                          NullS)))
+      goto err;
+    sql_handler->base_data= sql_handler->db.str;  // Free this
+    memcpy(sql_handler->db.str, tables->db, sql_handler->db.length +1);
+    memcpy(sql_handler->table_name.str, tables->table_name,
+           sql_handler->table_name.length+1);
+    memcpy(sql_handler->handler_name.str, tables->alias,
+           sql_handler->handler_name.length +1);
+
+    /* add to hash */
+    if (my_hash_insert(&thd->handler_tables_hash, (uchar*) sql_handler))
+      goto err;
   }
+  else
+  {
+    sql_handler= reopen;
+    sql_handler->reset();
+  }    
+  sql_handler->table= table;
+  memcpy(&sql_handler->mdl_request, &tables->mdl_request,
+         sizeof(tables->mdl_request));
+
+  if (!(sql_handler->lock= get_lock_data(thd, &sql_handler->table, 1,
+                                         GET_LOCK_STORE_LOCKS)))
+    goto err;
+
+  /* Get a list of all fields for send_fields */
+  thd->set_n_backup_active_arena(&sql_handler->arena, &backup_arena);
+  error= table->fill_item_list(&sql_handler->fields);
+  thd->restore_active_arena(&sql_handler->arena, &backup_arena);
+
+  if (error)
+    goto err;
+
+  /* Always read all columns */
+  table->read_set= &table->s->all_set;
+  table->vcol_set= &table->s->all_set;
+
+  /* Restore the state. */
   thd->set_open_tables(backup_open_tables);
-  if (hash_tables->mdl_request.ticket)
+  if (sql_handler->mdl_request.ticket)
   {
-    thd->mdl_context.set_lock_duration(hash_tables->mdl_request.ticket,
+    thd->mdl_context.set_lock_duration(sql_handler->mdl_request.ticket,
                                        MDL_EXPLICIT);
     thd->mdl_context.set_needs_thr_lock_abort(TRUE);
   }
@@ -345,19 +389,42 @@ bool mysql_ha_open(THD *thd, TABLE_LIST *tables, bool reopen)
     was opened for HANDLER as it is used to link them together
     (see thd->temporary_tables).
   */
-  DBUG_ASSERT(hash_tables->table->next == NULL ||
-              hash_tables->table->s->tmp_table);
+  DBUG_ASSERT(sql_handler->table->next == NULL ||
+              sql_handler->table->s->tmp_table);
   /*
     If it's a temp table, don't reset table->query_id as the table is
     being used by this handler. For non-temp tables we use this flag
     in asserts.
   */
-  hash_tables->table->open_by_handler= 1;
+  table->open_by_handler= 1;
+
+  /* Safety, cleanup the pointer to satisfy MDL assertions. */
+  tables->mdl_request.ticket= NULL;
 
   if (! reopen)
     my_ok(thd);
   DBUG_PRINT("exit",("OK"));
   DBUG_RETURN(FALSE);
+
+err:
+  /*
+    No need to rollback statement transaction, it's not started.
+    If called with reopen flag, no need to rollback either,
+    it will be done at statement end.
+  */
+  DBUG_ASSERT(thd->transaction.stmt.is_empty());
+  close_thread_tables(thd);
+  thd->mdl_context.rollback_to_savepoint(mdl_savepoint);
+  thd->set_open_tables(backup_open_tables);
+  if (sql_handler)
+  {
+    if (!reopen)
+      my_hash_delete(&thd->handler_tables_hash, (uchar*) sql_handler);
+    else
+      sql_handler->reset(); // or should it be init() ?
+  }
+  DBUG_PRINT("exit",("ERROR"));
+  DBUG_RETURN(TRUE);
 }
 
 
@@ -380,7 +447,7 @@ bool mysql_ha_open(THD *thd, TABLE_LIST *tables, bool reopen)
 
 bool mysql_ha_close(THD *thd, TABLE_LIST *tables)
 {
-  TABLE_LIST    *hash_tables;
+  SQL_HANDLER *handler;
   DBUG_ENTER("mysql_ha_close");
   DBUG_PRINT("enter",("'%s'.'%s' as '%s'",
                       tables->db, tables->table_name, tables->alias));
@@ -390,12 +457,12 @@ bool mysql_ha_close(THD *thd, TABLE_LIST *tables)
     my_error(ER_LOCK_OR_ACTIVE_TRANSACTION, MYF(0));
     DBUG_RETURN(TRUE);
   }
-  if ((hash_tables= (TABLE_LIST*) my_hash_search(&thd->handler_tables_hash,
+  if ((handler= (SQL_HANDLER*) my_hash_search(&thd->handler_tables_hash,
                                                  (uchar*) tables->alias,
                                                  strlen(tables->alias) + 1)))
   {
-    mysql_ha_close_table(thd, hash_tables);
-    my_hash_delete(&thd->handler_tables_hash, (uchar*) hash_tables);
+    mysql_ha_close_table(handler);
+    my_hash_delete(&thd->handler_tables_hash, (uchar*) handler);
   }
   else
   {
@@ -467,6 +534,167 @@ handle_condition(THD *thd,
 }
 
 
+/**
+   Finds an open HANDLER table.
+
+   @param name		Name of handler to find
+
+   @return 0 failure
+   @return handler
+*/  
+
+SQL_HANDLER *mysql_ha_find_handler(THD *thd, const char *name)
+{
+  SQL_HANDLER *handler;
+  if ((handler= (SQL_HANDLER*) my_hash_search(&thd->handler_tables_hash,
+                                              (uchar*) name, strlen(name) + 1)))
+  {
+    DBUG_PRINT("info-in-hash",("'%s'.'%s' as '%s' table: %p",
+                               handler->db.str,
+                               handler->table_name.str,
+                               handler->handler_name.str, handler->table));
+    if (!handler->table)
+    {
+      /* The handler table has been closed. Re-open it. */
+      TABLE_LIST tmp;
+      tmp.init_one_table(handler->db.str, handler->db.length,
+                         handler->table_name.str, handler->table_name.length,
+                         handler->handler_name.str, TL_READ);
+
+      if (mysql_ha_open(thd, &tmp, handler))
+      {
+        DBUG_PRINT("exit",("reopen failed"));
+        return 0;
+      }
+    }
+  }
+  else
+  {
+    my_error(ER_UNKNOWN_TABLE, MYF(0), name, "HANDLER");
+    return 0;
+  }
+  return handler;
+}
+
+
+/**
+   Check that condition and key name are ok
+
+   @param handler	Handler whose table is checked; keyno and mode are stored here
+   @param mode		Read mode (RFIRST, RNEXT etc...)
+   @param keyname	Key to use.
+   @param key_expr      List of key column values
+   @param cond		Where clause
+   @param in_prepare	If we are in prepare phase (we can't evaluate items yet)
+
+   @return 0 ok
+   @return 1 error
+
+   If ok, the values of the used key and mode are stored in sql_handler
+*/
+
+static bool
+mysql_ha_fix_cond_and_key(SQL_HANDLER *handler, 
+                          enum enum_ha_read_modes mode, char *keyname,
+                          List<Item> *key_expr,
+                          Item *cond, bool in_prepare)
+{
+  THD *thd= handler->thd;
+  TABLE *table= handler->table;
+  if (cond)
+  {
+    /* This can only be true for temp tables */
+    if (table->query_id != thd->query_id)
+      cond->cleanup();                          // File was reopened
+    if ((!cond->fixed &&
+	 cond->fix_fields(thd, &cond)) || cond->check_cols(1))
+      return 1;
+  }
+
+  if (keyname)
+  {
+    /* Check if same as last keyname. If not, do a full lookup */
+    if (handler->keyno < 0 ||
+        my_strcasecmp(&my_charset_latin1,
+                      keyname,
+                      table->s->key_info[handler->keyno].name))
+    {
+      if ((handler->keyno= find_type(keyname, &table->s->keynames,
+                                     FIND_TYPE_NO_PREFIX) - 1) < 0)
+      {
+        my_error(ER_KEY_DOES_NOT_EXITS, MYF(0), keyname,
+                 handler->handler_name.str);
+        return 1;
+      }
+    }
+
+    /* Check key parts */
+    if (mode == RKEY)
+    {
+      TABLE *table= handler->table;
+      KEY *keyinfo= table->key_info + handler->keyno;
+      KEY_PART_INFO *key_part= keyinfo->key_part;
+      List_iterator<Item> it_ke(*key_expr);
+      Item *item;
+      key_part_map keypart_map;
+      uint key_len;
+
+      if (key_expr->elements > keyinfo->key_parts)
+      {
+        my_error(ER_TOO_MANY_KEY_PARTS, MYF(0), keyinfo->key_parts);
+        return 1;
+      }
+      for (keypart_map= key_len=0 ; (item=it_ke++) ; key_part++)
+      {
+        my_bitmap_map *old_map;
+	/* note that 'item' can be changed by fix_fields() call */
+        if ((!item->fixed &&
+             item->fix_fields(thd, it_ke.ref())) ||
+	    (item= *it_ke.ref())->check_cols(1))
+          return 1;
+	if (item->used_tables() & ~(RAND_TABLE_BIT | PARAM_TABLE_BIT))
+        {
+          my_error(ER_WRONG_ARGUMENTS,MYF(0),"HANDLER ... READ");
+	  return 1;
+        }
+        if (!in_prepare)
+        {
+          old_map= dbug_tmp_use_all_columns(table, table->write_set);
+          (void) item->save_in_field(key_part->field, 1);
+          dbug_tmp_restore_column_map(table->write_set, old_map);
+        }
+        key_len+= key_part->store_length;
+        keypart_map= (keypart_map << 1) | 1;
+      }
+      handler->keypart_map= keypart_map;
+      handler->key_len= key_len;
+    }
+    else
+    {
+      /*
+        Check if the same index involved.
+        We need to always do this check because we may not have yet
+        called the handler since the last keyno change.
+      */
+      if ((uint) handler->keyno != table->file->get_index())
+      {
+        if (mode == RNEXT)
+          mode= RFIRST;
+        else if (mode == RPREV)
+          mode= RLAST;
+      }
+    }
+  }
+  else if (table->file->inited != handler::RND)
+  {
+    /* Convert RNEXT to RFIRST if we haven't started row scan */
+    if (mode == RNEXT)
+      mode= RFIRST;
+  }
+  handler->mode= mode;                          // Store adjusted mode
+  return 0;
+}
+
 /*
   Read from a HANDLER table.
 
@@ -493,17 +721,14 @@ bool mysql_ha_read(THD *thd, TABLE_LIST *tables,
                    enum ha_rkey_function ha_rkey_mode, Item *cond,
                    ha_rows select_limit_cnt, ha_rows offset_limit_cnt)
 {
-  TABLE_LIST    *hash_tables;
-  TABLE         *table, *backup_open_tables;
-  MYSQL_LOCK    *lock;
-  List<Item>	list;
+  SQL_HANDLER   *handler;
+  TABLE         *table;
   Protocol	*protocol= thd->protocol;
   char		buff[MAX_FIELD_WIDTH];
   String	buffer(buff, sizeof(buff), system_charset_info);
-  int           error, keyno= -1;
+  int           error, keyno;
   uint          num_rows;
   uchar		*UNINIT_VAR(key);
-  uint		UNINIT_VAR(key_len);
   Sql_handler_lock_error_handler sql_handler_lock_error;
   DBUG_ENTER("mysql_ha_read");
   DBUG_PRINT("enter",("'%s'.'%s' as '%s'",
@@ -515,125 +740,81 @@ bool mysql_ha_read(THD *thd, TABLE_LIST *tables,
     DBUG_RETURN(TRUE);
   }
 
-  thd->lex->select_lex.context.resolve_in_table_list_only(tables);
-  list.push_front(new Item_field(&thd->lex->select_lex.context,
-                                 NULL, NULL, "*"));
-  List_iterator<Item> it(list);
-  it++;
-
 retry:
-  if ((hash_tables= (TABLE_LIST*) my_hash_search(&thd->handler_tables_hash,
-                                                 (uchar*) tables->alias,
-                                                 strlen(tables->alias) + 1)))
-  {
-    table= hash_tables->table;
-    DBUG_PRINT("info-in-hash",("'%s'.'%s' as '%s' table: 0x%lx",
-                               hash_tables->db, hash_tables->table_name,
-                               hash_tables->alias, (long) table));
-    if (!table)
-    {
-      /*
-        The handler table has been closed. Re-open it.
-      */
-      if (mysql_ha_open(thd, hash_tables, 1))
-      {
-        DBUG_PRINT("exit",("reopen failed"));
-        goto err0;
-      }
+  if (!(handler= mysql_ha_find_handler(thd, tables->alias)))
+    goto err0;
 
-      table= hash_tables->table;
-      DBUG_PRINT("info",("re-opened '%s'.'%s' as '%s' tab %p",
-                         hash_tables->db, hash_tables->table_name,
-                         hash_tables->alias, table));
-    }
-  }
-  else
-    table= NULL;
+  table= handler->table;
+  tables->table= table;                         // This is used by fix_fields
+  table->pos_in_table_list= tables;
 
-  if (!table)
+  if (handler->lock->lock_count > 0)
   {
-    my_error(ER_UNKNOWN_TABLE, MYF(0), tables->alias, "HANDLER");
-    goto err0;
-  }
+    bool lock_error;
 
-  /* save open_tables state */
-  backup_open_tables= thd->open_tables;
-  /* Always a one-element list, see mysql_ha_open(). */
-  DBUG_ASSERT(hash_tables->table->next == NULL ||
-              hash_tables->table->s->tmp_table);
-  /*
-    mysql_lock_tables() needs thd->open_tables to be set correctly to
-    be able to handle aborts properly.
-  */
-  thd->set_open_tables(hash_tables->table);
+    handler->lock->locks[0]->type= handler->lock->locks[0]->org_type;
 
+    /* save open_tables state */
+    TABLE* backup_open_tables= thd->open_tables;
+    /* Always a one-element list, see mysql_ha_open(). */
+    DBUG_ASSERT(table->next == NULL || table->s->tmp_table);
+    /*
+      mysql_lock_tables() needs thd->open_tables to be set correctly to
+      be able to handle aborts properly.
+    */
+    thd->set_open_tables(table);
 
-  sql_handler_lock_error.init();
-  thd->push_internal_handler(&sql_handler_lock_error);
+    sql_handler_lock_error.init();
+    thd->push_internal_handler(&sql_handler_lock_error);
 
-  lock= mysql_lock_tables(thd, &thd->open_tables, 1, 0);
+    lock_error= mysql_lock_tables(thd, handler->lock,
+                                  (table->s->tmp_table == NO_TMP_TABLE ?
+                                    MYSQL_LOCK_NOT_TEMPORARY : 0));
 
-  thd->pop_internal_handler();
-  /*
-    In 5.1 and earlier, mysql_lock_tables() could replace the TABLE
-    object with another one (reopen it). This is no longer the case
-    with new MDL.
-  */
-  DBUG_ASSERT(hash_tables->table == thd->open_tables);
-  /* Restore previous context. */
-  thd->set_open_tables(backup_open_tables);
+    thd->pop_internal_handler();
 
-  if (sql_handler_lock_error.need_reopen())
-  {
-    DBUG_ASSERT(!lock && !thd->is_error());
     /*
-      Always close statement transaction explicitly,
-      so that the engine doesn't have to count locks.
+      In 5.1 and earlier, mysql_lock_tables() could replace the TABLE
+      object with another one (reopen it). This is no longer the case
+      with new MDL.
     */
-    trans_rollback_stmt(thd);
-    mysql_ha_close_table(thd, hash_tables);
-    goto retry;
-  }
-
-  if (!lock)
-    goto err0; // mysql_lock_tables() printed error message already
-
-  // Always read all columns
-  hash_tables->table->read_set= &hash_tables->table->s->all_set;
-  tables->table= hash_tables->table;
-
-  if (cond)
-  {
-    if (table->query_id != thd->query_id)
-      cond->cleanup();                          // File was reopened
-    if ((!cond->fixed &&
-	 cond->fix_fields(thd, &cond)) || cond->check_cols(1))
-      goto err;
-  }
+    DBUG_ASSERT(table == thd->open_tables);
+    /* Restore previous context. */
+    thd->set_open_tables(backup_open_tables);
 
-  if (keyname)
-  {
-    if ((keyno= find_type(keyname, &table->s->keynames,
-                          FIND_TYPE_NO_PREFIX) - 1) < 0)
+    if (sql_handler_lock_error.need_reopen())
     {
-      my_error(ER_KEY_DOES_NOT_EXITS, MYF(0), keyname, tables->alias);
-      goto err;
-    }
-    /* Check if the same index involved. */
-    if ((uint) keyno != table->file->get_index())
-    {
-      if (mode == RNEXT)
-        mode= RFIRST;
-      else if (mode == RPREV)
-        mode= RLAST;
+      DBUG_ASSERT(lock_error && !thd->is_error());
+      /*
+        Always close statement transaction explicitly,
+        so that the engine doesn't have to count locks.
+      */
+      trans_rollback_stmt(thd);
+      mysql_ha_close_table(handler);
+      if (thd->stmt_arena->is_stmt_execute())
+      {
+        /*
+          As we have already sent field list and types to the client, we can't
+          handle any changes in the table format for prepared statements.
+          Better to force a reprepare.
+        */
+        my_error(ER_NEED_REPREPARE, MYF(0));
+        goto err0;
+      }
+      goto retry;
     }
+
+    if (lock_error)
+      goto err0; // mysql_lock_tables() printed error message already
   }
 
-  if (insert_fields(thd, &thd->lex->select_lex.context,
-                    tables->db, tables->alias, &it, 0))
+  if (mysql_ha_fix_cond_and_key(handler, mode, keyname, key_expr, cond, 0))
     goto err;
+  mode= handler->mode;
+  keyno= handler->keyno;
 
-  protocol->send_result_set_metadata(&list, Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF);
+  protocol->send_result_set_metadata(&handler->fields,
+                                Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF);
 
   /*
     In ::external_lock InnoDB resets the fields which tell it that
@@ -649,6 +830,8 @@ retry:
     case RNEXT:
       if (table->file->inited != handler::NONE)
       {
+        if ((error= table->file->can_continue_handler_scan()))
+          break;
         if (keyname)
         {
           /* Check if we read from the same index. */
@@ -656,9 +839,7 @@ retry:
           error= table->file->ha_index_next(table->record[0]);
         }
         else
-        {
           error= table->file->ha_rnd_next(table->record[0]);
-        }
         break;
       }
       /* else fall through */
@@ -675,7 +856,7 @@ retry:
 	if (!(error= table->file->ha_rnd_init(1)))
           error= table->file->ha_rnd_next(table->record[0]);
       }
-      mode=RNEXT;
+      mode= RNEXT;
       break;
     case RPREV:
       DBUG_ASSERT(keyname != 0);
@@ -683,7 +864,9 @@ retry:
       DBUG_ASSERT((uint) keyno == table->file->get_index());
       if (table->file->inited != handler::NONE)
       {
-        error=table->file->ha_index_prev(table->record[0]);
+        if ((error= table->file->can_continue_handler_scan()))
+          break;
+        error= table->file->ha_index_prev(table->record[0]);
         break;
       }
       /* else fall through */
@@ -692,54 +875,28 @@ retry:
       table->file->ha_index_or_rnd_end();
       table->file->ha_index_init(keyno, 1);
       error= table->file->ha_index_last(table->record[0]);
-      mode=RPREV;
+      mode= RPREV;
       break;
     case RNEXT_SAME:
       /* Continue scan on "(keypart1,keypart2,...)=(c1, c2, ...)  */
       DBUG_ASSERT(keyname != 0);
-      error= table->file->ha_index_next_same(table->record[0], key, key_len);
+      error= table->file->ha_index_next_same(table->record[0], key,
+                                             handler->key_len);
       break;
     case RKEY:
     {
       DBUG_ASSERT(keyname != 0);
-      KEY *keyinfo=table->key_info+keyno;
-      KEY_PART_INFO *key_part=keyinfo->key_part;
-      if (key_expr->elements > keyinfo->key_parts)
-      {
-	my_error(ER_TOO_MANY_KEY_PARTS, MYF(0), keyinfo->key_parts);
-	goto err;
-      }
-      List_iterator<Item> it_ke(*key_expr);
-      Item *item;
-      key_part_map keypart_map;
-      for (keypart_map= key_len=0 ; (item=it_ke++) ; key_part++)
-      {
-        my_bitmap_map *old_map;
-	// 'item' can be changed by fix_fields() call
-        if ((!item->fixed &&
-             item->fix_fields(thd, it_ke.ref())) ||
-	    (item= *it_ke.ref())->check_cols(1))
-	  goto err;
-	if (item->used_tables() & ~RAND_TABLE_BIT)
-        {
-          my_error(ER_WRONG_ARGUMENTS,MYF(0),"HANDLER ... READ");
-	  goto err;
-        }
-        old_map= dbug_tmp_use_all_columns(table, table->write_set);
-	(void) item->save_in_field(key_part->field, 1);
-        dbug_tmp_restore_column_map(table->write_set, old_map);
-	key_len+=key_part->store_length;
-        keypart_map= (keypart_map << 1) | 1;
-      }
 
-      if (!(key= (uchar*) thd->calloc(ALIGN_SIZE(key_len))))
+      if (!(key= (uchar*) thd->calloc(ALIGN_SIZE(handler->key_len))))
 	goto err;
       table->file->ha_index_or_rnd_end();
       table->file->ha_index_init(keyno, 1);
-      key_copy(key, table->record[0], table->key_info + keyno, key_len);
+      key_copy(key, table->record[0], table->key_info + keyno,
+               handler->key_len);
       error= table->file->ha_index_read_map(table->record[0],
-                                            key, keypart_map, ha_rkey_mode);
-      mode=rkey_to_rnext[(int)ha_rkey_mode];
+                                            key, handler->keypart_map,
+                                            ha_rkey_mode);
+      mode= rkey_to_rnext[(int)ha_rkey_mode];
       break;
     }
     default:
@@ -753,9 +910,13 @@ retry:
         continue;
       if (error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE)
       {
-        sql_print_error("mysql_ha_read: Got error %d when reading table '%s'",
-                        error, tables->table_name);
+        /* Don't give error in the log file for some expected problems */
+        if (error != HA_ERR_RECORD_CHANGED && error != HA_ERR_WRONG_COMMAND)
+          sql_print_error("mysql_ha_read: Got error %d when reading "
+                          "table '%s'",
+                          error, tables->table_name);
         table->file->print_error(error,MYF(0));
+        table->file->ha_index_or_rnd_end();
         goto err;
       }
       goto ok;
@@ -772,7 +933,7 @@ retry:
     {
       protocol->prepare_for_resend();
 
-      if (protocol->send_result_set_row(&list))
+      if (protocol->send_result_set_row(&handler->fields))
         goto err;
 
       protocol->write();
@@ -785,14 +946,14 @@ ok:
     so that the engine doesn't have to count locks.
   */
   trans_commit_stmt(thd);
-  mysql_unlock_tables(thd,lock);
+  mysql_unlock_tables(thd, handler->lock, 0);
   my_eof(thd);
   DBUG_PRINT("exit",("OK"));
   DBUG_RETURN(FALSE);
 
 err:
   trans_rollback_stmt(thd);
-  mysql_unlock_tables(thd,lock);
+  mysql_unlock_tables(thd, handler->lock, 0);
 err0:
   DBUG_PRINT("exit",("ERROR"));
   DBUG_RETURN(TRUE);
@@ -800,6 +961,28 @@ err0:
 
 
 /**
+   Prepare for handler read
+
+   For parameters, see mysql_ha_read()
+*/
+
+SQL_HANDLER *mysql_ha_read_prepare(THD *thd, TABLE_LIST *tables,
+                                   enum enum_ha_read_modes mode, char *keyname,
+                                   List<Item> *key_expr, Item *cond)
+{
+  DBUG_ENTER("mysql_ha_read_prepare");
+  SQL_HANDLER *found= mysql_ha_find_handler(thd, tables->alias);
+  if (!found)
+    DBUG_RETURN(0);
+  tables->table= found->table;           // fix_fields needs the table set
+  /* A failure while fixing the condition or key also yields 0 */
+  DBUG_RETURN(mysql_ha_fix_cond_and_key(found, mode, keyname, key_expr,
+                                        cond, 1) ? (SQL_HANDLER*) 0 : found);
+}
+
+  
+
+/**
   Scan the handler tables hash for matching tables.
 
   @param thd Thread identifier.
@@ -810,30 +993,32 @@ err0:
           table was matched.
 */
 
-static TABLE_LIST *mysql_ha_find(THD *thd, TABLE_LIST *tables)
+static SQL_HANDLER *mysql_ha_find_match(THD *thd, TABLE_LIST *tables)
 {
-  TABLE_LIST *hash_tables, *head= NULL, *first= tables;
-  DBUG_ENTER("mysql_ha_find");
+  SQL_HANDLER *hash_tables, *head= NULL;
+  TABLE_LIST *first= tables;
+  DBUG_ENTER("mysql_ha_find_match");
 
   /* search for all handlers with matching table names */
   for (uint i= 0; i < thd->handler_tables_hash.records; i++)
   {
-    hash_tables= (TABLE_LIST*) my_hash_element(&thd->handler_tables_hash, i);
+    hash_tables= (SQL_HANDLER*) my_hash_element(&thd->handler_tables_hash, i);
+
     for (tables= first; tables; tables= tables->next_local)
     {
       if ((! *tables->db ||
-          ! my_strcasecmp(&my_charset_latin1, hash_tables->db, tables->db)) &&
-          ! my_strcasecmp(&my_charset_latin1, hash_tables->table_name,
+          ! my_strcasecmp(&my_charset_latin1, hash_tables->db.str,
+                          tables->db)) &&
+          ! my_strcasecmp(&my_charset_latin1, hash_tables->table_name.str,
                           tables->table_name))
+      {
+        /* Link into hash_tables list */
+        hash_tables->next= head;
+        head= hash_tables;
         break;
-    }
-    if (tables)
-    {
-      hash_tables->next_local= head;
-      head= hash_tables;
+      }
     }
   }
-
   DBUG_RETURN(head);
 }
 
@@ -849,18 +1034,18 @@ static TABLE_LIST *mysql_ha_find(THD *thd, TABLE_LIST *tables)
 
 void mysql_ha_rm_tables(THD *thd, TABLE_LIST *tables)
 {
-  TABLE_LIST *hash_tables, *next;
+  SQL_HANDLER *hash_tables, *next;
   DBUG_ENTER("mysql_ha_rm_tables");
 
   DBUG_ASSERT(tables);
 
-  hash_tables= mysql_ha_find(thd, tables);
+  hash_tables= mysql_ha_find_match(thd, tables);
 
   while (hash_tables)
   {
-    next= hash_tables->next_local;
+    next= hash_tables->next;
     if (hash_tables->table)
-      mysql_ha_close_table(thd, hash_tables);
+      mysql_ha_close_table(hash_tables);
     my_hash_delete(&thd->handler_tables_hash, (uchar*) hash_tables);
     hash_tables= next;
   }
@@ -890,13 +1075,13 @@ void mysql_ha_flush_tables(THD *thd, TABLE_LIST *all_tables)
   for (TABLE_LIST *table_list= all_tables; table_list;
        table_list= table_list->next_global)
   {
-    TABLE_LIST *hash_tables= mysql_ha_find(thd, table_list);
+    SQL_HANDLER *hash_tables= mysql_ha_find_match(thd, table_list);
     /* Close all aliases of the same table. */
     while (hash_tables)
     {
-      TABLE_LIST *next_local= hash_tables->next_local;
+      SQL_HANDLER *next_local= hash_tables->next;
       if (hash_tables->table)
-        mysql_ha_close_table(thd, hash_tables);
+        mysql_ha_close_table(hash_tables);
       hash_tables= next_local;
     }
   }
@@ -916,7 +1101,7 @@ void mysql_ha_flush_tables(THD *thd, TABLE_LIST *all_tables)
 
 void mysql_ha_flush(THD *thd)
 {
-  TABLE_LIST *hash_tables;
+  SQL_HANDLER *hash_tables;
   DBUG_ENTER("mysql_ha_flush");
 
   mysql_mutex_assert_not_owner(&LOCK_open);
@@ -931,7 +1116,7 @@ void mysql_ha_flush(THD *thd)
 
   for (uint i= 0; i < thd->handler_tables_hash.records; i++)
   {
-    hash_tables= (TABLE_LIST*) my_hash_element(&thd->handler_tables_hash, i);
+    hash_tables= (SQL_HANDLER*) my_hash_element(&thd->handler_tables_hash, i);
     /*
       TABLE::mdl_ticket is 0 for temporary tables so we need extra check.
     */
@@ -940,7 +1125,7 @@ void mysql_ha_flush(THD *thd)
          hash_tables->table->mdl_ticket->has_pending_conflicting_lock()) ||
          (!hash_tables->table->s->tmp_table &&
           hash_tables->table->s->has_old_version())))
-      mysql_ha_close_table(thd, hash_tables);
+      mysql_ha_close_table(hash_tables);
   }
 
   DBUG_VOID_RETURN;
@@ -957,14 +1142,14 @@ void mysql_ha_flush(THD *thd)
 
 void mysql_ha_cleanup(THD *thd)
 {
-  TABLE_LIST *hash_tables;
+  SQL_HANDLER *hash_tables;
   DBUG_ENTER("mysql_ha_cleanup");
 
   for (uint i= 0; i < thd->handler_tables_hash.records; i++)
   {
-    hash_tables= (TABLE_LIST*) my_hash_element(&thd->handler_tables_hash, i);
+    hash_tables= (SQL_HANDLER*) my_hash_element(&thd->handler_tables_hash, i);
     if (hash_tables->table)
-      mysql_ha_close_table(thd, hash_tables);
+      mysql_ha_close_table(hash_tables);
   }
 
   my_hash_free(&thd->handler_tables_hash);
@@ -982,12 +1167,12 @@ void mysql_ha_cleanup(THD *thd)
 
 void mysql_ha_set_explicit_lock_duration(THD *thd)
 {
-  TABLE_LIST *hash_tables;
+  SQL_HANDLER *hash_tables;
   DBUG_ENTER("mysql_ha_set_explicit_lock_duration");
 
   for (uint i= 0; i < thd->handler_tables_hash.records; i++)
   {
-    hash_tables= (TABLE_LIST*) my_hash_element(&thd->handler_tables_hash, i);
+    hash_tables= (SQL_HANDLER*) my_hash_element(&thd->handler_tables_hash, i);
     if (hash_tables->table && hash_tables->table->mdl_ticket)
       thd->mdl_context.set_lock_duration(hash_tables->table->mdl_ticket,
                                          MDL_EXPLICIT);
diff --git a/sql/sql_handler.h b/sql/sql_handler.h
index 2eea552d7c9..133f553675e 100644
--- a/sql/sql_handler.h
+++ b/sql/sql_handler.h
@@ -1,5 +1,6 @@
-/* Copyright 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
-
+#ifndef SQL_HANDLER_INCLUDED
+#define SQL_HANDLER_INCLUDED
+/* Copyright (C) 2010 Monty Program Ab
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; version 2 of the License.
@@ -13,17 +14,57 @@
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA */
 
-#ifndef SQL_HANDLER_INCLUDED
-#define SQL_HANDLER_INCLUDED
+#ifdef USE_PRAGMA_INTERFACE
+#pragma interface			/* gcc class implementation */
+#endif
 
 #include "sql_class.h"                 /* enum_ha_read_mode */
 #include "my_base.h"                   /* ha_rkey_function, ha_rows */
 #include "sql_list.h"                  /* List */
 
+/* Open handlers are stored here; elements of THD::handler_tables_hash */
+
+class SQL_HANDLER {
+public:
+  TABLE *table;                                 /* 0 after init() */
+  List<Item> fields;                            /* Fields, set on open */
+  THD *thd;                                     /* Set by the constructor */
+  LEX_STRING handler_name;
+  LEX_STRING db;                                /* Compared in find_match */
+  LEX_STRING table_name;                        /* Compared in find_match */
+  MEM_ROOT mem_root;                            /* Backs 'arena' below */
+  MYSQL_LOCK *lock;                             /* For mysql_unlock_tables */
+  MDL_request mdl_request;                      /* ticket is 0 after init() */
+
+  key_part_map keypart_map;                     /* For ha_index_read_map() */
+  int keyno;                                    /* Used key, -1 after init() */
+  uint key_len;                                 /* Size of the key buffer */
+  enum enum_ha_read_modes mode;
+
+  /* Chains the handlers collected by mysql_ha_find_match() */
+  SQL_HANDLER *next;
+
+  Query_arena arena;                            /* Constructed on mem_root */
+  char *base_data;                              /* 0 after construction */
+  SQL_HANDLER(THD *thd_arg) :
+    thd(thd_arg), arena(&mem_root, Query_arena::STMT_INITIALIZED)
+  { init(); clear_alloc_root(&mem_root); base_data= 0; }
+  void init()                                   /* Clear key, table and locks */
+  {
+    keyno= -1;
+    table= 0;
+    lock= 0;
+    mdl_request.ticket= 0;
+  }
+  void reset();
+
+  ~SQL_HANDLER();
+};
+
 class THD;
 struct TABLE_LIST;
 
-bool mysql_ha_open(THD *thd, TABLE_LIST *tables, bool reopen);
+bool mysql_ha_open(THD *thd, TABLE_LIST *tables, SQL_HANDLER *reopen);
 bool mysql_ha_close(THD *thd, TABLE_LIST *tables);
 bool mysql_ha_read(THD *, TABLE_LIST *,enum enum_ha_read_modes,char *,
                    List<Item> *,enum ha_rkey_function,Item *,ha_rows,ha_rows);
@@ -33,4 +74,7 @@ void mysql_ha_rm_tables(THD *thd, TABLE_LIST *tables);
 void mysql_ha_cleanup(THD *thd);
 void mysql_ha_set_explicit_lock_duration(THD *thd);
 
-#endif /* SQL_HANDLER_INCLUDED */
+SQL_HANDLER *mysql_ha_read_prepare(THD *thd, TABLE_LIST *tables,
+                                   enum enum_ha_read_modes mode, char *keyname,
+                                   List<Item> *key_expr, Item *cond);
+#endif
diff --git a/sql/sql_help.cc b/sql/sql_help.cc
index 9b2295f6fb3..56a16d9b668 100644
--- a/sql/sql_help.cc
+++ b/sql/sql_help.cc
@@ -645,7 +645,7 @@ bool mysqld_help(THD *thd, const char *mask)
   Protocol *protocol= thd->protocol;
   SQL_SELECT *select;
   st_find_field used_fields[array_elements(init_used_fields)];
-  TABLE_LIST *leaves= 0;
+  List<TABLE_LIST> leaves;
   TABLE_LIST tables[4];
   List<String> topics_list, categories_list, subcategories_list;
   String name, description, example;
@@ -691,7 +691,7 @@ bool mysqld_help(THD *thd, const char *mask)
     thd->lex->select_lex.context.first_name_resolution_table= &tables[0];
   if (setup_tables(thd, &thd->lex->select_lex.context,
                    &thd->lex->select_lex.top_join_list,
-                   tables, &leaves, FALSE))
+                   tables, leaves, FALSE, FALSE))
     goto error;
   memcpy((char*) used_fields, (char*) init_used_fields, sizeof(used_fields));
   if (init_fields(thd, tables, used_fields, array_elements(used_fields)))
diff --git a/sql/sql_insert.cc b/sql/sql_insert.cc
index d807ac36278..a1c016b204e 100644
--- a/sql/sql_insert.cc
+++ b/sql/sql_insert.cc
@@ -74,6 +74,7 @@
 #include "rpl_mi.h"
 #include "transaction.h"
 #include "sql_audit.h"
+#include "sql_derived.h"                        // mysql_handle_derived
 
 #ifndef EMBEDDED_LIBRARY
 static bool delayed_get_table(THD *thd, MDL_request *grl_protection_request,
@@ -128,7 +129,7 @@ bool check_view_single_update(List<Item> &fields, List<Item> *values,
   {
     it.init(*values);
     while ((item= it++))
-      tables|= item->used_tables();
+      tables|= item->view_used_tables(view);
   }
 
   /* Convert to real table bits */
@@ -145,6 +146,11 @@ bool check_view_single_update(List<Item> &fields, List<Item> *values,
   if (view->check_single_table(&tbl, tables, view) || tbl == 0)
     goto error;
 
+  /*
+    A buffer for the insert values was allocated for the merged view.
+    Use it.
+  */
+  tbl->table->insert_values= view->table->insert_values;
   view->table= tbl->table;
   *map= tables;
 
@@ -248,6 +254,10 @@ static int check_insert_fields(THD *thd, TABLE_LIST *table_list,
     */
     table_list->next_local= 0;
     context->resolve_in_table_list_only(table_list);
+    /* 'Unfix' fields to allow correct marking by the setup_fields function. */
+    if (table_list->is_view())
+      unfix_fields(fields);
+
     res= setup_fields(thd, 0, fields, MARK_COLUMNS_WRITE, 0, 0);
 
     /* Restore the current context. */
@@ -257,7 +267,7 @@ static int check_insert_fields(THD *thd, TABLE_LIST *table_list,
     if (res)
       return -1;
 
-    if (table_list->effective_algorithm == VIEW_ALGORITHM_MERGE)
+    if (table_list->is_view() && table_list->is_merged_derived())
     {
       if (check_view_single_update(fields,
                                    fields_and_values_from_different_maps ?
@@ -346,7 +356,8 @@ static int check_update_fields(THD *thd, TABLE_LIST *insert_table_list,
   if (setup_fields(thd, 0, update_fields, MARK_COLUMNS_WRITE, 0, 0))
     return -1;
 
-  if (insert_table_list->effective_algorithm == VIEW_ALGORITHM_MERGE &&
+  if (insert_table_list->is_view() &&
+      insert_table_list->is_merged_derived() &&
       check_view_single_update(update_fields, &update_values,
                                insert_table_list, map))
     return -1;
@@ -688,8 +699,7 @@ bool mysql_insert(THD *thd,TABLE_LIST *table_list,
   /*
     We can't write-delayed into a table locked with LOCK TABLES:
     this will lead to a deadlock, since the delayed thread will
-    never be able to get a lock on the table. QQQ: why not
-    upgrade the lock here instead?
+    never be able to get a lock on the table.
   */
   if (table_list->lock_type == TL_WRITE_DELAYED &&
       thd->locked_tables_mode &&
@@ -700,6 +710,12 @@ bool mysql_insert(THD *thd,TABLE_LIST *table_list,
              table_list->table_name);
     DBUG_RETURN(TRUE);
   }
+  /*
+    mark the table_list as a target for insert, to skip the DT/view prepare phase 
+    for correct access rights checks
+    TODO: remove this hack
+  */
+  table_list->skip_prepare_derived= TRUE;
 
   if (table_list->lock_type == TL_WRITE_DELAYED)
   {
@@ -711,6 +727,7 @@ bool mysql_insert(THD *thd,TABLE_LIST *table_list,
     if (open_and_lock_tables(thd, table_list, TRUE, 0))
       DBUG_RETURN(TRUE);
   }
+
   lock_type= table_list->lock_type;
 
   thd_proc_info(thd, "init");
@@ -884,7 +901,12 @@ bool mysql_insert(THD *thd,TABLE_LIST *table_list,
           be overwritten by fill_record() anyway (and fill_record() does not
           use default values in this case).
         */
-        table->record[0][0]= share->default_values[0];
+#ifdef HAVE_valgrind
+        if (table->file->ha_table_flags() & HA_RECORD_MUST_BE_CLEAN_ON_WRITE)
+          restore_record(table,s->default_values);	// Get empty record
+        else
+#endif
+          table->record[0][0]= share->default_values[0];
 
         /* Fix undefined null_bits. */
         if (share->null_bytes > 1 && share->last_null_bit_pos)
@@ -1095,6 +1117,12 @@ bool mysql_insert(THD *thd,TABLE_LIST *table_list,
     ::my_ok(thd, info.copied + info.deleted + updated, id, buff);
   }
   thd->abort_on_warning= 0;
+  if (thd->lex->current_select->first_cond_optimization)
+  {
+    thd->lex->current_select->save_leaf_tables(thd);
+    thd->lex->current_select->first_cond_optimization= 0;
+  }
+  
   DBUG_RETURN(FALSE);
 
 abort:
@@ -1223,6 +1251,11 @@ static bool mysql_prepare_insert_check_table(THD *thd, TABLE_LIST *table_list,
   bool insert_into_view= (table_list->view != 0);
   DBUG_ENTER("mysql_prepare_insert_check_table");
 
+  if (!table_list->updatable)
+  {
+    my_error(ER_NON_INSERTABLE_TABLE, MYF(0), table_list->alias, "INSERT");
+    DBUG_RETURN(TRUE);
+  }
   /*
      first table in list is the one we'll INSERT into, requires INSERT_ACL.
      all others require SELECT_ACL only. the ACL requirement below is for
@@ -1233,14 +1266,16 @@ static bool mysql_prepare_insert_check_table(THD *thd, TABLE_LIST *table_list,
   if (setup_tables_and_check_access(thd, &thd->lex->select_lex.context,
                                     &thd->lex->select_lex.top_join_list,
                                     table_list,
-                                    &thd->lex->select_lex.leaf_tables,
-                                    select_insert, INSERT_ACL, SELECT_ACL))
+                                    thd->lex->select_lex.leaf_tables,
+                                    select_insert, INSERT_ACL, SELECT_ACL,
+                                    TRUE))
     DBUG_RETURN(TRUE);
 
   if (insert_into_view && !fields.elements)
   {
     thd->lex->empty_field_list_on_rset= 1;
-    if (!table_list->table)
+    if (!thd->lex->select_lex.leaf_tables.head()->table ||
+        table_list->is_multitable())
     {
       my_error(ER_VIEW_NO_INSERT_FIELD_LIST, MYF(0),
                table_list->view_db.str, table_list->view_name.str);
@@ -1331,6 +1366,12 @@ bool mysql_prepare_insert(THD *thd, TABLE_LIST *table_list,
   /* INSERT should have a SELECT or VALUES clause */
   DBUG_ASSERT (!select_insert || !values);
 
+  if (mysql_handle_derived(thd->lex, DT_INIT))
+    DBUG_RETURN(TRUE); 
+  if (table_list->handle_derived(thd->lex, DT_MERGE_FOR_INSERT))
+    DBUG_RETURN(TRUE); 
+  if (mysql_handle_list_of_derived(thd->lex, table_list, DT_PREPARE))
+    DBUG_RETURN(TRUE); 
   /*
     For subqueries in VALUES() we should not see the table in which we are
     inserting (for INSERT ... SELECT this is done by changing table_list,
@@ -1626,7 +1667,7 @@ int write_record(THD *thd, TABLE *table,COPY_INFO *info)
           table->file->adjust_next_insert_id_after_explicit_value(
             table->next_number_field->val_int());
         info->touched++;
-        if (!records_are_comparable(table) || compare_records(table))
+        if (!records_are_comparable(table) || compare_record(table))
         {
           if ((error=table->file->ha_update_row(table->record[1],
                                                 table->record[0])) &&
@@ -1837,10 +1878,11 @@ class delayed_row :public ilink {
 public:
   char *record;
   enum_duplicates dup;
-  time_t start_time;
+  my_time_t start_time;
+  ulong start_time_sec_part;
   ulong sql_mode;
   bool auto_increment_field_not_null;
-  bool query_start_used, ignore, log_query;
+  bool query_start_used, ignore, log_query, query_start_sec_part_used;
   bool stmt_depends_on_first_successful_insert_id_in_prev_stmt;
   ulonglong first_successful_insert_id_in_prev_stmt;
   ulonglong forced_insert_id;
@@ -2094,6 +2136,11 @@ bool delayed_get_table(THD *thd, MDL_request *grl_protection_request,
       mysql_mutex_lock(&LOCK_thread_count);
       thread_count++;
       mysql_mutex_unlock(&LOCK_thread_count);
+      /*
+        Annotating delayed inserts is not supported.
+      */
+      di->thd.variables.binlog_annotate_rows_events= 0;
+
       di->thd.set_db(table_list->db, (uint) strlen(table_list->db));
       di->thd.set_query(my_strdup(table_list->table_name,
                                   MYF(MY_WME | ME_FATALERROR)),
@@ -2218,6 +2265,7 @@ TABLE *Delayed_insert::get_local_table(THD* client_thd)
   TABLE *copy;
   TABLE_SHARE *share;
   uchar *bitmap;
+  char *copy_tmp;
   DBUG_ENTER("Delayed_insert::get_local_table");
 
   /* First request insert thread to get a lock */
@@ -2266,14 +2314,15 @@ TABLE *Delayed_insert::get_local_table(THD* client_thd)
     the other record buffers and alignment are unnecessary.
   */
   thd_proc_info(client_thd, "allocating local table");
-  copy= (TABLE*) client_thd->alloc(sizeof(*copy)+
-				   (share->fields+1)*sizeof(Field**)+
-				   share->reclength +
-                                   share->column_bitmap_size*3);
-  if (!copy)
+  copy_tmp= (char*) client_thd->alloc(sizeof(*copy)+
+                                      (share->fields+1)*sizeof(Field**)+
+                                      share->reclength +
+                                      share->column_bitmap_size*3);
+  if (!copy_tmp)
     goto error;
 
   /* Copy the TABLE object. */
+  copy= new (copy_tmp) TABLE;
   *copy= *table;
   /* We don't need to change the file handler here */
   /* Assign the pointers for the field pointers array and the record. */
@@ -2385,8 +2434,10 @@ int write_delayed(THD *thd, TABLE *table, enum_duplicates duplic,
   if (!(row->record= (char*) my_malloc(table->s->reclength, MYF(MY_WME))))
     goto err;
   memcpy(row->record, table->record[0], table->s->reclength);
-  row->start_time=		thd->start_time;
-  row->query_start_used=	thd->query_start_used;
+  row->start_time=                thd->start_time;
+  row->query_start_used=          thd->query_start_used;
+  row->start_time_sec_part=       thd->start_time_sec_part;
+  row->query_start_sec_part_used= thd->query_start_sec_part_used;
   /*
     those are for the binlog: LAST_INSERT_ID() has been evaluated at this
     time, so record does not need it, but statement-based binlogging of the
@@ -2802,13 +2853,14 @@ pthread_handler_t handle_delayed_insert(void *arg)
     DBUG_LEAVE;
   }
 
-  close_thread_tables(thd);			// Free the table
-  thd->mdl_context.release_transactional_locks();
   di->table=0;
   thd->killed= THD::KILL_CONNECTION;	        // If error
-  mysql_cond_broadcast(&di->cond_client);       // Safety
   mysql_mutex_unlock(&di->mutex);
 
+  close_thread_tables(thd);			// Free the table
+  thd->mdl_context.release_transactional_locks();
+  mysql_cond_broadcast(&di->cond_client);       // Safety
+
   mysql_mutex_lock(&LOCK_delayed_create);       // Because of delayed_get_table
   mysql_mutex_lock(&LOCK_delayed_insert);
   /*
@@ -2909,6 +2961,8 @@ bool Delayed_insert::handle_inserts(void)
 
     thd.start_time=row->start_time;
     thd.query_start_used=row->query_start_used;
+    thd.start_time_sec_part=row->start_time_sec_part;
+    thd.query_start_sec_part_used=row->query_start_sec_part_used;
     /*
       To get the exact auto_inc interval to store in the binlog we must not
       use values from the previous interval (of the previous rows).
@@ -3128,9 +3182,9 @@ bool mysql_insert_select_prepare(THD *thd)
 {
   LEX *lex= thd->lex;
   SELECT_LEX *select_lex= &lex->select_lex;
-  TABLE_LIST *first_select_leaf_table;
   DBUG_ENTER("mysql_insert_select_prepare");
 
+
   /*
     SELECT_LEX do not belong to INSERT statement, so we can't add WHERE
     clause if table is VIEW
@@ -3143,21 +3197,38 @@ bool mysql_insert_select_prepare(THD *thd)
                            &select_lex->where, TRUE, FALSE, FALSE))
     DBUG_RETURN(TRUE);
 
+  DBUG_ASSERT(select_lex->leaf_tables.elements != 0);
+  List_iterator<TABLE_LIST> ti(select_lex->leaf_tables);
+  TABLE_LIST *table;
+  uint insert_tables;
+
+  if (select_lex->first_cond_optimization)
+  {
+    /* Back up leaf_tables list. */
+    Query_arena *arena= thd->stmt_arena, backup;
+    arena= thd->activate_stmt_arena_if_needed(&backup);  // For easier test
+
+    insert_tables= select_lex->insert_tables;
+    while ((table= ti++) && insert_tables--)
+    {
+      select_lex->leaf_tables_exec.push_back(table);
+      table->tablenr_exec= table->table->tablenr;
+      table->map_exec= table->table->map;
+      table->maybe_null_exec= table->table->maybe_null;
+    }
+    if (arena)
+      thd->restore_active_arena(arena, &backup);
+  }
+  ti.rewind();
   /*
     exclude first table from leaf tables list, because it belong to
     INSERT
   */
-  DBUG_ASSERT(select_lex->leaf_tables != 0);
-  lex->leaf_tables_insert= select_lex->leaf_tables;
   /* skip all leaf tables belonged to view where we are insert */
-  for (first_select_leaf_table= select_lex->leaf_tables->next_leaf;
-       first_select_leaf_table &&
-       first_select_leaf_table->belong_to_view &&
-       first_select_leaf_table->belong_to_view ==
-       lex->leaf_tables_insert->belong_to_view;
-       first_select_leaf_table= first_select_leaf_table->next_leaf)
-  {}
-  select_lex->leaf_tables= first_select_leaf_table;
+  insert_tables= select_lex->insert_tables;
+  while ((table= ti++) && insert_tables--)
+    ti.remove();
+
   DBUG_RETURN(FALSE);
 }
 
@@ -3375,7 +3446,7 @@ void select_insert::cleanup()
 select_insert::~select_insert()
 {
   DBUG_ENTER("~select_insert");
-  if (table)
+  if (table && table->created)
   {
     table->next_number_field=0;
     table->auto_increment_field_not_null= FALSE;
@@ -3387,7 +3458,7 @@ select_insert::~select_insert()
 }
 
 
-bool select_insert::send_data(List<Item> &values)
+int select_insert::send_data(List<Item> &values)
 {
   DBUG_ENTER("select_insert::send_data");
   bool error=0;
@@ -3687,9 +3758,6 @@ static TABLE *create_table_from_items(THD *thd, HA_CREATE_INFO *create_info,
 
   tmp_table.s->db_create_options=0;
   tmp_table.s->blob_ptr_size= portable_sizeof_char_ptr;
-  tmp_table.s->db_low_byte_first= 
-        test(create_info->db_type == myisam_hton ||
-             create_info->db_type == heap_hton);
   tmp_table.null_row= 0;
   tmp_table.maybe_null= 0;
 
@@ -3777,7 +3845,11 @@ static TABLE *create_table_from_items(THD *thd, HA_CREATE_INFO *create_info,
       }
     }
     if (!table)                                   // open failed
+    {
+      if (!thd->is_error())                     // CREATE ... IF NOT EXISTS
+        my_ok(thd);                             //   succeed, but did nothing
       DBUG_RETURN(0);
+    }
   }
 
   DBUG_EXECUTE_IF("sleep_create_select_before_lock", my_sleep(6000000););
diff --git a/sql/sql_join_cache.cc b/sql/sql_join_cache.cc
index 7a08dadf049..b0fdb7a1c42 100644
--- a/sql/sql_join_cache.cc
+++ b/sql/sql_join_cache.cc
@@ -34,6 +34,7 @@
 
 #define NO_MORE_RECORDS_IN_BUFFER  (uint)(-1)
 
+static void save_or_restore_used_tabs(JOIN_TAB *join_tab, bool save);
 
 /*****************************************************************************
  *  Join cache module
@@ -58,7 +59,7 @@
     The function ignores the fields 'blob_length' and 'ofset' of the
     descriptor.
 
-  RETURN
+  RETURN VALUE
     the length of the field  
 */
 
@@ -99,7 +100,7 @@ uint add_flag_field_to_join_cache(uchar *str, uint length, CACHE_FIELD **field)
     descriptor  while 'descr_ptr' points to the position right after the
     last added pointer.  
 
-  RETURN
+  RETURN VALUE
     the total length of the added fields  
 */
 
@@ -138,7 +139,6 @@ uint add_table_data_fields_to_join_cache(JOIN_TAB *tab,
   *descr_ptr= copy_ptr;
   return len;
 }
-    
 
 /* 
   Determine different counters of fields associated with a record in the cache  
@@ -153,16 +153,61 @@ uint add_table_data_fields_to_join_cache(JOIN_TAB *tab,
     The function sets 'with_match_flag' on if 'join_tab' needs a match flag
     i.e. if it is the first inner table of an outer join or a semi-join.  
 
-  RETURN
+  RETURN VALUE
     none 
 */
 
 void JOIN_CACHE::calc_record_fields()
 {
-  JOIN_TAB *tab = prev_cache ? prev_cache->join_tab :
-                               join->join_tab+join->const_tables;
-  tables= join_tab-tab;
+  JOIN_TAB *tab;
+
+  if (prev_cache)
+    tab= prev_cache->join_tab;
+  else
+  {
+    if (join_tab->bush_root_tab)
+    {
+      /* 
+        --ot1--SJM1--------------ot2--...
+                |
+                |
+                +-it1--...--itN
+                        ^____________ this->join_tab is somewhere here, 
+                                      inside an sjm nest.
+
+        The join buffer should store the values of it1.*, it2.*, ..
+        It should not store values of ot1.*.
+      */
+      tab= join_tab->bush_root_tab->bush_children->start;
+    }
+    else
+    {
+      /*
+        -ot1--ot2--SJM1--SJM2--------------ot3--...--otN
+                    |     |                      ^   
+                    |     +-it21--...--it2N      |
+                    |                            \-- we're somewhere here,
+                    +-it11--...--it1N                at the top level
+        
+        The join buffer should store the values of 
+
+          ot1.*, ot2.*, it1{i}, it2{j}.*, ot3.*, ...
+        
+        that is, we should start from the first non-const top-level table. 
+
+        We will need to store columns of SJ-inner tables (it_X_Y.*), but we're
+        not interested in storing the columns of materialization tables
+        themselves. Because of that, if the first non-const top-level table is a
+        materialized table, we move to its bush_children:
+      */
+      tab= join->join_tab + join->const_tables;
+      if (tab->bush_children)
+        tab= tab->bush_children->start;
+    }
+  }
+  DBUG_ASSERT(!tab->bush_children);
 
+  start_tab= tab;
   fields= 0;
   blobs= 0;
   flag_fields= 0;
@@ -170,9 +215,13 @@ void JOIN_CACHE::calc_record_fields()
   data_field_ptr_count= 0;
   referenced_fields= 0;
 
-  for ( ; tab < join_tab ; tab++)
+  /*
+    The following loop will get inside SJM nests, because data may be unpacked
+    to sjm-inner tables.
+  */
+  for (; tab != join_tab ; tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
   {	    
-    calc_used_field_length(join->thd, tab);
+    tab->calc_used_field_length(FALSE);
     flag_fields+= test(tab->used_null_fields || tab->used_uneven_bit_fields);
     flag_fields+= test(tab->table->maybe_null);
     fields+= tab->used_fields;
@@ -185,6 +234,74 @@ void JOIN_CACHE::calc_record_fields()
   fields+= flag_fields;
 }
 
+
+/* 
+  Collect information on join key arguments  
+
+  SYNOPSIS
+    collect_info_on_key_args()
+
+  DESCRIPTION
+    The function traverses the ref expressions that are used to access the
+    joined table join_tab. For each table 'tab' whose fields are to be stored
+    in the join buffer of the cache the function finds the fields from 'tab'
+    that occur in the ref expressions and marks these fields in the bitmap
+    tab->table->tmp_set. The function counts the number of them stored
+    in this cache and the total number of them stored in the previous caches
+    and saves the results of the counting in 'local_key_arg_fields' and
+    'external_key_arg_fields' respectively.
+
+  NOTES
+    The function does not do anything if no key is used to join the records
+    from join_tab.
+    
+  RETURN VALUE
+    none 
+*/  
+
+void JOIN_CACHE::collect_info_on_key_args()
+{
+  local_key_arg_fields= 0;
+  external_key_arg_fields= 0;
+
+  /* Nothing to collect unless join_tab is accessed by key */
+  if (!is_key_access())
+    return;
+
+  TABLE_REF *ref= &join_tab->ref;
+
+  /*
+    Start with this cache, then follow the prev_cache links. Fields found
+    in this cache are counted as local, fields of earlier caches as external.
+  */
+  for (JOIN_CACHE *cache= this; cache; cache= cache->prev_cache)
+  {
+    for (JOIN_TAB *tab= cache->start_tab; tab != cache->join_tab;
+         tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
+    {
+      TABLE *tbl= tab->table;
+
+      /* Mark in tmp_set the fields of tbl that occur in the ref items */
+      bitmap_clear_all(&tbl->tmp_set);
+      for (uint part= 0; part < ref->key_parts; part++)
+      {
+        Item *ref_item= ref->items[part];
+        if (tbl->map & ref_item->used_tables())
+          ref_item->walk(&Item::add_field_to_set_processor, 1,
+                         (uchar *) tbl);
+      }
+
+      /* Adding a zero count changes nothing, so no emptiness test */
+      uint key_args= bitmap_bits_set(&tbl->tmp_set);
+      if (cache == this)
+        local_key_arg_fields+= key_args;
+      else
+        external_key_arg_fields+= key_args;
+    }
+  }
+}
+
+
 /* 
   Allocate memory for descriptors and pointers to them associated with the cache  
 
@@ -196,23 +313,22 @@ void JOIN_CACHE::calc_record_fields()
     and the array of pointers to the field descriptors used to copy
     join record data from record buffers into the join buffer and
     backward. Some pointers refer to the field descriptor associated
-    with previous caches. They are placed at the beginning of the
-    array of pointers and its total number is specified by the parameter
-    'external fields'.
-    The pointer of the first array is assigned to field_descr and the 
-    number of elements is precalculated by the function calc_record_fields. 
+    with previous caches. They are placed at the beginning of the array
+    of pointers and their total number is stored in external_key_arg_fields.
+    The pointer of the first array is assigned to field_descr and the number
+    of the elements in it is precalculated by the function calc_record_fields. 
     The allocated arrays are adjacent.
   
   NOTES
     The memory is allocated in join->thd->memroot
 
-  RETURN
+  RETURN VALUE
     pointer to the first array  
 */
 
-int JOIN_CACHE::alloc_fields(uint external_fields)
+int JOIN_CACHE::alloc_fields()
 {
-  uint ptr_cnt= external_fields+blobs+1;
+  uint ptr_cnt= external_key_arg_fields+blobs+1;
   uint fields_size= sizeof(CACHE_FIELD)*fields;
   field_descr= (CACHE_FIELD*) sql_alloc(fields_size +
                                         sizeof(CACHE_FIELD*)*ptr_cnt);
@@ -220,6 +336,7 @@ int JOIN_CACHE::alloc_fields(uint external_fields)
   return (field_descr == NULL);
 }  
 
+
 /* 
   Create descriptors of the record flag fields stored in the join buffer 
 
@@ -253,7 +370,7 @@ int JOIN_CACHE::alloc_fields(uint external_fields)
     The function sets the value of 'length' to the total length of the
     flag fields.
   
-  RETURN
+  RETURN VALUE
     none
 */
 
@@ -273,7 +390,8 @@ void JOIN_CACHE::create_flag_fields()
 	                                  &copy);
 
   /* Create fields for all null bitmaps and null row flags that are needed */
-  for (tab= join_tab-tables; tab < join_tab; tab++)
+  for (tab= start_tab; tab != join_tab; 
+       tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
   {
     TABLE *table= tab->table;
 
@@ -296,17 +414,146 @@ void JOIN_CACHE::create_flag_fields()
 
 
 /* 
+  Create descriptors of the fields used to build access keys to the joined table
+
+  SYNOPSIS
+    create_key_arg_fields()
+
+  DESCRIPTION
+    The function creates descriptors of the record fields stored in the join
+    buffer that are used to build access keys to the joined table. These
+    fields are put into the buffer ahead of other records fields stored in
+    the buffer. Such placement helps to optimize construction of access keys.
+    For each field that is used to build access keys to the joined table but
+    is stored in some other join cache buffer the function saves a pointer
+    to the field descriptor. The array of such pointers is placed in
+    the join cache structure just before the array of pointers to the
+    blob fields blob_ptr.
+    Any field stored in a join cache buffer that is used to construct keys
+    to access tables associated with other join caches is called a referenced
+    field. It receives a unique number that is saved by the function in the
+    member 'referenced_field_no' of the CACHE_FIELD descriptor for the field.
+    This number is used as index to the array of offsets to the referenced
+    fields that are saved and put in the join cache buffer after all record
+    fields.
+    The function also finds out whether the keys to access join_tab
+    can be considered as embedded and, if so, sets the flag 'use_emb_key' in
+    this join cache appropriately. 
+     
+  NOTES.
+    When a key to access the joined table 'join_tab' is constructed the array
+    of pointers to the field descriptors for the external fields is looked
+    through. For each of these pointers we find out in what previous key cache
+    the referenced field is stored. The value of 'referenced_field_no'
+    provides us with the index into the array of offsets for referenced 
+    fields stored in the join cache. The offset read by the index allows
+    us to read the field without reading all other fields of the record 
+    stored in the join cache buffer. This optimizes the construction of keys
+    to access 'join_tab' when some key arguments are stored in the previous
+    join caches.  
+
+  NOTES
+    The function does not do anything if no key is used to join the records
+    from join_tab.
+ 
+  RETURN VALUE
+    none
+*/
+void JOIN_CACHE::create_key_arg_fields()
+{
+  JOIN_TAB *tab;
+  JOIN_CACHE *cache;
+
+  if (!is_key_access())
+    return;
+
+  /* 
+    Save pointers to the cache fields in previous caches
+    that  are used to build keys for this key access.
+  */
+  cache= this;
+  uint ext_key_arg_cnt= external_key_arg_fields;
+  CACHE_FIELD *copy;
+  CACHE_FIELD **copy_ptr= blob_ptr;
+  while (ext_key_arg_cnt)
+  {
+    cache= cache->prev_cache;
+    for (tab= cache->start_tab; tab != cache->join_tab; 
+         tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
+    { 
+      CACHE_FIELD *copy_end;
+      MY_BITMAP *key_read_set= &tab->table->tmp_set;
+      /* key_read_set contains the bitmap of tab's fields referenced by ref */ 
+      if (bitmap_is_clear_all(key_read_set))
+        continue;
+      copy_end= cache->field_descr+cache->fields;
+      for (copy= cache->field_descr+cache->flag_fields; copy < copy_end; copy++)
+      {
+        /*
+          (1) - when we store rowids for DuplicateWeedout, they have
+                copy->field==NULL
+        */
+        if (copy->field &&  // (1)
+            copy->field->table == tab->table &&
+            bitmap_is_set(key_read_set, copy->field->field_index))
+        {
+          *copy_ptr++= copy; 
+          ext_key_arg_cnt--;
+          if (!copy->referenced_field_no)
+          {
+            /* 
+              Register the referenced field 'copy': 
+              - set the offset number in copy->referenced_field_no,
+              - adjust the value of the flag 'with_length',
+              - adjust the values of 'pack_length' and 
+                of 'pack_length_with_blob_ptrs'.
+	    */
+            copy->referenced_field_no= ++cache->referenced_fields;
+            if (!cache->with_length)
+            {
+              cache->with_length= TRUE;
+              uint sz= cache->get_size_of_rec_length();
+              cache->base_prefix_length+= sz;
+              cache->pack_length+= sz;
+              cache->pack_length_with_blob_ptrs+= sz;
+            }
+	    cache->pack_length+= cache->get_size_of_fld_offset();
+            cache->pack_length_with_blob_ptrs+= cache->get_size_of_fld_offset();
+          }        
+        }
+      }
+    } 
+  }
+  /* After this 'blob_ptr' shall not be be changed */ 
+  blob_ptr= copy_ptr;
+  
+  /* Now create local fields that are used to build ref for this key access */
+  copy= field_descr+flag_fields;
+  for (tab= start_tab; tab != join_tab; 
+       tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
+  {
+    length+= add_table_data_fields_to_join_cache(tab, &tab->table->tmp_set,
+                                                 &data_field_count, &copy,
+                                                 &data_field_ptr_count, 
+                                                 &copy_ptr);
+  }
+
+  use_emb_key= check_emb_key_usage();
+
+  return;
+}
+
+
+/* 
   Create descriptors of all remaining data fields stored in the join buffer    
 
   SYNOPSIS
     create_remaining_fields()
-      all_read_fields   indicates that descriptors for all read data fields
-                        are to be created
 
   DESCRIPTION
     The function creates descriptors for all remaining data fields of a
-    record from the join buffer. If the parameter 'all_read_fields' is
-    true the function creates fields for all read record fields that
+    record from the join buffer. If the value returned by is_key_access() is
+    false the function creates fields for all read record fields that
     comprise the partial join record joined with join_tab. Otherwise, 
     for each table tab, the set of the read fields for which the descriptors
     have to be added is determined as the difference between all read fields
@@ -316,7 +563,7 @@ void JOIN_CACHE::create_flag_fields()
     the added fields.
    
   NOTES
-    If 'all_read_fields' is false the function modifies the value of
+    If is_key_access() returns true the function modifies the value of
     tab->table->tmp_set for a each table whose fields are stored in the cache.
     The function calls the method Field::fill_cache_field to figure out
     the type of the cache field and the maximal length of its representation
@@ -328,17 +575,19 @@ void JOIN_CACHE::create_flag_fields()
     contains the number of the pointers to such descriptors having been
     stored up to the moment.
 
-  RETURN
+  RETURN VALUE
     none 
 */
 
-void JOIN_CACHE:: create_remaining_fields(bool all_read_fields)
+void JOIN_CACHE::create_remaining_fields()
 {
   JOIN_TAB *tab;
+  bool all_read_fields= !is_key_access();
   CACHE_FIELD *copy= field_descr+flag_fields+data_field_count;
   CACHE_FIELD **copy_ptr= blob_ptr+data_field_ptr_count;
 
-  for (tab= join_tab-tables; tab < join_tab; tab++)
+  for (tab= start_tab; tab != join_tab; 
+       tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
   {
     MY_BITMAP *rem_field_set;
     TABLE *table= tab->table;
@@ -361,8 +610,15 @@ void JOIN_CACHE:: create_remaining_fields(bool all_read_fields)
     if (tab->keep_current_rowid)
     {
       copy->str= table->file->ref;
-      copy->length= table->file->ref_length;
-      copy->type= 0;
+      if (copy->str)
+        copy->length= table->file->ref_length;
+      else
+      {
+        /* This may happen only for materialized derived tables and views */
+        copy->length= 0;
+        copy->str= (uchar *) table;
+      } 
+      copy->type= CACHE_ROWID;
       copy->field= 0;
       copy->referenced_field_no= 0;
       length+= copy->length;
@@ -373,6 +629,7 @@ void JOIN_CACHE:: create_remaining_fields(bool all_read_fields)
 }
 
 
+
 /* 
   Calculate and set all cache constants      
 
@@ -390,7 +647,7 @@ void JOIN_CACHE:: create_remaining_fields(bool all_read_fields)
     making a dicision whether more records should be added into the join
     buffer or not.
   
-  RETURN
+  RETURN VALUE
     none 
 */
 
@@ -425,6 +682,8 @@ void JOIN_CACHE::set_constants()
   size_of_rec_ofs= offset_size(buff_size);
   size_of_rec_len= blobs ? size_of_rec_ofs : offset_size(len); 
   size_of_fld_ofs= size_of_rec_len;
+  base_prefix_length= (with_length ? size_of_rec_len : 0) +
+                      (prev_cache ? prev_cache->get_size_of_rec_offset() : 0);
   /* 
     The size of the offsets for referenced fields will be added later.
     The values of 'pack_length' and 'pack_length_with_blob_ptrs' are adjusted
@@ -438,236 +697,357 @@ void JOIN_CACHE::set_constants()
 
 
 /* 
-  Allocate memory for a join buffer      
+  Get maximum total length of all affixes of a record in the join cache buffer
 
   SYNOPSIS
-    alloc_buffer()
+    get_record_max_affix_length()
 
   DESCRIPTION
-    The function allocates a lump of memory for the cache join buffer. The
-    size of the allocated memory is 'buff_size' bytes. 
-  
-  RETURN
-    0 - if the memory has been successfully allocated
-    1 - otherwise
+    The function calculates the maximum possible total length of all affixes
+    of a record in the join cache buffer, that is made of:
+      - the length of all prefixes used in this cache,
+      - the length of the match flag if it's needed
+      - the total length of the maximum possible offsets to the fields of
+        a record in the buffer.
+
+  RETURN VALUE
+    The maximum total length of all affixes of a record in the join buffer  
+*/ 
+     
+uint JOIN_CACHE::get_record_max_affix_length()
+{
+  uint len= get_prefix_length() +
+            test(with_match_flag) + 
+            size_of_fld_ofs * data_field_count;
+  return len;
+}
+
+
+/* 
+  Get the minimum possible size of the cache join buffer 
+
+  SYNOPSIS
+    get_min_join_buffer_size()
+
+  DESCRIPTION
+    At its first invocation for the cache the function calculates the
+    minimum possible size of the join buffer of the cache. This value depends
+    on the minimal number of records 'min_records' to be stored in the join
+    buffer. The number is supposed to be determined by the procedure that 
+    chooses the best access path to the joined table join_tab in the execution
+    plan. After the calculation of the interesting size the function saves it
+    in the field 'min_buff_size' in order to use it directly at the next     
+    invocations of the function.
+
+  NOTES
+    Currently the number of minimal records is just set to 1.
+
+  RETURN VALUE
+    The minimal possible size of the join buffer of this cache 
 */
 
-int JOIN_CACHE::alloc_buffer()
+ulong JOIN_CACHE::get_min_join_buffer_size()
 {
-  buff= (uchar*) my_malloc(buff_size, MYF(0));
-  return buff == NULL;
-}    	
-  
+  if (!min_buff_size)
+  {
+    size_t len= 0;
+    for (JOIN_TAB *tab= start_tab; tab != join_tab; 
+         tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
+    {
+      len+= tab->get_max_used_fieldlength();
+    }
+    len+= get_record_max_affix_length() + get_max_key_addon_space_per_record();  
+    size_t min_sz= len*min_records;
+    size_t add_sz= 0;
+    for (uint i=0; i < min_records; i++)
+      add_sz+= join_tab_scan->aux_buffer_incr(i+1);
+    avg_aux_buffer_incr= add_sz/min_records;
+    min_sz+= add_sz;
+    min_sz+= pack_length_with_blob_ptrs;
+    set_if_bigger(min_sz, 1);
+    min_buff_size= min_sz;
+  }
+  return min_buff_size;
+}
+
 
 /* 
-  Initialize a BNL cache       
+  Get the maximum possible size of the cache join buffer 
 
   SYNOPSIS
-    init()
+    get_max_join_buffer_size()
+
+    optimize_buff_size  TRUE <-> do not take more memory than needed for
+                        the estimated number of records in the partial join 
 
   DESCRIPTION
-    The function initializes the cache structure. It supposed to be called
-    right after a constructor for the JOIN_CACHE_BNL.
-    The function allocates memory for the join buffer and for descriptors of
-    the record fields stored in the buffer.
+    At its first invocation for the cache the function calculates the
+    maximum possible size of the join buffer for the cache. If the parameter
+    optimize_buff_size is true then this value does not exceed the size of the
+    space needed for the estimated number of records 'max_records' in the
+    partial join that joins tables from the first one through join_tab. This
+    value is also capped by the value of join_tab->join_buffer_size_limit,
+    if it has been set to a non-zero value, and by the value of the system
+    parameter join_buffer_size - otherwise. After the calculation of the
+    size of interest the function saves the value in the field 'max_buff_size'
+    in order to use it directly at the next invocations of the function.
 
   NOTES
-    The code of this function should have been included into the constructor
-    code itself. However the new operator for the class JOIN_CACHE_BNL would
-    never fail while memory allocation for the join buffer is not absolutely
-    unlikely to fail. That's why this memory allocation has to be placed in a
-    separate function that is called in a couple with a cache constructor.
-    It is quite natural to put almost all other constructor actions into
-    this function.     
-  
-  RETURN
-    0   initialization with buffer allocations has been succeeded
-    1   otherwise
+    Currently the value of join_tab->join_buffer_size_limit is initialized
+    to 0 and is never reset.
+
+  RETURN VALUE
+    The maximum possible size of the join buffer of this cache 
 */
 
-int JOIN_CACHE_BNL::init()
+ulong JOIN_CACHE::get_max_join_buffer_size(bool optimize_buff_size)
 {
-  DBUG_ENTER("JOIN_CACHE::init");
+  if (!max_buff_size)
+  {
+    size_t max_sz;
+    size_t min_sz= get_min_join_buffer_size(); 
+    size_t len= 0;
+    for (JOIN_TAB *tab= start_tab; tab != join_tab;
+         tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
+    {
+      len+= tab->get_used_fieldlength();
+    }
+    len+= get_record_max_affix_length();
+    avg_record_length= len;
+    len+= get_max_key_addon_space_per_record() + avg_aux_buffer_incr;
+    space_per_record= len;
+    
+    size_t limit_sz= join->thd->variables.join_buff_size;
+    if (join_tab->join_buffer_size_limit)
+      set_if_smaller(limit_sz, join_tab->join_buffer_size_limit);
+    if (!optimize_buff_size)
+      max_sz= limit_sz;
+    else
+    {    
+      if (limit_sz / max_records > space_per_record)
+        max_sz= space_per_record * max_records;
+      else
+        max_sz= limit_sz;
+      max_sz+= pack_length_with_blob_ptrs;
+      set_if_smaller(max_sz, limit_sz);
+    }
+    set_if_bigger(max_sz, min_sz);
+    max_buff_size= max_sz;
+  }
+  return max_buff_size;
+}    
+      
 
-  calc_record_fields();
+/* 
+  Allocate memory for a join buffer      
 
-  if (alloc_fields(0))
-    DBUG_RETURN(1);
+  SYNOPSIS
+    alloc_buffer()
 
-  create_flag_fields();
+  DESCRIPTION
+    The function allocates a lump of memory for the cache join buffer. 
+    Initially the function sets the size of the buffer buff_size equal to
+    the value returned by get_max_join_buffer_size(). If the total size of
+    the space intended to be used for the join buffers employed by the
+    tables from the first one through join_tab exceeds the value of the
+    system parameter join_buff_space_limit, then the function first tries
+    to shrink the used buffers to make the occupied space fit the maximum
+    memory allowed to be used for all join buffers in total. After
+    this the function tries to allocate a join buffer for join_tab.
+    If it fails to do so, it decrements the requested size of the join
+    buffer, shrinks proportionally the join buffers used for the previous
+    tables and tries to allocate a buffer for join_tab. In the case of a
+    failure the function repeats its attempts with smaller and smaller
+    requested sizes of the buffer, but not more than 4 times.
   
-  create_remaining_fields(TRUE);
+  RETURN VALUE
+    0   if the memory has been successfully allocated
+    1   otherwise
+*/
 
-  set_constants();
+int JOIN_CACHE::alloc_buffer()
+{
+  JOIN_TAB *tab;
+  JOIN_CACHE *cache;
+  ulonglong curr_buff_space_sz= 0;
+  ulonglong curr_min_buff_space_sz= 0;
+  ulonglong join_buff_space_limit=
+    join->thd->variables.join_buff_space_limit;
+  bool optimize_buff_size= 
+         optimizer_flag(join->thd, OPTIMIZER_SWITCH_OPTIMIZE_JOIN_BUFFER_SIZE);
+  double partial_join_cardinality=  (join_tab-1)->get_partial_join_cardinality();
+  buff= NULL;
+  min_buff_size= 0;
+  max_buff_size= 0;
+  min_records= 1;
+  max_records= (size_t) (partial_join_cardinality <= join_buff_space_limit ?
+                 (ulonglong) partial_join_cardinality : join_buff_space_limit);
+  set_if_bigger(max_records, 10);
+  min_buff_size= get_min_join_buffer_size();
+  buff_size= get_max_join_buffer_size(optimize_buff_size);
+
+  for (tab= start_tab; tab!= join_tab; 
+       tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
+  {
+    cache= tab->cache;
+    if (cache)
+    {
+      curr_min_buff_space_sz+= cache->get_min_join_buffer_size();
+      curr_buff_space_sz+= cache->get_join_buffer_size();
+    }
+  }
 
-  if (alloc_buffer())
-    DBUG_RETURN(1); 
-  
-  reset(TRUE); 
+  if (curr_min_buff_space_sz > join_buff_space_limit ||
+      (curr_buff_space_sz > join_buff_space_limit &&
+       (!optimize_buff_size || 
+        join->shrink_join_buffers(join_tab, curr_buff_space_sz,
+                                  join_buff_space_limit))))
+    goto fail;
+                               
+  for (ulong buff_size_decr= (buff_size-min_buff_size)/4 + 1; ; )
+  {
+    ulong next_buff_size;
 
-  DBUG_RETURN(0);
+    if ((buff= (uchar*) my_malloc(buff_size, MYF(0))))
+      break;
+
+    next_buff_size= buff_size > buff_size_decr ? buff_size-buff_size_decr : 0;
+    if (next_buff_size < min_buff_size ||
+        join->shrink_join_buffers(join_tab, curr_buff_space_sz,
+                                  curr_buff_space_sz-buff_size_decr))
+      goto fail;
+    buff_size= next_buff_size;
+
+    curr_buff_space_sz= 0;
+    for (tab= join->join_tab+join->const_tables; tab <= join_tab; tab++)
+    {
+      cache= tab->cache;
+      if (cache)
+        curr_buff_space_sz+= cache->get_join_buffer_size();
+    } 
+  }
+  return 0;
+
+fail:
+  buff_size= 0;
+  return 1;
 }
 
+ 
+/*
+  Shrink the size of the cache join buffer in a given ratio
+
+  SYNOPSIS
+    shrink_join_buffer_in_ratio()
+      n           numerator of the ratio to shrink the buffer in
+      d           denominator of the ratio
+
+  DESCRIPTION
+    The function first deallocates the join buffer of the cache. Then
+    it allocates a buffer that is (n/d) times smaller.
+    
+  RETURN VALUE
+    FALSE   on success with allocation of the smaller join buffer 
+    TRUE    otherwise       
+*/
+
+bool JOIN_CACHE::shrink_join_buffer_in_ratio(ulonglong n, ulonglong d)
+{
+  size_t next_buff_size;
+  if (n < d)
+    return FALSE;
+  next_buff_size= (size_t) ((double) buff_size / n * d);
+  set_if_bigger(next_buff_size, min_buff_size);
+  buff_size= next_buff_size;
+  return realloc_buffer();
+}  
+
+
+/*
+  Reallocate the join buffer of a join cache
+ 
+  SYNOPSIS
+    realloc_buffer()
+
+  DESCRIPTION
+    The function reallocates the join buffer of the join cache. After this
+    it resets the buffer for writing.
+
+  NOTES
+    The function assumes that buff_size contains the new value for the join
+    buffer size.  
+
+  RETURN VALUE
+    0   if the buffer has been successfully reallocated
+    1   otherwise
+*/
+
+int JOIN_CACHE::realloc_buffer()
+{
+  int rc;
+  free();
+  rc= test(!(buff= (uchar*) my_malloc(buff_size, MYF(0))));
+  reset(TRUE);
+  return rc;   	
+}
+  
 
 /* 
-  Initialize a BKA cache       
+  Initialize a join cache       
 
   SYNOPSIS
     init()
 
   DESCRIPTION
-    The function initializes the cache structure. It supposed to be called
-    right after a constructor for the JOIN_CACHE_BKA.
+    The function initializes the join cache structure. It is supposed to be called
+    by init methods for classes derived from the JOIN_CACHE.
     The function allocates memory for the join buffer and for descriptors of
     the record fields stored in the buffer.
 
   NOTES
     The code of this function should have been included into the constructor
-    code itself. However the new operator for the class JOIN_CACHE_BKA would
+    code itself. However the new operator for the class JOIN_CACHE would
     never fail while memory allocation for the join buffer is not absolutely
     unlikely to fail. That's why this memory allocation has to be placed in a
     separate function that is called in a couple with a cache constructor.
     It is quite natural to put almost all other constructor actions into
     this function.     
   
-  RETURN
+  RETURN VALUE
     0   initialization with buffer allocations has been succeeded
     1   otherwise
 */
 
-int JOIN_CACHE_BKA::init()
+int JOIN_CACHE::init()
 {
-  JOIN_TAB *tab;
-  JOIN_CACHE *cache;
-  local_key_arg_fields= 0;
-  external_key_arg_fields= 0;
-  DBUG_ENTER("JOIN_CACHE_BKA::init");
+  DBUG_ENTER("JOIN_CACHE::init");
 
   calc_record_fields();
 
-  /* Mark all fields that can be used as arguments for this key access */
-  TABLE_REF *ref= &join_tab->ref;
-  cache= this;
-  do
-  {
-    /* 
-      Traverse the ref expressions and find the occurrences of fields in them for
-      each table 'tab' whose fields are to be stored in the 'cache' join buffer.
-      Mark these fields in the bitmap tab->table->tmp_set.
-      For these fields count the number of them stored in this cache and the
-      total number of them stored in the previous caches. Save the result
-      of the counting 'in local_key_arg_fields' and 'external_key_arg_fields'
-      respectively.
-    */ 
-    for (tab= cache->join_tab-cache->tables; tab < cache->join_tab ; tab++)
-    { 
-      uint key_args;
-      bitmap_clear_all(&tab->table->tmp_set);
-      for (uint i= 0; i < ref->key_parts; i++)
-      {
-        Item *ref_item= ref->items[i]; 
-        if (!(tab->table->map & ref_item->used_tables()))
-	  continue;
-	 ref_item->walk(&Item::add_field_to_set_processor, 1,
-                        (uchar *) tab->table);
-      }
-      if ((key_args= bitmap_bits_set(&tab->table->tmp_set)))
-      {
-        if (cache == this)
-          local_key_arg_fields+= key_args;
-        else
-          external_key_arg_fields+= key_args;
-      }
-    }
-    cache= cache->prev_cache;
-  } 
-  while (cache);
+  collect_info_on_key_args();
 
-  if (alloc_fields(external_key_arg_fields))
+  if (alloc_fields())
     DBUG_RETURN(1);
 
   create_flag_fields();
-  
-  /* 
-    Save pointers to the cache fields in previous caches
-    that  are used to build keys for this key access.
-  */
-  cache= this;
-  uint ext_key_arg_cnt= external_key_arg_fields;
-  CACHE_FIELD *copy;
-  CACHE_FIELD **copy_ptr= blob_ptr;
-  while (ext_key_arg_cnt)
-  {
-    cache= cache->prev_cache;
-    for (tab= cache->join_tab-cache->tables; tab < cache->join_tab ; tab++)
-    { 
-      CACHE_FIELD *copy_end;
-      MY_BITMAP *key_read_set= &tab->table->tmp_set;
-      /* key_read_set contains the bitmap of tab's fields referenced by ref */ 
-      if (bitmap_is_clear_all(key_read_set))
-        continue;
-      copy_end= cache->field_descr+cache->fields;
-      for (copy= cache->field_descr+cache->flag_fields; copy < copy_end; copy++)
-      {
-        /*
-          (1) - when we store rowids for DuplicateWeedout, they have
-                copy->field==NULL
-        */
-        if (copy->field &&  // (1)
-            copy->field->table == tab->table &&
-            bitmap_is_set(key_read_set, copy->field->field_index))
-        {
-          *copy_ptr++= copy; 
-          ext_key_arg_cnt--;
-          if (!copy->referenced_field_no)
-          {
-            /* 
-              Register the referenced field 'copy': 
-              - set the offset number in copy->referenced_field_no,
-              - adjust the value of the flag 'with_length',
-              - adjust the values of 'pack_length' and 
-                of 'pack_length_with_blob_ptrs'.
-	    */
-            copy->referenced_field_no= ++cache->referenced_fields;
-            cache->with_length= TRUE;
-	    cache->pack_length+= cache->get_size_of_fld_offset();
-            cache->pack_length_with_blob_ptrs+= cache->get_size_of_fld_offset();
-          }        
-        }
-      }
-    } 
-  }
-  /* After this 'blob_ptr' shall not be be changed */ 
-  blob_ptr= copy_ptr;
-  
-  /* Now create local fields that are used to build ref for this key access */
-  copy= field_descr+flag_fields;
-  for (tab= join_tab-tables; tab < join_tab ; tab++)
-  {
-    length+= add_table_data_fields_to_join_cache(tab, &tab->table->tmp_set,
-                                                 &data_field_count, &copy,
-                                                 &data_field_ptr_count, 
-                                                 &copy_ptr);
-  }
 
-  use_emb_key= check_emb_key_usage();
+  create_key_arg_fields();
 
-  create_remaining_fields(FALSE);
+  create_remaining_fields();
 
   set_constants();
 
   if (alloc_buffer())
     DBUG_RETURN(1); 
-
-  reset(TRUE);
+  
+  reset(TRUE); 
 
   DBUG_RETURN(0);
-}  
+}
 
 
 /* 
   Check the possibility to read the access keys directly from the join buffer       
-
   SYNOPSIS
     check_emb_key_usage()
 
@@ -694,22 +1074,29 @@ int JOIN_CACHE_BKA::init()
     we still do not consider them embedded. In the future we'll expand the
     the class of keys which we identify as embedded.
 
-  RETURN
-    TRUE  - key values will be considered as embedded,
-    FALSE - otherwise.
+  NOTES
+    The function returns FALSE if no key is used to join the records
+    from join_tab.
+
+  RETURN VALUE
+    TRUE    key values will be considered as embedded,
+    FALSE   otherwise.
 */
 
-bool JOIN_CACHE_BKA::check_emb_key_usage()
+bool JOIN_CACHE::check_emb_key_usage()
 {
+
+  if (!is_key_access())
+    return FALSE;
+
   uint i;
   Item *item; 
   KEY_PART_INFO *key_part;
   CACHE_FIELD *copy;
   CACHE_FIELD *copy_end;
   uint len= 0;
-  TABLE *table= join_tab->table;
   TABLE_REF *ref= &join_tab->ref;
-  KEY *keyinfo= table->key_info+ref->key;
+  KEY *keyinfo= join_tab->get_keyinfo_by_key_no(ref->key);
 
   /* 
     If some of the key arguments are not from the local cache the key
@@ -801,110 +1188,6 @@ bool JOIN_CACHE_BKA::check_emb_key_usage()
 
 
 /* 
-  Calculate the increment of the MRR buffer for a record write       
-
-  SYNOPSIS
-    aux_buffer_incr()
-
-  DESCRIPTION
-    This implementation of the virtual function aux_buffer_incr determines
-    for how much the size of the MRR buffer should be increased when another
-    record is added to the cache.   
-
-  RETURN
-    the increment of the size of the MRR buffer for the next record
-*/
-
-uint JOIN_CACHE_BKA::aux_buffer_incr()
-{
-  uint incr= 0;
-  TABLE_REF *ref= &join_tab->ref;
-  TABLE *tab= join_tab->table;
-  uint rec_per_key= tab->key_info[ref->key].rec_per_key[ref->key_parts-1];
-  set_if_bigger(rec_per_key, 1);
-  if (records == 1)
-    incr=  ref->key_length + tab->file->ref_length;
-  incr+= tab->file->stats.mrr_length_per_rec * rec_per_key;
-  return incr; 
-}
-
-
-/*
-  Check if the record combination matches the index condition
-
-  SYNOPSIS
-    JOIN_CACHE_BKA::skip_index_tuple()
-      rseq             Value returned by bka_range_seq_init()
-      range_info       MRR range association data
-    
-  DESCRIPTION
-    This function is invoked from MRR implementation to check if an index
-    tuple matches the index condition. It is used in the case where the index
-    condition actually depends on both columns of the used index and columns
-    from previous tables.
-    
-    Accessing columns of the previous tables requires special handling with
-    BKA. The idea of BKA is to collect record combinations in a buffer and 
-    then do a batch of ref access lookups, i.e. by the time we're doing a
-    lookup its previous-records-combination is not in prev_table->record[0]
-    but somewhere in the join buffer.
-    
-    We need to get it from there back into prev_table(s)->record[0] before we
-    can evaluate the index condition, and that's why we need this function
-    instead of regular IndexConditionPushdown.
-
-  NOTE
-    Possible optimization:
-    Before we unpack the record from a previous table
-    check if this table is used in the condition.
-    If so then unpack the record otherwise skip the unpacking.
-    This should be done by a special virtual method
-    get_partial_record_by_pos().
-
-  RETURN
-    0    The record combination satisfies the index condition
-    1    Otherwise
-*/
-
-bool JOIN_CACHE_BKA::skip_index_tuple(range_seq_t rseq, char *range_info)
-{
-  DBUG_ENTER("JOIN_CACHE_BKA::skip_index_tuple");
-  JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq;
-  cache->get_record_by_pos((uchar*)range_info);
-  DBUG_RETURN(!join_tab->cache_idx_cond->val_int());
-}
-
-
-/*
-  Check if the record combination matches the index condition
-
-  SYNOPSIS
-    bka_skip_index_tuple()
-      rseq             Value returned by bka_range_seq_init()
-      range_info       MRR range association data
-    
-  DESCRIPTION
-    This is wrapper for JOIN_CACHE_BKA::skip_index_tuple method,
-    see comments there.
-
-  NOTE
-    This function is used as a RANGE_SEQ_IF::skip_index_tuple callback.
- 
-  RETURN
-    0    The record combination satisfies the index condition
-    1    Otherwise
-*/
-
-static 
-bool bka_skip_index_tuple(range_seq_t rseq, char *range_info)
-{
-  DBUG_ENTER("bka_skip_index_tuple");
-  JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq;
-  DBUG_RETURN(cache->skip_index_tuple(rseq, range_info));
-}
-
-
-/* 
   Write record fields and their required offsets into the join cache buffer
 
   SYNOPSIS
@@ -943,9 +1226,11 @@ bool bka_skip_index_tuple(range_seq_t rseq, char *range_info)
     The 'last_rec_blob_data_is_in_rec_buff' is set on if the blob data 
     remains in the record buffers and not copied to the join buffer. It may
     happen only to the blob data from the last record added into the cache.
-   
-    
-  RETURN
+    If on_precond is attached to join_tab and it is not evaluated to TRUE
+    then MATCH_IMPOSSIBLE is placed in the match flag field of the record
+    written into the join buffer.
+       
+  RETURN VALUE
     length of the written record data
 */
 
@@ -955,17 +1240,19 @@ uint JOIN_CACHE::write_record_data(uchar * link, bool *is_full)
   bool last_record;
   CACHE_FIELD *copy;
   CACHE_FIELD *copy_end;
+  uchar *flags_pos;
   uchar *cp= pos;
   uchar *init_pos= cp;
   uchar *rec_len_ptr= 0;
+  uint key_extra= extra_key_length();
  
   records++;  /* Increment the counter of records in the cache */
 
-  len= pack_length;
+  len= pack_length + key_extra;
 
   /* Make an adjustment for the size of the auxiliary buffer if there is any */
-  uint incr= aux_buffer_incr();
-  ulong rem= rem_space();
+  uint incr= aux_buffer_incr(records);
+  size_t rem= rem_space();
   aux_buff_size+= len+incr < rem ? incr : rem;
 
   /*
@@ -1001,7 +1288,7 @@ uint JOIN_CACHE::write_record_data(uchar * link, bool *is_full)
     This function is called only in the case when there is enough space left in
     the cache to store at least non-blob parts of the current record.
   */
-  last_record= (len+pack_length_with_blob_ptrs) > rem_space();
+  last_record= (len+pack_length_with_blob_ptrs+key_extra) > rem_space();
   
   /* 
     Save the position for the length of the record in the cache if it's needed.
@@ -1033,6 +1320,7 @@ uint JOIN_CACHE::write_record_data(uchar * link, bool *is_full)
 
   /* First put into the cache the values of all flag fields */
   copy_end= field_descr+flag_fields;
+  flags_pos= cp;
   for ( ; copy < copy_end; copy++)
   {
     memcpy(cp, copy->str, copy->length);
@@ -1045,8 +1333,7 @@ uint JOIN_CACHE::write_record_data(uchar * link, bool *is_full)
   {
     Field *field= copy->field;
     if (field && field->maybe_null() && field->is_null())
-    {
-      /* Do not copy a field if its value is null */
+    {    
       if (copy->referenced_field_no)
         copy->offset= 0;
       continue;              
@@ -1106,6 +1393,18 @@ uint JOIN_CACHE::write_record_data(uchar * link, bool *is_full)
 	cp+= len+2;
         break;
       }
+      case CACHE_ROWID:
+        if (!copy->length)
+	{
+          /*
+            This may happen only for ROWID fields of materialized
+            derived tables and views.
+	  */
+	  TABLE *table= (TABLE *) copy->str;
+          copy->str= table->file->ref;
+          copy->length= table->file->ref_length;
+        }
+        /* fall through */
       default:      
         /* Copy the entire image of the field from the record buffer */
 	memcpy(cp, copy->str, copy->length);
@@ -1135,6 +1434,19 @@ uint JOIN_CACHE::write_record_data(uchar * link, bool *is_full)
   last_rec_pos= curr_rec_pos; 
   end_pos= pos= cp;
   *is_full= last_record;
+
+  last_written_is_null_compl= 0;   
+  if (!join_tab->first_unmatched && join_tab->on_precond)
+  { 
+    join_tab->found= 0;
+    join_tab->not_null_compl= 1;
+    if (!join_tab->on_precond->val_int())
+    {
+      flags_pos[0]= MATCH_IMPOSSIBLE;     
+      last_written_is_null_compl= 1;
+    }
+  } 
+      
   return (uint) (cp-init_pos);
 }
 
@@ -1159,10 +1471,9 @@ uint JOIN_CACHE::write_record_data(uchar * link, bool *is_full)
     - the size of the auxiliary buffer is reset to 0,
     - the flag 'last_rec_blob_data_is_in_rec_buff' is set to 0.
     
-  RETURN
+  RETURN VALUE
     none
 */
-
 void JOIN_CACHE::reset(bool for_writing)
 {
   pos= buff;
@@ -1177,6 +1488,7 @@ void JOIN_CACHE::reset(bool for_writing)
   }
 }
 
+
 /* 
   Add a record into the join buffer: the default implementation
 
@@ -1191,7 +1503,7 @@ void JOIN_CACHE::reset(bool for_writing)
     The implementation assumes that the function get_curr_link() 
     will return exactly the pointer to this matched record.
 
-  RETURN
+  RETURN VALUE
     TRUE    if it has been decided that it should be the last record
             in the join buffer,
     FALSE   otherwise
@@ -1228,9 +1540,9 @@ bool JOIN_CACHE::put_record()
     point to the beginning of the first field of the record in the
     join buffer.    
 
-  RETURN
-    TRUE  - there are no more records to read from the join buffer
-    FALSE - otherwise
+  RETURN VALUE
+    TRUE    there are no more records to read from the join buffer
+    FALSE   otherwise
 */
 
 bool JOIN_CACHE::get_record()
@@ -1269,7 +1581,7 @@ bool JOIN_CACHE::get_record()
     from the the join buffers of the previous caches. The fields are read
     into the corresponding record buffers.
 
-  RETURN
+  RETURN VALUE
     none
 */
 
@@ -1288,7 +1600,7 @@ void JOIN_CACHE::get_record_by_pos(uchar *rec_ptr)
 
 
 /* 
-  Test the match flag from the referenced record: the default implementation
+  Get the match flag from the referenced record: the default implementation
 
   SYNOPSIS
     get_match_flag_by_pos()
@@ -1296,30 +1608,55 @@ void JOIN_CACHE::get_record_by_pos(uchar *rec_ptr)
 
   DESCRIPTION
     This default implementation of the virtual function get_match_flag_by_pos
-    test the match flag for the record pointed by the reference at the position
-    rec_ptr. If the match flag in placed one of the previous buffers the function
-    first reaches the linked record fields in this buffer.
+    gets the match flag for the record pointed by the reference at the position
+    rec_ptr. If the match flag is placed in one of the previous buffers the
+    function first reaches the linked record fields in this buffer.
 
-  RETURN
-    TRUE    if the match flag is set on
-    FALSE   otherwise
+  RETURN VALUE
+    match flag for the record at the position rec_ptr
 */
 
-bool JOIN_CACHE::get_match_flag_by_pos(uchar *rec_ptr)
+enum JOIN_CACHE::Match_flag JOIN_CACHE::get_match_flag_by_pos(uchar *rec_ptr)
 {
+  Match_flag match_fl= MATCH_NOT_FOUND;
   if (with_match_flag)
-    return test(*rec_ptr);
+  {
+    match_fl= (enum Match_flag) rec_ptr[0];
+    return match_fl;
+  }
   if (prev_cache)
   {
     uchar *prev_rec_ptr= prev_cache->get_rec_ref(rec_ptr);
     return prev_cache->get_match_flag_by_pos(prev_rec_ptr);
   } 
   DBUG_ASSERT(0);
-  return FALSE;
+  return match_fl;
 }
 
 
 /* 
+  Calculate the increment of the auxiliary buffer for a record write
+
+  SYNOPSIS
+    aux_buffer_incr()
+      recno   the number of the record the increment is to be calculated for
+
+  DESCRIPTION
+    This function calls the aux_buffer_incr method of the
+    companion member join_tab_scan to calculate the growth of the
+    auxiliary buffer when the recno-th record is added to the
+    join_buffer of this cache.
+
+  RETURN VALUE
+    the number of bytes in the increment 
+*/
+
+uint JOIN_CACHE::aux_buffer_incr(ulong recno)
+{ 
+  return join_tab_scan->aux_buffer_incr(recno);
+}
+
+/* 
   Read all flag and data fields of a record from the join buffer
 
   SYNOPSIS
@@ -1333,8 +1670,8 @@ bool JOIN_CACHE::get_match_flag_by_pos(uchar *rec_ptr)
     The function increments the value of 'pos' by the length of the
     read data. 
 
-  RETURN
-    (-1) - if there is no more records in the join buffer
+  RETURN VALUE
+    (-1)   if there is no more records in the join buffer
     length of the data read from the join buffer - otherwise
 */
 
@@ -1372,7 +1709,7 @@ uint JOIN_CACHE::read_all_record_fields()
     The function increments the value of 'pos' by the length of the
     read data. 
 
-  RETURN
+  RETURN VALUE
     length of the data read from the join buffer
 */
 
@@ -1381,6 +1718,12 @@ uint JOIN_CACHE::read_flag_fields()
   uchar *init_pos= pos;
   CACHE_FIELD *copy= field_descr;
   CACHE_FIELD *copy_end= copy+flag_fields;
+  if (with_match_flag)
+  {
+    copy->str[0]= test((Match_flag) pos[0] == MATCH_FOUND);
+    pos+= copy->length;
+    copy++;    
+  } 
   for ( ; copy < copy_end; copy++)
   {
     memcpy(copy->str, pos, copy->length);
@@ -1407,7 +1750,7 @@ uint JOIN_CACHE::read_flag_fields()
     The function increments the value of 'pos' by the length of the
     read data. 
 
-  RETURN
+  RETURN VALUE
     length of the data read from the join buffer
 */
 
@@ -1485,9 +1828,15 @@ uint JOIN_CACHE::read_record_field(CACHE_FIELD *copy, bool blob_in_rec_buff)
     If the value of *len is 0 then the function sets it to the total
     length of the record fields including possible trailing offset
     values. Otherwise *len is supposed to provide this value that
-    has been obtained earlier.  
+    has been obtained earlier. 
 
-  RETURN
+  NOTE
+    If the value of the referenced field is null then the offset
+    for the value is set to 0. If the value of a field can be null
+    then the value of flag_fields is always positive. So the offset
+    for any non-null value cannot be 0 in this case. 
+
+  RETURN VALUE
     TRUE   'copy' points to a data descriptor of this join cache
     FALSE  otherwise
 */
@@ -1514,14 +1863,21 @@ bool JOIN_CACHE::read_referenced_field(CACHE_FIELD *copy,
                          size_of_fld_ofs*
                          (referenced_fields+1-copy->referenced_field_no));  
   bool is_null= FALSE;
+  Field *field= copy->field;
   if (offset == 0 && flag_fields)
     is_null= TRUE;
   if (is_null)
-    copy->field->set_null();
+  {
+    field->set_null();
+    if (!field->real_maybe_null())
+      field->table->null_row= 1;
+  }
   else
   {
     uchar *save_pos= pos;
-    copy->field->set_notnull(); 
+    field->set_notnull(); 
+    if (!field->real_maybe_null())
+      field->table->null_row= 0;
     pos= rec_ptr+offset;
     read_record_field(copy, blob_data_is_in_rec_buff(rec_ptr));
     pos= save_pos;
@@ -1531,30 +1887,69 @@ bool JOIN_CACHE::read_referenced_field(CACHE_FIELD *copy,
    
 
 /* 
-  Skip record from join buffer if its match flag is on: default implementation
+  Skip record from join buffer if it's already matched: default implementation
 
   SYNOPSIS
-    skip_record_if_match()
+    skip_if_matched()
 
   DESCRIPTION
-    This default implementation of the virtual function skip_record_if_match
-    skips the next record from the join buffer if its  match flag is set on.
-    If the record is skipped the value of 'pos' is set to points to the position
+    This default implementation of the virtual function skip_if_matched
+    skips the next record from the join buffer if its match flag is set to
+    MATCH_FOUND.
+    If the record is skipped the value of 'pos' is set to point to the position
     right after the record.
 
-  RETURN
-    TRUE  - the match flag is on and the record has been skipped
-    FALSE - the match flag is off 
+  RETURN VALUE
+    TRUE   the match flag is set to MATCH_FOUND and the record has been skipped
+    FALSE  otherwise
 */
 
-bool JOIN_CACHE::skip_record_if_match()
+bool JOIN_CACHE::skip_if_matched()
 {
   DBUG_ASSERT(with_length);
   uint offset= size_of_rec_len;
   if (prev_cache)
     offset+= prev_cache->get_size_of_rec_offset();
-  /* Check whether the match flag is on */
-  if (get_match_flag_by_pos(pos+offset))
+  /* Check whether the match flag is MATCH_FOUND */
+  if (get_match_flag_by_pos(pos+offset) == MATCH_FOUND)
+  {
+    pos+= size_of_rec_len + get_rec_length(pos);
+    return TRUE;
+  }
+  return FALSE;
+}      
+
+
+/* 
+  Skip record from join buffer if the match isn't needed: default implementation
+
+  SYNOPSIS
+    skip_if_not_needed_match()
+
+  DESCRIPTION
+    This default implementation of the virtual function skip_if_not_needed_match
+    skips the next record from the join buffer if its match flag is not 
+    MATCH_NOT_FOUND, and, either its value is MATCH_FOUND and join_tab is the
+    first inner table of an inner join, or, its value is MATCH_IMPOSSIBLE
+    and join_tab is the first inner table of an outer join.
+    If the record is skipped the value of 'pos' is set to point to the position
+    right after the record.
+
+  RETURN VALUE
+    TRUE    the record has to be skipped
+    FALSE   otherwise 
+*/
+
+bool JOIN_CACHE::skip_if_not_needed_match()
+{
+  DBUG_ASSERT(with_length);
+  enum Match_flag match_fl;
+  uint offset= size_of_rec_len;
+  if (prev_cache)
+    offset+= prev_cache->get_size_of_rec_offset();
+
+  if ((match_fl= get_match_flag_by_pos(pos+offset)) != MATCH_NOT_FOUND &&
+      (join_tab->check_only_first_match() == (match_fl == MATCH_FOUND)) )
   {
     pos+= size_of_rec_len + get_rec_length(pos);
     return TRUE;
@@ -1618,7 +2013,7 @@ void JOIN_CACHE::restore_last_record()
     that have matches, after which null complementing extension for all
     unmatched records from the join buffer are generated.  
       
-  RETURN
+  RETURN VALUE
     return one of enum_nested_loop_state, except NESTED_LOOP_NO_MORE_ROWS.
 */ 
 
@@ -1712,16 +2107,16 @@ finish:
 }
 
 
-/*
-  Using BNL find matches from the next table for records from the join buffer   
+/*   
+  Find matches from the next table for records from the join buffer 
 
   SYNOPSIS
     join_matching_records()
       skip_last    do not look for matches for the last partial join record 
 
   DESCRIPTION
-    The function retrieves all rows of the join_tab table and check whether
-    they match partial join records from the join buffer. If a match is found
+    The function retrieves rows of the join_tab table and checks whether they
+    match partial join records from the join buffer. If a match is found
     the function will call the sub_select function trying to look for matches
     for the remaining join operations.
     This function currently is called only from the function join_records.    
@@ -1730,27 +2125,46 @@ finish:
     the future processing in the caller function.
 
   NOTES
+    If employed by BNL or BNLH join algorithms the function performs a full
+    scan of join_tab for each refill of the join buffer. If BKA or BKAH
+    algorithms are used then the function iterates only over those records
+    from join_tab that can be accessed by keys built over records in the join
+    buffer. To apply a proper method of iteration the function just calls
+    virtual iterator methods (open, next, close) of the member join_tab_scan.
+    The member can be either of the JOIN_TAB_SCAN or JOIN_TAB_SCAN_MRR type.
+    The class JOIN_TAB_SCAN provides the iterator methods for BNL/BNLH join
+    algorithms. The class JOIN_TAB_SCAN_MRR provides the iterator methods
+    for BKA/BKAH join algorithms.
+    When the function looks for records from the join buffer that would
+    match a record from join_tab it iterates either over all records in
+    the buffer or only over selected records. If BNL join operation is
+    performed all records are checked for the match. If BNLH or BKAH
+    algorithm is employed to join join_tab then the function looks only
+    through the records with the same join key as the record from join_tab.
+    With the BKA join algorithm only one record from the join buffer is checked
+    for a match for any record from join_tab. To iterate over the candidates
+    for a match the virtual function get_next_candidate_for_match is used,
+    while the virtual function prepare_look_for_matches is called to prepare
+    for such iteration process.
+
+  NOTES
     The function produces all matching extensions for the records in the 
-    join buffer following the path of the Blocked Nested Loops algorithm. 
+    join buffer following the path of the employed blocked algorithm. 
     When an outer join operation is performed all unmatched records from
     the join buffer must be extended by null values. The function 
     'join_null_complements' serves this purpose.  
       
-  RETURN
-    return one of enum_nested_loop_state.
+  RETURN VALUE
+    return one of enum_nested_loop_state
 */ 
 
-enum_nested_loop_state JOIN_CACHE_BNL::join_matching_records(bool skip_last)
+enum_nested_loop_state JOIN_CACHE::join_matching_records(bool skip_last)
 {
-  uint cnt;
   int error;
-  JOIN_TAB *tab;
-  READ_RECORD *info;
   enum_nested_loop_state rc= NESTED_LOOP_OK;
-  bool check_only_first_match= join_tab->check_only_first_match();
-  SQL_SELECT *select= join_tab->cache_select;
-
   join_tab->table->null_row= 0;
+  bool check_only_first_match= join_tab->check_only_first_match();
+  bool outer_join_first_inner= join_tab->is_first_inner_for_outer_join();
 
   /* Return at once if there are no records in the join buffer */
   if (!records)     
@@ -1772,25 +2186,21 @@ enum_nested_loop_state JOIN_CACHE_BNL::join_matching_records(bool skip_last)
     join_tab->select->quick= 0;
   }
 
-  for (tab= join->join_tab; tab != join_tab ; tab++)
-  {
-    tab->status= tab->table->status;
-    tab->table->status= 0;
-  }
+  if ((rc= join_tab_execution_startup(join_tab)) < 0)
+    goto finish2;
 
-  /* Start retrieving all records of the joined table */
-  if ((error= join_init_read_record(join_tab))) 
-  {
-    rc= error < 0 ? NESTED_LOOP_NO_MORE_ROWS: NESTED_LOOP_ERROR;
+  /* Prepare to retrieve all records of the joined table */
+  if ((error= join_tab_scan->open()))
+  { 
+    /* 
+      TODO: if we get here, we will assert in net_send_statement(). Add test
+      coverage and fix.
+    */
     goto finish;
   }
-
-  info= &join_tab->read_record;
-  do
+  
+  while (!(error= join_tab_scan->next()))   
   {
-    if (join_tab->keep_current_rowid)
-      join_tab->table->file->position(join_tab->table->record[0]);
-
     if (join->thd->killed)
     {
       /* The user has aborted the execution of the query */
@@ -1798,52 +2208,44 @@ enum_nested_loop_state JOIN_CACHE_BNL::join_matching_records(bool skip_last)
       rc= NESTED_LOOP_KILLED;
       goto finish; 
     }
-    int err= 0;
 
-     if (rc == NESTED_LOOP_OK)
-       update_virtual_fields(join->thd, join_tab->table);
- 
-    /* 
-      Do not look for matches if the last read record of the joined table
-      does not meet the conditions that have been pushed to this table
-    */
-    if (rc == NESTED_LOOP_OK && 
-        (!select || (err= select->skip_record(join->thd)) != 0))
-    {
-      if (err < 0)
-        return NESTED_LOOP_ERROR;
-      rc= NESTED_LOOP_OK;
-
-      /* Prepare to read records from the join buffer */
-      reset(FALSE);
+    if (join_tab->keep_current_rowid)
+      join_tab->table->file->position(join_tab->table->record[0]);
+    
+    /* Prepare to read matching candidates from the join buffer */
+    if (prepare_look_for_matches(skip_last))
+      continue;
 
-      /* Read each record from the join buffer and look for matches */
-      for (cnt= records - test(skip_last) ; cnt; cnt--)
-      { 
-        /* 
-          If only the first match is needed and it has been already found for
-          the next record read from the join buffer then the record is skipped.
-	*/
-        if (!check_only_first_match || !skip_record_if_match())
-        {
-	  get_record();
-          rc= generate_full_extensions(get_curr_rec());
-          if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
-	    goto finish;   
-        }
+    uchar *rec_ptr;
+    /* Read each possible candidate from the buffer and look for matches */
+    while ((rec_ptr= get_next_candidate_for_match()))
+    { 
+      /* 
+        If only the first match is needed, and, it has been already found for
+        the next record read from the join buffer, then the record is skipped.
+        Also those records that must be null complemented are not considered
+        as candidates for matches.
+      */
+      if ((!check_only_first_match && !outer_join_first_inner) ||
+          !skip_next_candidate_for_match(rec_ptr))
+      {
+	read_next_candidate_for_match(rec_ptr);
+        rc= generate_full_extensions(rec_ptr);
+        if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
+	  goto finish;   
       }
     }
-  } while (!(error= info->read_record(info)));
+  }
 
-  if (error > 0)				// Fatal error
-    rc= NESTED_LOOP_ERROR; 
-finish:                  
-  for (tab= join->join_tab; tab != join_tab ; tab++)
-    tab->table->status= tab->status;
+finish: 
+  if (error)                 
+    rc= error < 0 ? NESTED_LOOP_NO_MORE_ROWS: NESTED_LOOP_ERROR;
+finish2:    
+  join_tab_scan->close();
   return rc;
 }
 
-     
+
 /*
   Set match flag for a record in join buffer if it has not been set yet    
 
@@ -1863,7 +2265,7 @@ finish:
     The function assumes that the match flag for any record in any cache
     is placed in the first byte occupied by the record fields. 
 
-  RETURN
+  RETURN VALUE
     TRUE   the match flag is set by this call for the first time
     FALSE  the match flag has been set before this call
 */ 
@@ -1892,9 +2294,9 @@ bool JOIN_CACHE::set_match_flag_if_none(JOIN_TAB *first_inner,
     DBUG_ASSERT(cache);
     rec_ptr= cache->get_rec_ref(rec_ptr);
   } 
-  if (rec_ptr[0] == 0)
+  if ((Match_flag) rec_ptr[0] != MATCH_FOUND)
   {
-    rec_ptr[0]= 1;
+    rec_ptr[0]= MATCH_FOUND;
     first_inner->found= 1;
     return TRUE;  
   }
@@ -1915,7 +2317,7 @@ bool JOIN_CACHE::set_match_flag_if_none(JOIN_TAB *first_inner,
     case the function calls the join_tab->next_select method to generate
     all full extension for this partial join match.
       
-  RETURN
+  RETURN VALUE
     return one of enum_nested_loop_state.
 */ 
 
@@ -1970,7 +2372,7 @@ enum_nested_loop_state JOIN_CACHE::generate_full_extensions(uchar *rec_ptr)
     Setting the match flag on can trigger re-evaluation of pushdown conditions
     for the record when join_tab is the last inner table of an outer join.
       
-  RETURN
+  RETURN VALUE
     TRUE   there is a match
     FALSE  there is no match
 */ 
@@ -1978,7 +2380,7 @@ enum_nested_loop_state JOIN_CACHE::generate_full_extensions(uchar *rec_ptr)
 inline bool JOIN_CACHE::check_match(uchar *rec_ptr)
 {
   /* Check whether pushdown conditions are satisfied */
-  if (join_tab->select && join_tab->select->skip_record(join->thd) < 1)
+  if (join_tab->select && join_tab->select->skip_record(join->thd) <= 0)
     return FALSE;
 
   if (!join_tab->is_last_inner_table())
@@ -2008,7 +2410,7 @@ inline bool JOIN_CACHE::check_match(uchar *rec_ptr)
     */      
     for (JOIN_TAB *tab= first_inner; tab <= join_tab; tab++)
     {
-      if (tab->select && tab->select->skip_record(join->thd) < 1)
+      if (tab->select && tab->select->skip_record(join->thd) <= 0)
         return FALSE;
     }
   }
@@ -2039,18 +2441,17 @@ inline bool JOIN_CACHE::check_match(uchar *rec_ptr)
 
   NOTES
     The same implementation of the virtual method join_null_complements
-    is used for JOIN_CACHE_BNL and JOIN_CACHE_BKA.
+    is used for BNL/BNLH/BKA/BKAH join algorithms.
       
-  RETURN
+  RETURN VALUE
     return one of enum_nested_loop_state.
 */ 
 
 enum_nested_loop_state JOIN_CACHE::join_null_complements(bool skip_last)
 {
-  uint cnt; 
+  ulonglong cnt; 
   enum_nested_loop_state rc= NESTED_LOOP_OK;
   bool is_first_inner= join_tab == join_tab->first_unmatched;
-  bool is_last_inner= join_tab == join_tab->first_unmatched->last_inner;
  
   /* Return at once if there are no records in the join buffer */
   if (!records)
@@ -2071,40 +2472,16 @@ enum_nested_loop_state JOIN_CACHE::join_null_complements(bool skip_last)
       goto finish;
     }
     /* Just skip the whole record if a match for it has been already found */
-    if (!is_first_inner || !skip_record_if_match())
+    if (!is_first_inner || !skip_if_matched())
     {
       get_record();
       /* The outer row is complemented by nulls for each inner table */
       restore_record(join_tab->table, s->default_values);
       mark_as_null_row(join_tab->table);  
-      /* Check all pushdown conditions attached to the inner table */
-      join_tab->first_unmatched->found= 1;
-      if (join_tab->select && join_tab->select->skip_record(join->thd) < 1)
-        continue;
-      if (is_last_inner)
-      { 
-        JOIN_TAB *first_upper= join_tab->first_unmatched->first_upper;
-        while (first_upper && first_upper->last_inner == join_tab)
-        {
-          set_match_flag_if_none(first_upper, get_curr_rec());
-          for (JOIN_TAB* tab= first_upper; tab <= join_tab; tab++)
-          {
-            if (tab->select && tab->select->skip_record(join->thd) < 1)
-              goto next;
-          }
-          first_upper= first_upper->first_upper;
-        }
-      }
-      /* Find all matches for the remaining join tables */
-      rc= (*join_tab->next_select)(join, join_tab+1, 0);
+      rc= generate_full_extensions(get_curr_rec());
       if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
-      {
-        reset(TRUE);
         goto finish;
-      }
     }
-  next:
-    ;
   }
 
 finish:
@@ -2113,475 +2490,182 @@ finish:
 
 
 /*
-  Initialize retrieval of range sequence for BKA algorithm
-    
-  SYNOPSIS
-    bka_range_seq_init()
-     init_params   pointer to the BKA join cache object
-     n_ranges      the number of ranges obtained 
-     flags         combination of HA_MRR_SINGLE_POINT, HA_MRR_FIXED_KEY
+  Add a comment on the join algorithm employed by the join cache 
 
-  DESCRIPTION
-    The function interprets init_param as a pointer to a JOIN_CACHE_BKA
-    object. The function prepares for an iteration over the join keys
-    built for all records from the cache join buffer.
-
-  NOTE
-    This function are used only as a callback function.    
-
-  RETURN
-    init_param value that is to be used as a parameter of bka_range_seq_next()
-*/    
-
-static 
-range_seq_t bka_range_seq_init(void *init_param, uint n_ranges, uint flags)
-{
-  DBUG_ENTER("bka_range_seq_init");
-  JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) init_param;
-  cache->reset(0);
-  DBUG_RETURN((range_seq_t) init_param);
-}
-
-
-/*
-  Get the key over the next record from the join buffer used by BKA  
-    
   SYNOPSIS
-    bka_range_seq_next()
-      seq    the value returned by  bka_range_seq_init
-      range  OUT reference to the next range
-  
-  DESCRIPTION
-    The function interprets seq as a pointer to a JOIN_CACHE_BKA
-    object. The function returns a pointer to the range descriptor
-    for the key built over the next record from the join buffer.
+    print_explain_comment()
+      str  string to add the comment on the employed join algorithm to
 
-  NOTE
-    This function are used only as a callback function.
-   
-  RETURN
-    0   ok, the range structure filled with info about the next key
-    1   no more ranges
-*/    
-
-static 
-uint bka_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
-{
-  DBUG_ENTER("bka_range_seq_next");
-  JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq;
-  TABLE_REF *ref= &cache->join_tab->ref;
-  key_range *start_key= &range->start_key;
-  if ((start_key->length= cache->get_next_key((uchar **) &start_key->key)))
-  {
-    start_key->keypart_map= (1 << ref->key_parts) - 1;
-    start_key->flag= HA_READ_KEY_EXACT;
-    range->end_key= *start_key;
-    range->end_key.flag= HA_READ_AFTER_KEY;
-    range->ptr= (char *) cache->get_curr_rec();
-    range->range_flag= EQ_RANGE;
-    DBUG_RETURN(0);
-  } 
-  DBUG_RETURN(1);
-}
-
-
-/*
-  Check whether range_info orders to skip the next record from BKA buffer
-
-  SYNOPSIS
-    bka_range_seq_skip_record()
-      seq              value returned by bka_range_seq_init()
-      range_info       information about the next range
-      rowid [NOT USED] rowid of the record to be checked 
-
-    
   DESCRIPTION
-    The function interprets seq as a pointer to a JOIN_CACHE_BKA object.
-    The function interprets seq as a pointer to the JOIN_CACHE_BKA_UNIQUE
-    object. The function returns TRUE if the record with this range_info
-    is to be filtered out from the stream of records returned by
-    multi_range_read_next(). 
+    This function adds info on the type of the used join buffer (flat or
+    incremental) and on the type of the employed join algorithm (BNL,
+    BNLH, BKA or BKAH) to the end of the string str.
 
-  NOTE
-    This function are used only as a callback function.
-
-  RETURN
-    1    record with this range_info is to be filtered out from the stream
-         of records returned by multi_range_read_next()
-    0    the record is to be left in the stream
+  RETURN VALUE
+    none
 */ 
 
-static 
-bool bka_range_seq_skip_record(range_seq_t rseq, char *range_info, uchar *rowid)
+void JOIN_CACHE::print_explain_comment(String *str)
 {
-  DBUG_ENTER("bka_range_seq_skip_record");
-  JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq;
-  bool res= cache->get_match_flag_by_pos((uchar *) range_info);
-  DBUG_RETURN(res);
-}
-
-/*
-  Using BKA find matches from the next table for records from the join buffer   
-
-  SYNOPSIS
-    join_matching_records()
-      skip_last    do not look for matches for the last partial join record 
+  str->append(STRING_WITH_LEN(" ("));
+  const char *buffer_type= prev_cache ? "incremental" : "flat";
+  str->append(buffer_type);
+  str->append(STRING_WITH_LEN(", "));
+  
+  const char *join_alg="";
+  switch (get_join_alg()) {
+  case BNL_JOIN_ALG:
+    join_alg= "BNL";
+    break;
+  case BNLH_JOIN_ALG:
+    join_alg= "BNLH";
+    break;
+  case BKA_JOIN_ALG:
+    join_alg= "BKA";
+    break;
+  case BKAH_JOIN_ALG:
+    join_alg= "BKAH";
+    break;
+  default:
+    DBUG_ASSERT(0);
+  }
 
-  DESCRIPTION
-    This function can be used only when the table join_tab can be accessed
-    by keys built over the fields of previous join tables.
-    The function retrieves all partial join records from the join buffer and
-    for each of them builds the key value to access join_tab, performs index
-    look-up with this key and selects matching records yielded by this look-up
-    If a match is found the function will call the sub_select function trying
-    to look for matches for the remaining join operations.
-    This function currently is called only from the function join_records.    
-    It's assumed that this function is always called with the skip_last 
-    parameter equal to false.
+  str->append(join_alg);
+  str->append(STRING_WITH_LEN(" join"));
+  str->append(STRING_WITH_LEN(")"));
+}
 
-  NOTES
-    The function produces all matching extensions for the records in the 
-    join buffer following the path of the Batched Key Access algorithm. 
-    When an outer join operation is performed all unmatched records from
-    the join buffer must be extended by null values. The function 
-    join_null_complements serves this purpose.
-    The Batched Key Access algorithm assumes that key accesses are batched.
-    In other words it assumes that, first, either keys themselves or the
-    corresponding rowids (primary keys) are accumulated in a buffer, then
-    data rows from  join_tab are fetched for all of them. When a row is
-    fetched it is always returned with a reference to the key by which it
-    has been accessed.
-    When key values are batched we can save on the number of the server 
-    requests for index lookups. For the remote engines, like NDB cluster, it
-    essentially reduces the number of round trips between the server and
-    the engine when performing a join operation. 
-    When the rowids for the keys are batched we can optimize the order
-    in what we fetch the data for this rowids. The performance benefits of
-    this optimization can be significant for such engines as MyISAM, InnoDB.
-    What is exactly batched are hidden behind implementations of
-    MRR handler interface that is supposed to be appropriately chosen
-    for each engine. If for a engine no specific implementation of the MRR
-    interface is supllied then the default implementation is used. This
-    implementation actually follows the path of Nested Loops Join algorithm.
-    In this case BKA join surely will demonstrate a worse performance than
-    NL join. 
-            
-  RETURN
-    return one of enum_nested_loop_state
-*/
 
-enum_nested_loop_state JOIN_CACHE_BKA::join_matching_records(bool skip_last)
+static void add_mrr_explain_info(String *str, uint mrr_mode, handler *file)
 {
-  int error;
-  handler *file= join_tab->table->file;
-  enum_nested_loop_state rc= NESTED_LOOP_OK;
-  uchar *rec_ptr= 0;
-  bool check_only_first_match= join_tab->check_only_first_match();
-
-  /* Set functions to iterate over keys in the join buffer */
-
-  RANGE_SEQ_IF seq_funcs= { bka_range_seq_init, 
-                            bka_range_seq_next,
-                            check_only_first_match ?
-                              bka_range_seq_skip_record : 0,
-                            join_tab->cache_idx_cond ?
-                              bka_skip_index_tuple : 0 };
-
-  /* The value of skip_last must be always FALSE when this function is called */
-  DBUG_ASSERT(!skip_last);
-
-  /* Return at once if there are no records in the join buffer */
-  if (!records)
-    return NESTED_LOOP_OK;  
-                   
-  rc= init_join_matching_records(&seq_funcs, records);
-  if (rc != NESTED_LOOP_OK)
-    goto finish;
-
-  while (!(error= file->multi_range_read_next((char **) &rec_ptr)))
+  char mrr_str_buf[128]={0};
+  int len;
+  len= file->multi_range_read_explain_info(mrr_mode, mrr_str_buf,
+                                           sizeof(mrr_str_buf));
+  if (len > 0)
   {
-    if (join->thd->killed)
-    {
-      /* The user has aborted the execution of the query */
-      join->thd->send_kill_message();
-      rc= NESTED_LOOP_KILLED; 
-      goto finish;
-    }
-    if (join_tab->keep_current_rowid)
-      join_tab->table->file->position(join_tab->table->record[0]);
-    /* 
-      If only the first match is needed and it has been already found 
-      for the associated partial join record then the returned candidate
-      is discarded.
-    */
-    if (rc == NESTED_LOOP_OK &&
-        (!check_only_first_match || !get_match_flag_by_pos(rec_ptr)))
-    {
-      get_record_by_pos(rec_ptr);
-      update_virtual_fields(join->thd, join_tab->table);
-      rc= generate_full_extensions(rec_ptr);
-      if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
-	goto finish;   
-    }
+    str->append(STRING_WITH_LEN("; "));
+    str->append(mrr_str_buf, len);
   }
-
-  if (error > 0 && error != HA_ERR_END_OF_FILE)	   
-    return NESTED_LOOP_ERROR; 
-finish:                  
-  return end_join_matching_records(rc);
 }
 
 
-
-/* 
-  Prepare to search for records that match records from the join buffer
-
-  SYNOPSIS
-    init_join_matching_records()
-      seq_funcs    structure of range sequence interface
-      ranges       number of keys/ranges in the sequence
-
-  DESCRIPTION
-    This function calls the multi_range_read_init function to set up
-    the BKA process of generating the keys from the records in the join
-    buffer and looking for matching records from the table to be joined.
-    The function passes as a parameter a structure of functions that
-    implement the range sequence interface. This interface is used to
-    enumerate all generated keys and optionally to filter the matching
-    records returned by the multi_range_read_next calls from the
-    intended invocation of the join_matching_records method. The
-    multi_range_read_init function also receives the parameters for
-    MRR buffer to be used and flags specifying the mode in which
-    this buffer will be functioning.
-    The number of keys in the sequence expected by multi_range_read_init
-    is passed through the parameter ranges.  
-    
-  RETURN
-    return one of enum_nested_loop_state
-*/
-
-enum_nested_loop_state 
-JOIN_CACHE_BKA::init_join_matching_records(RANGE_SEQ_IF *seq_funcs, uint ranges)
+void JOIN_CACHE_BKA::print_explain_comment(String *str)
 {
-  int error;
-  handler *file= join_tab->table->file;
-  enum_nested_loop_state rc= NESTED_LOOP_OK;
-
-  join_tab->table->null_row= 0;
-
-
-  /* Dynamic range access is never used with BKA */
-  DBUG_ASSERT(join_tab->use_quick != 2);
-
-  for (JOIN_TAB *tab =join->join_tab; tab != join_tab ; tab++)
-  {
-    tab->status= tab->table->status;
-    tab->table->status= 0;
-  }
+  JOIN_CACHE::print_explain_comment(str); 
+  add_mrr_explain_info(str, mrr_mode, join_tab->table->file);
+}
 
-  init_mrr_buff();
 
-  /* 
-    Prepare to iterate over keys from the join buffer and to get
-    matching candidates obtained with MMR handler functions.
-  */ 
-  if (!file->inited)
-    file->ha_index_init(join_tab->ref.key, 1);
-  if ((error= file->multi_range_read_init(seq_funcs, (void*) this, ranges,
-					  mrr_mode, &mrr_buff)))
-    rc= error < 0 ? NESTED_LOOP_NO_MORE_ROWS: NESTED_LOOP_ERROR;
-  
-  return rc;
+void JOIN_CACHE_BKAH::print_explain_comment(String *str)
+{
+  JOIN_CACHE::print_explain_comment(str); 
+  add_mrr_explain_info(str, mrr_mode, join_tab->table->file);
 }
 
 
 /* 
-  Finish searching for records that match records from the join buffer
+  Initialize a hashed join cache       
 
   SYNOPSIS
-    end_join_matching_records()
-      rc      return code passed by the join_matching_records function
+    init()
 
   DESCRIPTION
-    This function perform final actions on searching for all matches for
-    the records from the join buffer and building all full join extensions
-    of the records with these matches. 
-    
-  RETURN
-    return code rc passed to the function as a parameter
+    The function initializes the cache structure with a hash table in it.
+    The hash table will be used to store key values for the records from
+    the join buffer.
+    The function allocates memory for the join buffer and for descriptors of
+    the record fields stored in the buffer.
+    The function also initializes a hash table for record keys within the join
+    buffer space.
+
+  NOTES
+    The function is supposed to be called by the init methods of the classes 
+    derived from JOIN_CACHE_HASHED.
+  
+  RETURN VALUE
+    0   initialization with buffer allocations has succeeded
+    1   otherwise
 */
 
-enum_nested_loop_state 
-JOIN_CACHE_BKA::end_join_matching_records(enum_nested_loop_state rc)
+int JOIN_CACHE_HASHED::init()
 {
-  for (JOIN_TAB *tab=join->join_tab; tab != join_tab ; tab++)
-    tab->table->status= tab->status;
-  return rc;  
-}
+  int rc= 0;
+  TABLE_REF *ref= &join_tab->ref;
 
+  DBUG_ENTER("JOIN_CACHE_HASHED::init");
 
-/* 
-  Get the key built over the next record from BKA join buffer
-
-  SYNOPSIS
-    get_next_key()
-      key    pointer to the buffer where the key value is to be placed
+  hash_table= 0;
+  key_entries= 0;
 
-  DESCRIPTION
-    The function reads key fields from the current record in the join buffer.
-    and builds the key value out of these fields that will be used to access
-    the 'join_tab' table. Some of key fields may belong to previous caches.
-    They are accessed via record references to the record parts stored in the
-    previous join buffers. The other key fields always are placed right after
-    the flag fields of the record.
-    If the key is embedded, which means that its value can be read directly
-    from the join buffer, then *key is set to the beginning of the key in
-    this buffer. Otherwise the key is built in the join_tab->ref->key_buff.
-    The function returns the length of the key if it succeeds ro read it.
-    If is assumed that the functions starts reading at the position of
-    the record length which is provided for each records in a BKA cache.
-    After the key is built the 'pos' value points to the first position after
-    the current record. 
-    The function returns 0 if the initial position is after the beginning
-    of the record fields for last record from the join buffer. 
+  key_length= ref->key_length;
 
-  RETURN
-    length of the key value - if the starting value of 'pos' points to
-    the position before the fields for the last record,
-    0 - otherwise.     
-*/
+  if ((rc= JOIN_CACHE::init()))
+    DBUG_RETURN (rc);
 
-uint JOIN_CACHE_BKA::get_next_key(uchar ** key)
-{
-  uint len;
-  uint32 rec_len;
-  uchar *init_pos;
-  JOIN_CACHE *cache;
-  
-  if (pos > last_rec_pos || !records)
-    return 0;
+  if (!(key_buff= (uchar*) sql_alloc(key_length)))
+    DBUG_RETURN(1);
 
-  /* Any record in a BKA cache is prepended with its length */
-  DBUG_ASSERT(with_length);
-   
-  /* Read the length of the record */
-  rec_len= get_rec_length(pos);
-  pos+= size_of_rec_len; 
-  init_pos= pos;
+  /* Take into account a reference to the next record in the key chain */
+  pack_length+= get_size_of_rec_offset(); 
+  pack_length_with_blob_ptrs+= get_size_of_rec_offset();
 
-  /* Read a reference to the previous cache if any */
-  if (prev_cache)
-    pos+= prev_cache->get_size_of_rec_offset();
+  ref_key_info= join_tab->get_keyinfo_by_key_no(join_tab->ref.key);
+  ref_used_key_parts= join_tab->ref.key_parts;
 
-  curr_rec_pos= pos;
+  hash_func= &JOIN_CACHE_HASHED::get_hash_idx_simple;
+  hash_cmp_func= &JOIN_CACHE_HASHED::equal_keys_simple;
 
-  /* Read all flag fields of the record */
-  read_flag_fields();
- 
-  if (use_emb_key)
-  {
-    /* An embedded key is taken directly from the join buffer */
-    *key= pos;
-    len= emb_key_length;
-  }
-  else
+  KEY_PART_INFO *key_part= ref_key_info->key_part;
+  KEY_PART_INFO *key_part_end= key_part+ref_used_key_parts;
+  for ( ; key_part < key_part_end; key_part++)
   {
-    /* Read key arguments from previous caches if there are any such fields */
-    if (external_key_arg_fields)
+    if (!key_part->field->eq_cmp_as_binary())
     {
-      uchar *rec_ptr= curr_rec_pos;
-      uint key_arg_count= external_key_arg_fields;
-      CACHE_FIELD **copy_ptr= blob_ptr-key_arg_count;
-      for (cache= prev_cache; key_arg_count; cache= cache->prev_cache)
-      { 
-        uint len= 0;
-        DBUG_ASSERT(cache);
-        rec_ptr= cache->get_rec_ref(rec_ptr);
-        while (!cache->referenced_fields)
-        {
-          cache= cache->prev_cache;
-          DBUG_ASSERT(cache);
-          rec_ptr= cache->get_rec_ref(rec_ptr);
-        }
-        while (key_arg_count && 
-               cache->read_referenced_field(*copy_ptr, rec_ptr, &len))
-        {
-          copy_ptr++;
-          --key_arg_count;
-        }
-      }
+      hash_func= &JOIN_CACHE_HASHED::get_hash_idx_complex;
+      hash_cmp_func= &JOIN_CACHE_HASHED::equal_keys_complex;
+      break;
     }
-    
-    /* 
-      Read the other key arguments from the current record. The fields for
-      these arguments are always first in the sequence of the record's fields.
-    */     
-    CACHE_FIELD *copy= field_descr+flag_fields;
-    CACHE_FIELD *copy_end= copy+local_key_arg_fields;
-    bool blob_in_rec_buff= blob_data_is_in_rec_buff(curr_rec_pos);
-    for ( ; copy < copy_end; copy++)
-      read_record_field(copy, blob_in_rec_buff);
-    
-    /* Build the key over the fields read into the record buffers */ 
-    TABLE_REF *ref= &join_tab->ref;
-    cp_buffer_from_ref(join->thd, join_tab->table, ref);
-    *key= ref->key_buff;
-    len= ref->key_length;
   }
+      
+  init_hash_table();
 
-  pos= init_pos+rec_len;
+  rec_fields_offset= get_size_of_rec_offset()+get_size_of_rec_length()+
+                     (prev_cache ? prev_cache->get_size_of_rec_offset() : 0);
 
-  return len;
-} 
+  data_fields_offset= 0;
+  if (use_emb_key)
+  {
+    CACHE_FIELD *copy= field_descr;
+    CACHE_FIELD *copy_end= copy+flag_fields;
+    for ( ; copy < copy_end; copy++)
+      data_fields_offset+= copy->length;
+  } 
+
+  DBUG_RETURN(rc);
+}
 
 
 /* 
-  Initialize a BKA_UNIQUE cache       
+  Initialize the hash table of a hashed join cache 
 
   SYNOPSIS
-    init()
+    init_hash_table()
 
   DESCRIPTION
-    The function initializes the cache structure. It supposed to be called
-    right after a constructor for the JOIN_CACHE_BKA_UNIQUE.
-    The function allocates memory for the join buffer and for descriptors of
-    the record fields stored in the buffer.
-    The function also estimates the number of hash table entries in the hash
-    table to be used and initializes this hash table.
+    The function estimates the number of hash table entries in the hash
+    table to be used and initializes this hash table within the join buffer
+    space.
 
-  NOTES
-    The code of this function should have been included into the constructor
-    code itself. However the new operator for the class JOIN_CACHE_BKA_UNIQUE
-    would never fail while memory allocation for the join buffer is not 
-    absolutely unlikely to fail. That's why this memory allocation has to be
-    placed in a separate function that is called in a couple with a cache 
-    constructor.
-    It is quite natural to put almost all other constructor actions into
-    this function.     
-  
-  RETURN
-    0   initialization with buffer allocations has been succeeded
-    1   otherwise
+  RETURN VALUE
+    Currently the function always returns 0;
 */
 
-int JOIN_CACHE_BKA_UNIQUE::init()
+int JOIN_CACHE_HASHED::init_hash_table()
 {
-  int rc= 0;
-  TABLE_REF *ref= &join_tab->ref;
-  
-  DBUG_ENTER("JOIN_CACHE_BKA_UNIQUE::init");
-
   hash_table= 0;
   key_entries= 0;
 
-  if ((rc= JOIN_CACHE_BKA::init()))
-    DBUG_RETURN (rc);
-
-  key_length= ref->key_length;
-
-  /* Take into account a reference to the next record in the key chain */
-  pack_length+= get_size_of_rec_offset(); 
- 
   /* Calculate the minimal possible value of size_of_key_ofs greater than 1 */
   uint max_size_of_key_ofs= max(2, get_size_of_rec_offset());  
   for (size_of_key_ofs= 2;
@@ -2592,7 +2676,10 @@ int JOIN_CACHE_BKA_UNIQUE::init()
                       size_of_key_ofs +          // reference to the next key 
                       (use_emb_key ?  get_size_of_rec_offset() : key_length);
 
-    uint n= buff_size / (pack_length+key_entry_length+size_of_key_ofs);
+    ulong space_per_rec= avg_record_length +
+                         avg_aux_buffer_incr +
+                         key_entry_length+size_of_key_ofs;
+    uint n= buff_size / space_per_rec;
 
     /*
       TODO: Make a better estimate for this upper bound of
@@ -2602,6 +2689,7 @@ int JOIN_CACHE_BKA_UNIQUE::init()
                              key_entry_length+size_of_key_ofs);
 
     hash_entries= (uint) (n / 0.7);
+    set_if_bigger(hash_entries, 1);
     
     if (offset_size(max_n*key_entry_length) <=
         size_of_key_ofs)
@@ -2613,27 +2701,75 @@ int JOIN_CACHE_BKA_UNIQUE::init()
   cleanup_hash_table();
   curr_key_entry= hash_table;
 
-  pack_length+= key_entry_length;
-  pack_length_with_blob_ptrs+= get_size_of_rec_offset() + key_entry_length;
+  return 0;
+}
 
-  rec_fields_offset= get_size_of_rec_offset()+get_size_of_rec_length()+
-                     (prev_cache ? prev_cache->get_size_of_rec_offset() : 0);
 
-  data_fields_offset= 0;
-  if (use_emb_key)
-  {
-    CACHE_FIELD *copy= field_descr;
-    CACHE_FIELD *copy_end= copy+flag_fields;
-    for ( ; copy < copy_end; copy++)
-      data_fields_offset+= copy->length;
-  } 
+/*
+  Reallocate the join buffer of a hashed join cache
+ 
+  SYNOPSIS
+    realloc_buffer()
 
-  DBUG_RETURN(rc);
+  DESCRIPTION
+    The function reallocates the join buffer of the hashed join cache.
+    After this it initializes a hash table within the buffer space and
+    resets the join cache for writing.
+
+  NOTES
+    The function assumes that buff_size contains the new value for the join
+    buffer size.  
+
+  RETURN VALUE
+    0   if the buffer has been successfully reallocated
+    1   otherwise
+*/
+
+int JOIN_CACHE_HASHED::realloc_buffer()
+{
+  int rc;
+  free();
+  rc= test(!(buff= (uchar*) my_malloc(buff_size, MYF(0))));
+  init_hash_table();
+  reset(TRUE);
+  return rc;   	
 }
 
 
+/*
+  Get maximum size of the additional space per record used for record keys
+
+  SYNOPSIS
+    get_max_key_addon_space_per_record()
+  
+  DESCRIPTION
+    The function returns the size of the space occupied by one key entry
+    and one hash table entry.
+
+  RETURN VALUE
+    maximum size of the additional space per record that is used to store
+    record keys in the hash table
+*/
+
+uint JOIN_CACHE_HASHED::get_max_key_addon_space_per_record()
+{
+  ulong len;
+  TABLE_REF *ref= &join_tab->ref;
+  /* 
+    The total number of hash entries in the hash tables is bounded by
+    ceiling(N/0.7) where N is the maximum number of records in the buffer.
+    That's why the multiplier 2 is used in the formula below. 
+  */ 
+  len= (use_emb_key ?  get_size_of_rec_offset() : ref->key_length) +
+        size_of_rec_ofs +    // size of the key chain header
+        size_of_rec_ofs +    // >= size of the reference to the next key 
+        2*size_of_rec_ofs;   // >= 2*( size of hash table entry)
+  return len; 
+}    
+
+
 /* 
-  Reset the JOIN_CACHE_BKA_UNIQUE  buffer for reading/writing
+  Reset the buffer of a hashed join cache for reading/writing
 
   SYNOPSIS
     reset()
@@ -2641,15 +2777,15 @@ int JOIN_CACHE_BKA_UNIQUE::init()
 
   DESCRIPTION
     This implementation of the virtual function reset() resets the join buffer
-    of the JOIN_CACHE_BKA_UNIQUE class for reading or writing.
+    of the JOIN_CACHE_HASHED class for reading or writing.
     Additionally to what the default implementation does this function
     cleans up the hash table allocated within the buffer.  
     
-  RETURN
+  RETURN VALUE
     none
 */
  
-void JOIN_CACHE_BKA_UNIQUE::reset(bool for_writing)
+void JOIN_CACHE_HASHED::reset(bool for_writing)
 {
   this->JOIN_CACHE::reset(for_writing);
   if (for_writing && hash_table)
@@ -2657,15 +2793,16 @@ void JOIN_CACHE_BKA_UNIQUE::reset(bool for_writing)
   curr_key_entry= hash_table;
 }
 
+
 /* 
-  Add a record into the JOIN_CACHE_BKA_UNIQUE buffer
+  Add a record into the buffer of a hashed join cache
 
   SYNOPSIS
     put_record()
 
   DESCRIPTION
     This implementation of the virtual function put_record writes the next
-    matching record into the join buffer of the JOIN_CACHE_BKA_UNIQUE class.
+    matching record into the join buffer of the JOIN_CACHE_HASHED class.
     Additionally to what the default implementation does this function
     performs the following. 
     It extracts from the record the key value used in lookups for matching
@@ -2676,14 +2813,16 @@ void JOIN_CACHE_BKA_UNIQUE::reset(bool for_writing)
     is attached to the key entry. The key value is either placed in the hash 
     element added for the key or, if the use_emb_key flag is set, remains in
     the record from the partial join.
+    If the match flag field of a record contains MATCH_IMPOSSIBLE the key is
+    not created for this record. 
     
-  RETURN
+  RETURN VALUE
     TRUE    if it has been decided that it should be the last record
             in the join buffer,
     FALSE   otherwise
 */
 
-bool JOIN_CACHE_BKA_UNIQUE::put_record()
+bool JOIN_CACHE_HASHED::put_record()
 {
   bool is_full;
   uchar *key;
@@ -2699,6 +2838,9 @@ bool JOIN_CACHE_BKA_UNIQUE::put_record()
     link= prev_cache->get_curr_rec_link();
   write_record_data(link, &is_full);
 
+  if (last_written_is_null_compl)
+    return is_full;    
+
   if (use_emb_key)
     key= get_curr_emb_key();
   else
@@ -2752,6 +2894,7 @@ bool JOIN_CACHE_BKA_UNIQUE::put_record()
       memcpy(cp, key, key_len);
     }
     last_key_entry= cp;
+    DBUG_ASSERT(last_key_entry >= end_pos);
     /* Increment the counter of key_entries in the hash table */ 
     key_entries++;
   }  
@@ -2760,7 +2903,7 @@ bool JOIN_CACHE_BKA_UNIQUE::put_record()
 
 
 /*
-  Read the next record from the JOIN_CACHE_BKA_UNIQUE buffer
+  Read the next record from the buffer of a hashed join cache
 
   SYNOPSIS
     get_record()
@@ -2770,12 +2913,12 @@ bool JOIN_CACHE_BKA_UNIQUE::put_record()
     function get_record does this implementation skips the link element
     used to connect the records with the same key into a chain. 
 
-  RETURN
-    TRUE  - there are no more records to read from the join buffer
-    FALSE - otherwise
+  RETURN VALUE
+    TRUE    there are no more records to read from the join buffer
+    FALSE   otherwise
 */
 
-bool JOIN_CACHE_BKA_UNIQUE::get_record()
+bool JOIN_CACHE_HASHED::get_record()
 { 
   pos+= get_size_of_rec_offset();
   return this->JOIN_CACHE::get_record();
@@ -2783,26 +2926,55 @@ bool JOIN_CACHE_BKA_UNIQUE::get_record()
 
 
 /* 
-  Skip record from the JOIN_CACHE_BKA_UNIQUE join buffer if its match flag is on
+  Skip record from a hashed join buffer if its match flag is set to MATCH_FOUND
 
   SYNOPSIS
-    skip_record_if_match()
+    skip_if_matched()
 
   DESCRIPTION
-    This implementation of the virtual function skip_record_if_match does
+    This implementation of the virtual function skip_if_matched does
     the same as the default implementation does, but it takes into account
     the link element used to connect the records with the same key into a chain. 
 
-  RETURN
-    TRUE  - the match flag is on and the record has been skipped
-    FALSE - the match flag is off 
+  RETURN VALUE
+    TRUE    the match flag is MATCH_FOUND  and the record has been skipped
+    FALSE   otherwise 
 */
 
-bool JOIN_CACHE_BKA_UNIQUE::skip_record_if_match()
+bool JOIN_CACHE_HASHED::skip_if_matched()
 {
   uchar *save_pos= pos;
   pos+= get_size_of_rec_offset();
-  if (!this->JOIN_CACHE::skip_record_if_match())
+  if (!this->JOIN_CACHE::skip_if_matched())
+  {
+    pos= save_pos;
+    return FALSE;
+  }
+  return TRUE;
+}
+
+
+/* 
+  Skip record from a hashed join buffer if its match flag dictates to do so
+
+  SYNOPSIS
+    skip_if_not_needed_match()
+
+  DESCRIPTION
+    This implementation of the virtual function skip_if_not_needed_match does
+    the same as the default implementation does, but it takes into account
+    the link element used to connect the records with the same key into a chain. 
+
+  RETURN VALUE
+    TRUE    the match flag dictates to skip the record
+    FALSE   the match flag is off 
+*/
+
+bool JOIN_CACHE_HASHED::skip_if_not_needed_match()
+{
+  uchar *save_pos= pos;
+  pos+= get_size_of_rec_offset();
+  if (!this->JOIN_CACHE::skip_if_not_needed_match())
   {
     pos= save_pos;
     return FALSE;
@@ -2831,16 +3003,16 @@ bool JOIN_CACHE_BKA_UNIQUE::skip_record_if_match()
     Otherwise the function returns the position where the reference to the
     newly created hash element for the given key is to be added.  
 
-  RETURN
-    TRUE  - the key is found in the hash table
-    FALSE - otherwise
+  RETURN VALUE
+    TRUE    the key is found in the hash table
+    FALSE   otherwise
 */
 
-bool JOIN_CACHE_BKA_UNIQUE::key_search(uchar *key, uint key_len,
-                                       uchar **key_ref_ptr) 
+bool JOIN_CACHE_HASHED::key_search(uchar *key, uint key_len,
+                                   uchar **key_ref_ptr) 
 {
   bool is_found= FALSE;
-  uint idx= get_hash_idx(key, key_length);
+  uint idx= (this->*hash_func)(key, key_length);
   uchar *ref_ptr= hash_table+size_of_key_ofs*idx;
   while (!is_null_key_ref(ref_ptr))
   {
@@ -2849,7 +3021,7 @@ bool JOIN_CACHE_BKA_UNIQUE::key_search(uchar *key, uint key_len,
     next_key= use_emb_key ? get_emb_key(ref_ptr-get_size_of_rec_offset()) :
                             ref_ptr-key_length;
 
-    if (memcmp(next_key, key, key_len) == 0)
+    if ((this->*hash_cmp_func)(next_key, key, key_len))
     {
       is_found= TRUE;
       break;
@@ -2861,22 +3033,24 @@ bool JOIN_CACHE_BKA_UNIQUE::key_search(uchar *key, uint key_len,
 
 
 /* 
-  Calclulate hash value for a key in the hash table of the join buffer
+  Hash function that considers a key in the hash table as byte array
 
   SYNOPSIS
-    get_hash_idx()
+    get_hash_idx_simple()
       key             pointer to the key value
       key_len         key value length
       
   DESCRIPTION
     The function calculates an index of the hash entry in the hash table
-    of the join buffer for the given key  
+    of the join buffer for the given key. It considers the key just as
+    a sequence of bytes of the length key_len.
 
-  RETURN
-    the calculated index of the hash entry for the given key.  
+  RETURN VALUE
+    the calculated index of the hash entry for the given key  
 */
 
-uint JOIN_CACHE_BKA_UNIQUE::get_hash_idx(uchar* key, uint key_len)
+inline
+uint JOIN_CACHE_HASHED::get_hash_idx_simple(uchar* key, uint key_len)
 {
   ulong nr= 1;
   ulong nr2= 4;
@@ -2892,6 +3066,93 @@ uint JOIN_CACHE_BKA_UNIQUE::get_hash_idx(uchar* key, uint key_len)
 
 
 /* 
+  Hash function that takes into account collations of the components of the key  
+
+  SYNOPSIS
+    get_hash_idx_complex()
+      key             pointer to the key value
+      key_len         key value length
+      
+  DESCRIPTION
+    The function calculates an index of the hash entry in the hash table
+    of the join buffer for the given key. It takes into account that the
+    components of the key may be of a varchar type with different collations.
+    The function guarantees the same hash value for any two equal
+    keys, even when they differ as byte sequences.
+    The function takes the info about the components of the key, their
+    types and used collations from the class member ref_key_info containing
+    a pointer to the descriptor of the index that can be used for the join
+    operation.
+
+  RETURN VALUE
+    the calculated index of the hash entry for the given key  
+*/
+
+inline
+uint JOIN_CACHE_HASHED::get_hash_idx_complex(uchar *key, uint key_len)
+{
+  return 
+    (uint) (key_hashnr(ref_key_info, ref_used_key_parts, key) % hash_entries);
+}
+
+
+/* 
+  Compare two key entries in the hash table as sequence of bytes
+
+  SYNOPSIS
+    equal_keys_simple()
+      key1            pointer to the first key entry
+      key2            pointer to the second key entry 
+      key_len         the length of the key values
+      
+  DESCRIPTION
+    The function compares two key entries in the hash table key1 and key2
+    as two sequences of bytes of the length key_len
+
+  RETURN VALUE
+    TRUE       key1 coincides with key2
+    FALSE      otherwise
+*/
+
+inline
+bool JOIN_CACHE_HASHED::equal_keys_simple(uchar *key1, uchar *key2,
+                                          uint key_len)
+{
+  return memcmp(key1, key2, key_len) == 0;
+}
+
+
+/* 
+  Compare two key entries taking into account the used collation
+
+  SYNOPSIS
+    equal_keys_complex()
+      key1            pointer to the first key entry
+      key2            pointer to the second key entry 
+      key_len         the length of the key values
+      
+  DESCRIPTION
+    The function checks whether two key entries in the hash table
+    key1 and key2 are equal as, possibly, compound keys of a certain
+    structure whose components may be of a varchar type and may
+    employ different collations.
+    The descriptor of the key structure is taken from the class
+    member ref_key_info.
+
+  RETURN VALUE
+    TRUE       key1 is equal to key2
+    FALSE      otherwise
+*/
+
+inline
+bool JOIN_CACHE_HASHED::equal_keys_complex(uchar *key1, uchar *key2,
+                                          uint key_len)
+{
+  return key_buf_cmp(ref_key_info, ref_used_key_parts, key1, key2) == 0;
+}
+
+
+/* 
   Clean up the hash table of the join buffer
 
   SYNOPSIS
@@ -2903,11 +3164,11 @@ uint JOIN_CACHE_BKA_UNIQUE::get_hash_idx(uchar* key, uint key_len)
     The function cleans up the hash table in the join buffer removing all
     hash elements from the table. 
 
-  RETURN
+  RETURN VALUE
     none  
 */
 
-void JOIN_CACHE_BKA_UNIQUE:: cleanup_hash_table()
+void JOIN_CACHE_HASHED:: cleanup_hash_table()
 {
   last_key_entry= hash_table;
   bzero(hash_table, (buff+buff_size)-hash_table);
@@ -2916,64 +3177,726 @@ void JOIN_CACHE_BKA_UNIQUE:: cleanup_hash_table()
 
 
 /*
-  Initialize retrieval of range sequence for BKA_UNIQUE algorithm
+  Check whether all records in a key chain have their match flags set on   
+
+  SYNOPSIS
+    check_all_match_flags_for_key()
+      key_chain_ptr     
+
+  DESCRIPTION
+    This function retrieves records in the given circular chain and checks
+    whether their match flags are set on. The parameter key_chain_ptr shall
+    point to the position in the join buffer storing the reference to the
+    last element of this chain. 
+            
+  RETURN VALUE
+    TRUE   if each retrieved record has its match flag set to MATCH_FOUND
+    FALSE  otherwise 
+*/
+
+bool JOIN_CACHE_HASHED::check_all_match_flags_for_key(uchar *key_chain_ptr)
+{
+  uchar *const chain_tail= get_next_rec_ref(key_chain_ptr);
+  uchar *link= chain_tail;
+  for (;;)
+  {
+    /* Advance along the circular chain; the walk ends back at the tail */
+    link= get_next_rec_ref(link);
+    if (get_match_flag_by_pos(link+rec_fields_offset) != MATCH_FOUND)
+      return FALSE;
+    if (link == chain_tail)
+      return TRUE;
+  }
+}
+  
+
+/* 
+  Get the next key built for the records from the buffer of a hashed join cache
+
+  SYNOPSIS
+    get_next_key()
+      key    pointer to the buffer where the key value is to be placed
+
+  DESCRIPTION
+    The function reads the next key value stored in the hash table of the
+    join buffer. Depending on the value of the use_emb_key flag of the
+    join cache the value is read either from the table itself or from
+    the record field where it occurs. 
+
+  RETURN VALUE
+    length of the key value - if the starting value of 'curr_key_entry' refers
+    to the position after that referred by the value of 'last_key_entry',
+    0 - otherwise.
+*/
+
+uint JOIN_CACHE_HASHED::get_next_key(uchar ** key)
+{
+  if (curr_key_entry == last_key_entry)
+    return 0;
+  /* Step the cursor back by one key entry */
+  curr_key_entry-= key_entry_length;
+  if (use_emb_key)
+    *key= get_emb_key(curr_key_entry);
+  else
+    *key= curr_key_entry;
+  DBUG_ASSERT(buff <= *key && *key < hash_table);
+  return key_length;
+}
+
+
+/* 
+  Initiate an iteration process over records in the joined table
+
+  SYNOPSIS
+    open()
+
+  DESCRIPTION
+    The function initiates the process of iteration over records from the 
+    joined table recurrently performed by the BNL/BNLH join algorithm.
+
+  RETURN VALUE   
+    0            the initiation is a success 
+    error code   otherwise     
+*/
+
+int JOIN_TAB_SCAN::open()
+{
+  is_first_record= TRUE;      /* makes the first next() skip read_record() */
+  save_or_restore_used_tabs(join_tab, FALSE);
+  return join_init_read_record(join_tab);
+}
+
+
+/* 
+  Read the next record that can match while scanning the joined table
+
+  SYNOPSIS
+    next()
+
+  DESCRIPTION
+    The function reads the next record from the joined table that can
+    match some records in the buffer of the join cache 'cache'. To do
+    this the function calls the function that scans table records and
+    looks for the next one that meets the condition pushed to the
+    joined table join_tab.
+
+  NOTES
+    The function catches the signal that kills the query.
+
+  RETURN VALUE   
+    0            the next record exists and has been successfully read 
+    error code   otherwise     
+*/
+
+int JOIN_TAB_SCAN::next()
+{
+  READ_RECORD *info= &join_tab->read_record;
+  SQL_SELECT *select= join_tab->cache_select;
+  int err= 0;
+  /* The first call after open() does not advance the scan itself */
+  if (!is_first_record)
+    err= info->read_record(info);
+  is_first_record= FALSE;
+  for (;;)
+  {
+    if (err)
+      return err;
+    update_virtual_fields(join->thd, join_tab->table);
+    if (!select)
+      return err;
+    int skip_rc= select->skip_record(join->thd);
+    if (skip_rc > 0)
+      return err;
+    /* skip_record() gave 0 or a negative value for the current record */
+    if (join->thd->killed || skip_rc < 0)
+      return 1;
+    /* Try the next record of the joined table */
+    err= info->read_record(info);
+  }
+}
+
+
+/*
+  Walk back in join order from join_tab-1, stopping at the first tab of the
+  sequence or at a tab with tab->cache!=NULL, and move status values between
+  each visited tab (and its bush children) and its table.
+
+  @param save TRUE   copy tab->status back into tab->table->status
+              FALSE  keep tab->table->status in tab->status and reset it to 0
+*/
+
+static void save_or_restore_used_tabs(JOIN_TAB *join_tab, bool save)
+{
+  JOIN_TAB *first= join_tab->bush_root_tab?
+                     join_tab->bush_root_tab->bush_children->start :
+                     join_tab->join->join_tab + join_tab->join->const_tables;
+  for (JOIN_TAB *tab= join_tab-1; tab != first && !tab->cache; tab--)
+  {
+    if (tab->bush_children)
+    {
+      for (JOIN_TAB *child= tab->bush_children->start;
+           child != tab->bush_children->end;
+           child++)
+      {
+        if (save)
+          child->table->status= child->status;
+        else
+        {
+          /* Each child keeps its own status, mirroring the branch above */
+          child->status= child->table->status;
+          child->table->status= 0;
+        }
+      }
+    }
+    if (save)
+      tab->table->status= tab->status;
+    else
+    {
+      tab->status= tab->table->status;
+      tab->table->status= 0;
+    }
+  }
+}
+
+
+/* 
+  Perform finalizing actions for a scan over the table records
+
+  SYNOPSIS
+    close()
+
+  DESCRIPTION
+    The function performs the necessary restoring actions after
+    the table scan over the joined table has been finished.
+
+  RETURN VALUE   
+    none      
+*/
+
+void JOIN_TAB_SCAN::close()
+{
+  save_or_restore_used_tabs(join_tab, /* save */ TRUE);
+}
+
+
+/*
+  Prepare to iterate over the BNL join cache buffer to look for matches 
+
+  SYNOPSIS
+    prepare_look_for_matches()
+      skip_last   <-> ignore the last record in the buffer
+
+  DESCRIPTION
+    The function prepares the join cache for an iteration over the
+    records in the join buffer. The iteration is performed when looking
+    for matches for the record from the joined table join_tab that 
+    has been placed into the record buffer of the joined table.
+    If the value of the parameter skip_last is TRUE then the last
+    record from the join buffer is ignored.
+    The function initializes the counter of the records that have been
+    not iterated over yet.
+    
+  RETURN VALUE   
+    TRUE    there are no records in the buffer to iterate over 
+    FALSE   otherwise
+*/
+    
+bool JOIN_CACHE_BNL::prepare_look_for_matches(bool skip_last)
+{
+  if (records == 0)
+    return TRUE;
+  reset(FALSE);
+  rem_records= skip_last ? records-1 : records;  /* maybe drop the last one */
+  return !rem_records;
+}
+
+
+/*
+  Get next record from the BNL join cache buffer when looking for matches 
+
+  SYNOPSIS
+    get_next_candidate_for_match
+
+  DESCRIPTION
+    This method is used for iterations over the records from the join
+    cache buffer when looking for matches for records from join_tab.
+    The method performs the necessary preparations to read the next record
+    from the join buffer into the record buffer by the method
+    read_next_candidate_for_match, or, to skip the next record from the join 
+    buffer by the method skip_next_candidate_for_match.
+    This implementation of the virtual method get_next_candidate_for_match
+    just  decrements the counter of the records that are to be iterated over
+    and returns the current value of the cursor 'pos' as the position of 
+    the record to be processed. 
+    
+  RETURN VALUE    
+    pointer to the position right after the prefix of the current record
+    in the join buffer if there is another record to iterate over,
+    0 - otherwise.  
+*/
+
+uchar *JOIN_CACHE_BNL::get_next_candidate_for_match()
+{
+  if (rem_records == 0)
+    return 0;                           /* nothing left to iterate over */
+  --rem_records;
+  return pos+base_prefix_length;        /* position right after the prefix */
+}
+
+
+/*
+  Check whether the matching record from the BNL cache is to be skipped 
+
+  SYNOPSIS
+    skip_next_candidate_for_match
+    rec_ptr  pointer to the position in the join buffer right after the prefix 
+             of the current record
+
+  DESCRIPTION
+    This implementation of the virtual function just calls the
+    method skip_if_not_needed_match to check whether the record referenced by
+    rec_ptr has its match flag set either to MATCH_FOUND and join_tab is the
+    first inner table of a semi-join, or it's set to MATCH_IMPOSSIBLE and
+    join_tab is the first inner table of an outer join.
+    If so, the function just skips this record setting the value of the
+    cursor 'pos' to the position right after it.
+
+  RETURN VALUE    
+    TRUE   the record referenced by rec_ptr has been skipped
+    FALSE  otherwise  
+*/
+
+bool JOIN_CACHE_BNL::skip_next_candidate_for_match(uchar *rec_ptr)
+{
+  pos= rec_ptr - base_prefix_length;    /* step back over the record prefix */
+  return skip_if_not_needed_match();
+}
+
+
+/*
+  Read next record from the BNL join cache buffer when looking for matches 
+
+  SYNOPSIS
+    read_next_candidate_for_match
+    rec_ptr  pointer to the position in the join buffer right after the prefix
+             of the current record.
+
+  DESCRIPTION
+    This implementation of the virtual method read_next_candidate_for_match
+    calls the method get_record to read the record referenced by rec_ptr from
+    the join buffer into the record buffer. If this record refers to the
+    fields in the other join buffers the call of get_record ensures that
+    these fields are read into the corresponding record buffers as well.
+    This function is supposed to be called after a successful call of
+    the method get_next_candidate_for_match.
+    
+  RETURN VALUE   
+    none
+*/
+
+void JOIN_CACHE_BNL::read_next_candidate_for_match(uchar *rec_ptr)
+{
+  pos= rec_ptr - base_prefix_length;    /* step back over the record prefix */
+  get_record();
+}
+
+
+/*
+  Initialize the BNL join cache 
+
+  SYNOPSIS
+    init
+
+  DESCRIPTION
+    The function initializes the cache structure. It is supposed to be called
+    right after a constructor for the JOIN_CACHE_BNL.
+
+  NOTES
+    The function first constructs a companion object of the type JOIN_TAB_SCAN,
+    then it calls the init method of the parent class.
+    
+  RETURN VALUE  
+    0   initialization with buffer allocations has succeeded
+    1   otherwise
+*/
+
+int JOIN_CACHE_BNL::init()
+{
+  DBUG_ENTER("JOIN_CACHE_BNL::init");
+  /* Create the companion scan object first, then run the base class init */
+  join_tab_scan= new JOIN_TAB_SCAN(join, join_tab);
+  if (!join_tab_scan)
+    DBUG_RETURN(1);
+  DBUG_RETURN(JOIN_CACHE::init());
+}
+
+
+/*
+  Get the chain of records from buffer matching the current candidate for join
+
+  SYNOPSIS
+    get_matching_chain_by_join_key()
+
+  DESCRIPTION
+    This function first builds a join key for the record of join_tab that
+    currently is in the join buffer for this table. Then it looks for
+    the key entry with this key in the hash table of the join cache.
+    If such a key entry is found the function returns the pointer to
+    the head of the chain of records in the join_buffer that match this
+    key.
+
+  RETURN VALUE
+    The pointer to the corresponding circular list of records if
+    the key entry with the join key is found, 0 - otherwise.
+*/  
+
+uchar *JOIN_CACHE_BNLH::get_matching_chain_by_join_key()
+{
+  uchar *key_entry;
+  KEY *keyinfo= join_tab->get_keyinfo_by_key_no(join_tab->ref.key);
+
+  /* Copy the join key of the current join_tab record into key_buff */
+  key_copy(key_buff, join_tab->table->record[0], keyinfo, key_length, TRUE);
+  /* Probe the hash table of the join buffer with this key value */
+  if (!key_search(key_buff, key_length, &key_entry))
+    return 0;                           /* no entry with this key value */
+  /* Skip get_size_of_key_offset() bytes of the entry that was found */
+  return key_entry+get_size_of_key_offset();
+}
+
+
+/*
+  Prepare to iterate over the BNLH join cache buffer to look for matches 
+
+  SYNOPSIS
+    prepare_look_for_matches()
+      skip_last   <-> ignore the last record in the buffer
+
+  DESCRIPTION
+    The function prepares the join cache for an iteration over the
+    records in the join buffer. The iteration is performed when looking
+    for matches for the record from the joined table join_tab that 
+    has been placed into the record buffer of the joined table.
+    If the value of the parameter skip_last is TRUE then the last
+    record from the join buffer is ignored.
+    The function builds the hashed key from the join fields of join_tab
+    and uses this key to look in the hash table of the join cache for
+    the chain of matching records in the join buffer. If it finds
+    such a chain it sets the member last_matching_rec_ref_ptr to point to the
+    last link of the chain while setting next_matching_rec_ref_ptr to 0.
+    
+  RETURN VALUE    
+    TRUE    there are no matching records in the buffer to iterate over 
+    FALSE   otherwise
+*/
+    
+bool JOIN_CACHE_BNLH::prepare_look_for_matches(bool skip_last)
+{
+  next_matching_rec_ref_ptr= 0;
+  last_matching_rec_ref_ptr= 0;
+  uchar *chain= get_matching_chain_by_join_key();
+  if (chain)
+    last_matching_rec_ref_ptr= get_next_rec_ref(chain);
+  return chain == 0;
+}
+
+
+/*
+  Get next record from the BNLH join cache buffer when looking for matches 
+
+  SYNOPSIS
+    get_next_candidate_for_match
+
+  DESCRIPTION
+    This method is used for iterations over the records from the join
+    cache buffer when looking for matches for records from join_tab.
+    The method performs the necessary preparations to read the next record
+    from the join buffer into the record buffer by the method
+    read_next_candidate_for_match, or, to skip the next record from the join 
+    buffer by the method skip_next_candidate_for_match.    
+    This implementation of the virtual method moves to the next record
+    in the chain of all records from the join buffer that are to be
+    equi-joined with the current record from join_tab.
+    
+  RETURN VALUE   
+    pointer to the beginning of the record fields in the join buffer
+    if there is another record to iterate over, 0 - otherwise.
+*/
+
+uchar *JOIN_CACHE_BNLH::get_next_candidate_for_match()
+{
+  if (next_matching_rec_ref_ptr == last_matching_rec_ref_ptr)
+    return 0;
+  uchar *from= next_matching_rec_ref_ptr ? next_matching_rec_ref_ptr :
+                                           last_matching_rec_ref_ptr;
+  next_matching_rec_ref_ptr= get_next_rec_ref(from);
+  return next_matching_rec_ref_ptr+rec_fields_offset;
+}
+
+
+/*
+  Check whether the matching record from the BNLH cache is to be skipped 
+
+  SYNOPSIS
+    skip_next_candidate_for_match
+    rec_ptr  pointer to the position in the join buffer right after 
+             the previous record
+
+  DESCRIPTION
+    This implementation of the virtual function just calls the
+    method get_match_flag_by_pos to check whether the record referenced
+    by rec_ptr has its match flag set to MATCH_FOUND.
+
+  RETURN VALUE    
+    TRUE   the record referenced by rec_ptr has its match flag set to 
+           MATCH_FOUND
+    FALSE  otherwise  
+*/
+
+bool JOIN_CACHE_BNLH::skip_next_candidate_for_match(uchar *rec_ptr)
+{
+  return join_tab->check_only_first_match() ?
+           get_match_flag_by_pos(rec_ptr) == MATCH_FOUND : FALSE;
+}
+
+
+/*
+  Read next record from the BNLH join cache buffer when looking for matches 
+
+  SYNOPSIS
+    read_next_candidate_for_match
+    rec_ptr  pointer to the position in the join buffer right after 
+             the previous record
+
+  DESCRIPTION
+    This implementation of the virtual method read_next_candidate_for_match
+    calls the method get_record_by_pos to read the record referenced by rec_ptr
+    from the join buffer into the record buffer. If this record refers to
+    fields in the other join buffers the call of get_record_by_pos ensures that
+    these fields are read into the corresponding record buffers as well.
+    This function is supposed to be called after a successful call of
+    the method get_next_candidate_for_match.
+    
+  RETURN VALUE   
+    none
+*/
+
+void JOIN_CACHE_BNLH::read_next_candidate_for_match(uchar *rec_ptr)
+{
+  get_record_by_pos(rec_ptr);           /* load the record at rec_ptr */
+}
+
+
+/*
+  Initialize the BNLH join cache 
+
+  SYNOPSIS
+    init
+
+  DESCRIPTION
+    The function initializes the cache structure. It is supposed to be called
+    right after a constructor for the JOIN_CACHE_BNLH.
+
+  NOTES
+    The function first constructs a companion object of the type JOIN_TAB_SCAN,
+    then it calls the init method of the parent class.
+    
+  RETURN VALUE  
+    0   initialization with buffer allocations has succeeded
+    1   otherwise
+*/
+
+int JOIN_CACHE_BNLH::init()
+{
+  DBUG_ENTER("JOIN_CACHE_BNLH::init");
+  /* Create the companion scan object first, then run the parent class init */
+  join_tab_scan= new JOIN_TAB_SCAN(join, join_tab);
+  if (!join_tab_scan)
+    DBUG_RETURN(1);
+  DBUG_RETURN(JOIN_CACHE_HASHED::init());
+}
+
+
+/* 
+  Calculate the increment of the MRR buffer for a record write       
+
+  SYNOPSIS
+    aux_buffer_incr()
+
+  DESCRIPTION
+    This implementation of the virtual function aux_buffer_incr determines
+    for how much the size of the MRR buffer should be increased when another
+    record is added to the cache.   
+
+  RETURN VALUE
+    the increment of the size of the MRR buffer for the next record
+*/
+
+uint JOIN_TAB_SCAN_MRR::aux_buffer_incr(ulong recno)
+{
+  TABLE *tab= join_tab->table;
+  TABLE_REF *ref= &join_tab->ref;
+  uint rec_per_key= tab->key_info[ref->key].rec_per_key[ref->key_parts-1];
+  /* Use at least 1 as the number of records per key value */
+  set_if_bigger(rec_per_key, 1);
+  /* Only the first record adds ref->key_length plus the handler ref_length */
+  uint incr= (recno == 1) ? ref->key_length + tab->file->ref_length : 0;
+  incr+= tab->file->stats.mrr_length_per_rec * rec_per_key;
+  return incr;
+}
+
+
+/* 
+  Initiate iteration over records returned by MRR for the current join buffer
+
+  SYNOPSIS
+    open()
+
+  DESCRIPTION
+    The function initiates the process of iteration over the records from 
+    join_tab returned by the MRR interface functions for records from
+    the join buffer. Such an iteration is performed by the BKA/BKAH join
+    algorithm for each new refill of the join buffer.
+    The function calls the MRR handler function multi_range_read_init to
+    initiate this process.
+
+  RETURN VALUE   
+    0            the initiation is a success 
+    error code   otherwise     
+*/
+
+int JOIN_TAB_SCAN_MRR::open()
+{
+  handler *file= join_tab->table->file;
+
+  join_tab->table->null_row= 0;
+
+  /* NOTE(review): the result of ha_index_init() below is not checked */
+  /* Dynamic range access is never used with BKA */
+  DBUG_ASSERT(join_tab->use_quick != 2);
+  /* FALSE: stash and zero table->status of the preceding join tabs */
+  save_or_restore_used_tabs(join_tab, FALSE);
+
+  init_mrr_buff();
+
+  /* 
+    Prepare to iterate over keys from the join buffer and to get
+    matching candidates obtained with MRR handler functions.
+  */ 
+  if (!file->inited)
+    file->ha_index_init(join_tab->ref.key, 1);
+  ranges= cache->get_number_of_ranges_for_mrr();
+  if (!join_tab->cache_idx_cond)
+    range_seq_funcs.skip_index_tuple= 0;  /* no index condition to check */
+  return file->multi_range_read_init(&range_seq_funcs, (void*) cache,
+                                     ranges, mrr_mode, &mrr_buff);
+}
+
+
+/* 
+  Read the next record returned by MRR for the current join buffer
+
+  SYNOPSIS
+    next()
+
+  DESCRIPTION
+    The function reads the next record from the joined table join_tab
+    returned by the MRR handler function multi_range_read_next for
+    the current refill of the join buffer. The record is read into
+    the record buffer used for join_tab records in join operations.
+
+  RETURN VALUE   
+    0            the next record exists and has been successfully read 
+    error code   otherwise     
+*/
+
+int JOIN_TAB_SCAN_MRR::next()
+{
+  char **ptr= (char **) cache->get_curr_association_ptr();
+  handler *file= join_tab->table->file;
+
+  DBUG_ASSERT(sizeof(range_id_t) == sizeof(*ptr));
+  /* Every non-zero result of multi_range_read_next is mapped to -1 */
+  if (file->multi_range_read_next((range_id_t*)ptr))
+    return -1;
+  /*
+    If a record in an incremental cache contains no fields then the
+    association for the last record in cache will be equal to cache->end_pos
+  */
+  DBUG_ASSERT(cache->buff <= (uchar *) (*ptr) &&
+              (uchar *) (*ptr) <= cache->end_pos);
+  update_virtual_fields(join->thd, join_tab->table);
+  return 0;
+}
+
+
+static void bka_range_seq_key_info(void *init_params, uint *length,
+                                   key_part_map *map)
+{
+  JOIN_CACHE *cache= (JOIN_CACHE *) init_params;
+  *length= cache->join_tab->ref.key_length;
+  /* Bitmap with the ref.key_parts lowest bits set */
+  *map= (key_part_map(1) << cache->join_tab->ref.key_parts) - 1;
+}
+
+
+/*
+  Initialize retrieval of range sequence for BKA join algorithm
     
   SYNOPSIS
     bka_range_seq_init()
-      init_params   pointer to the BKA_INIQUE join cache object
-      n_ranges      the number of ranges obtained 
-      flags         combination of HA_MRR_SINGLE_POINT, HA_MRR_FIXED_KEY
+     init_params   pointer to the BKA join cache object
+     n_ranges      the number of ranges obtained 
+     flags         combination of MRR flags
 
   DESCRIPTION
-    The function interprets init_param as a pointer to a JOIN_CACHE_BKA_UNIQUE
-    object. The function prepares for an iteration over the unique join keys
-    built over the records from the cache join buffer.
+    The function interprets init_param as a pointer to a JOIN_CACHE_BKA
+    object. The function prepares for an iteration over the join keys
+    built for all records from the cache join buffer.
 
   NOTE
     This function are used only as a callback function.    
 
-  RETURN
-    init_param    value that is to be used as a parameter of 
-                  bka_unique_range_seq_next()
+  RETURN VALUE
+    init_param value that is to be used as a parameter of bka_range_seq_next()
 */    
 
 static 
-range_seq_t bka_unique_range_seq_init(void *init_param, uint n_ranges,
-                                      uint flags)
+range_seq_t bka_range_seq_init(void *init_param, uint n_ranges, uint flags)
 {
-  DBUG_ENTER("bka_unique_range_seq_init");
-  JOIN_CACHE_BKA_UNIQUE *cache= (JOIN_CACHE_BKA_UNIQUE *) init_param;
+  DBUG_ENTER("bka_range_seq_init");
+  JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) init_param;
   cache->reset(0);
   DBUG_RETURN((range_seq_t) init_param);
 }
 
 
 /*
-  Get the key over the next record from the join buffer used by BKA_UNIQUE  
+  Get the next range/key over records from the join buffer used by a BKA cache
     
   SYNOPSIS
-    bka_unique_range_seq_next()
-      seq        value returned by  bka_unique_range_seq_init()
+    bka_range_seq_next()
+      seq        the value returned by  bka_range_seq_init
       range  OUT reference to the next range
   
   DESCRIPTION
-    The function interprets seq as a pointer to the JOIN_CACHE_BKA_UNIQUE 
+    The function interprets seq as a pointer to a JOIN_CACHE_BKA
     object. The function returns a pointer to the range descriptor
-    for the next unique key built over records from the join buffer.
+    for the key built over the next record from the join buffer.
 
   NOTE
     This function are used only as a callback function.
    
-  RETURN
-    0    ok, the range structure filled with info about the next key
-    1    no more ranges
+  RETURN VALUE
+    FALSE   ok, the range structure filled with info about the next range/key
+    TRUE    no more ranges
 */    
 
 static 
-uint bka_unique_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
+bool bka_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
 {
-  DBUG_ENTER("bka_unique_range_seq_next");
-  JOIN_CACHE_BKA_UNIQUE *cache= (JOIN_CACHE_BKA_UNIQUE *) rseq;
+  DBUG_ENTER("bka_range_seq_next");
+  JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq;
   TABLE_REF *ref= &cache->join_tab->ref;
   key_range *start_key= &range->start_key;
   if ((start_key->length= cache->get_next_key((uchar **) &start_key->key)))
@@ -2982,7 +3905,7 @@ uint bka_unique_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
     start_key->flag= HA_READ_KEY_EXACT;
     range->end_key= *start_key;
     range->end_key.flag= HA_READ_AFTER_KEY;
-    range->ptr= (char *) cache->get_curr_key_chain();
+    range->ptr= (char *) cache->get_curr_rec();
     range->range_flag= EQ_RANGE;
     DBUG_RETURN(0);
   } 
@@ -2991,305 +3914,663 @@ uint bka_unique_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
 
 
 /*
-  Check whether range_info orders to skip the next record from BKA_UNIQUE buffer
+  Check whether range_info orders to skip the next record from BKA buffer
 
   SYNOPSIS
-    bka_unique_range_seq_skip_record()
-      seq              value returned by bka_unique_range_seq_init()
+    bka_range_seq_skip_record()
+      seq              value returned by bka_range_seq_init()
       range_info       information about the next range
-      rowid [NOT USED] rowid of the record to be checked (not used)
+      rowid [NOT USED] rowid of the record to be checked 
+
     
   DESCRIPTION
-    The function interprets seq as a pointer to the JOIN_CACHE_BKA_UNIQUE
-    object. The function returns TRUE if the record with this range_info
-    is to be filtered out from the stream of records returned by
+    The function interprets seq as a pointer to a JOIN_CACHE_BKA object.
+    The function returns TRUE if the record with this range_info 
+    is to be filtered out from the stream of records returned by 
     multi_range_read_next(). 
 
   NOTE
     This function are used only as a callback function.
 
-  RETURN
+  RETURN VALUE
     1    record with this range_info is to be filtered out from the stream
          of records returned by multi_range_read_next()
     0    the record is to be left in the stream
 */ 
 
 static 
-bool bka_unique_range_seq_skip_record(range_seq_t rseq, char *range_info,
-                                      uchar *rowid)
+bool bka_range_seq_skip_record(range_seq_t rseq, range_id_t range_info, uchar *rowid)
 {
-  DBUG_ENTER("bka_unique_range_seq_skip_record");
-  JOIN_CACHE_BKA_UNIQUE *cache= (JOIN_CACHE_BKA_UNIQUE *) rseq;
-  bool res= cache->check_all_match_flags_for_key((uchar *) range_info);
+  DBUG_ENTER("bka_range_seq_skip_record");
+  JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq;
+  bool res= cache->get_match_flag_by_pos((uchar *) range_info) ==
+            JOIN_CACHE::MATCH_FOUND;
   DBUG_RETURN(res);
 }
 
- 
+
 /*
-  Check if the record combination matches the index condition
+  Check if the record combination from BKA cache matches the index condition
 
   SYNOPSIS
-    JOIN_CACHE_BKA_UNIQUE::skip_index_tuple()
-      rseq             Value returned by bka_range_seq_init()
-      range_info       MRR range association data
+    bka_skip_index_tuple()
+      rseq             value returned by bka_range_seq_init()
+      range_info       record chain for the next range/key returned by MRR
     
   DESCRIPTION
-    See JOIN_CACHE_BKA::skip_index_tuple().
-    This function is the variant for use with
-    JOIN_CACHE_BKA_UNIQUE. The difference from JOIN_CACHE_BKA case is that
-    there may be multiple previous table record combinations that share the
-    same key, i.e. they map to the same MRR range.
-    As a consequence, we need to loop through all previous table record
-    combinations that match the given MRR range key range_info until we find
-    one that satisfies the index condition.
+    This is wrapper for JOIN_CACHE_BKA::skip_index_tuple method,
+    see comments there.
 
   NOTE
-    Possible optimization:
-    Before we unpack the record from a previous table
-    check if this table is used in the condition.
-    If so then unpack the record otherwise skip the unpacking.
-    This should be done by a special virtual method
-    get_partial_record_by_pos().
-
-  RETURN
+    This function is used as a RANGE_SEQ_IF::skip_index_tuple callback.
+ 
+  RETURN VALUE
     0    The record combination satisfies the index condition
     1    Otherwise
-
-
 */
 
-bool JOIN_CACHE_BKA_UNIQUE::skip_index_tuple(range_seq_t rseq, char *range_info)
+static 
+bool bka_skip_index_tuple(range_seq_t rseq, range_id_t range_info)
 {
-  DBUG_ENTER("JOIN_CACHE_BKA_UNIQUE::skip_index_tuple");
-  JOIN_CACHE_BKA_UNIQUE *cache= (JOIN_CACHE_BKA_UNIQUE *) rseq;
-  uchar *last_rec_ref_ptr=  cache->get_next_rec_ref((uchar*) range_info);
-  uchar *next_rec_ref_ptr= last_rec_ref_ptr;
-  do
-  {
-    next_rec_ref_ptr= cache->get_next_rec_ref(next_rec_ref_ptr);
-    uchar *rec_ptr= next_rec_ref_ptr + cache->rec_fields_offset;
-    cache->get_record_by_pos(rec_ptr);
-    if (join_tab->cache_idx_cond->val_int())
-      DBUG_RETURN(FALSE);
-  } while(next_rec_ref_ptr != last_rec_ref_ptr);
-  DBUG_RETURN(TRUE);
+  DBUG_ENTER("bka_skip_index_tuple");
+  JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq;
+  bool res= cache->skip_index_tuple(range_info);
+  DBUG_RETURN(res);
 }
 
 
 /*
-  Check if the record combination matches the index condition
+  Prepare to read the record from BKA cache matching the current joined record   
 
   SYNOPSIS
-    bka_unique_skip_index_tuple()
-      rseq             Value returned by bka_range_seq_init()
-      range_info       MRR range association data
+    prepare_look_for_matches()
+      skip_last <-> ignore the last record in the buffer (always unused here)
+
+  DESCRIPTION
+    The function prepares to iterate over records in the join cache buffer
+    matching the record loaded into the record buffer for join_tab when
+    performing join operation by BKA join algorithm. With BKA algorithms the
+    record loaded into the record buffer for join_tab always has a direct
+    reference to the matching records from the join buffer. When the regular
+    BKA join algorithm is employed the record from join_tab can refer to
+    only one such record.   
+    The function sets the counter of the remaining records from the cache 
+    buffer that would match the current join_tab record to 1.
     
+  RETURN VALUE   
+    TRUE    there are no records in the buffer to iterate over 
+    FALSE   otherwise
+*/
+    
+bool JOIN_CACHE_BKA::prepare_look_for_matches(bool skip_last)
+{
+  /* skip_last is not consulted; a non-empty buffer gets one candidate */
+  if (records)
+    rem_records= 1;
+  return records == 0;
+}
+
+
+/*
+  Get the record from the BKA cache matching the current joined record   
+
+  SYNOPSIS
+    get_next_candidate_for_match
+
   DESCRIPTION
-    This is wrapper for JOIN_CACHE_BKA_UNIQUE::skip_index_tuple method,
-    see comments there.
+    This method is used for iterations over the records from the join
+    cache buffer when looking for matches for records from join_tab.
+    The method performs the necessary preparations to read the next record
+    from the join buffer into the record buffer by the method
+    read_next_candidate_for_match, or, to skip the next record from the join 
+    buffer by the method skip_if_not_needed_match.    
+    This implementation of the virtual method get_next_candidate_for_match
+    just  decrements the counter of the records that are to be iterated over
+    and returns the value of curr_association as a reference to the position
+    of the beginning of the record fields in the buffer.
+    
+  RETURN VALUE   
+    pointer to the start of the record fields in the join buffer
+    if there is another record to iterate over, 0 - otherwise.
+*/
 
-  NOTE
-    This function is used as a RANGE_SEQ_IF::skip_index_tuple callback.
- 
-  RETURN
-    0    The record combination satisfies the index condition
-    1    Otherwise
+uchar *JOIN_CACHE_BKA::get_next_candidate_for_match()
+{
+  if (rem_records == 0)
+    return 0;                           /* nothing left to iterate over */
+  --rem_records;
+  return curr_association;              /* start of the record fields */
+}
+
+
+/*
+  Check whether the matching record from the BKA cache is to be skipped 
+
+  SYNOPSIS
+    skip_next_candidate_for_match
+    rec_ptr  pointer to the position in the join buffer right after 
+             the previous record
+
+  DESCRIPTION
+    This implementation of the virtual function just calls the
+    method get_match_flag_by_pos to check whether the record referenced
+    by rec_ptr has its match flag set to MATCH_FOUND.
+
+  RETURN VALUE   
+    TRUE   the record referenced by rec_ptr has its match flag set to
+           MATCH_FOUND
+    FALSE  otherwise  
 */
 
-static 
-bool bka_unique_skip_index_tuple(range_seq_t rseq, char *range_info)
+bool JOIN_CACHE_BKA::skip_next_candidate_for_match(uchar *rec_ptr)
 {
-  DBUG_ENTER("bka_unique_skip_index_tuple");
-  JOIN_CACHE_BKA_UNIQUE *cache= (JOIN_CACHE_BKA_UNIQUE *) rseq;
-  DBUG_RETURN(cache->skip_index_tuple(rseq, range_info));
+  return join_tab->check_only_first_match() && 
+         (get_match_flag_by_pos(rec_ptr) == MATCH_FOUND);
 }
 
 
 /*
-  Using BKA_UNIQUE find matches from the next table for records from join buffer   
+  Read the next record from the BKA join cache buffer when looking for matches 
 
   SYNOPSIS
-    join_matching_records()
-      skip_last    do not look for matches for the last partial join record 
+    read_next_candidate_for_match
+    rec_ptr  pointer to the position in the join buffer right after 
+             the previous record
 
   DESCRIPTION
-    This function can be used only when the table join_tab can be accessed
-    by keys built over the fields of previous join tables.
-    The function retrieves all keys from the hash table of the join buffer
-    built for partial join records from the buffer. For each of these keys
-    the function performs an index lookup and tries to match records yielded
-    by this lookup with records from the join buffer attached to the key.
-    If a match is found the function will call the sub_select function trying
-    to look for matches for the remaining join operations.
-    This function does not assume that matching records are necessarily
-    returned with references to the keys by which they were found. If the call
-    of the function multi_range_read_init returns flags with
-    HA_MRR_NO_ASSOCIATION then a search for the key built from the returned
-    record is carried on. The search is performed by probing in in the hash
-    table of the join buffer.
-    This function currently is called only from the function join_records.    
-    It's assumed that this function is always called with the skip_last 
-    parameter equal to false.
-            
-  RETURN
-    return one of enum_nested_loop_state 
+    This implementation of the virtual method read_next_candidate_for_match
+    calls the method get_record_by_pos to read the record referenced by rec_ptr
+    from the join buffer into the record buffer. If this record refers to
+    fields in the other join buffers the call of get_record_by_pos ensures that
+    these fields are read into the corresponding record buffers as well.
+    This function is supposed to be called after a successful call of
+    the method get_next_candidate_for_match.
+    
+  RETURN VALUE   
+    none
 */
 
-enum_nested_loop_state 
-JOIN_CACHE_BKA_UNIQUE::join_matching_records(bool skip_last)
+void JOIN_CACHE_BKA::read_next_candidate_for_match(uchar *rec_ptr)
 {
-  int error;
-  uchar *key_chain_ptr;
-  handler *file= join_tab->table->file;
-  enum_nested_loop_state rc= NESTED_LOOP_OK;
+  get_record_by_pos(rec_ptr);
+} 
+
+
+/*
+  Initialize the BKA join cache 
+
+  SYNOPSIS
+    init
+
+  DESCRIPTION
+    The function initializes the cache structure. It is supposed to be called
+    right after a constructor for the JOIN_CACHE_BKA.
+
+  NOTES
+    The function first constructs a companion object of the type 
+    JOIN_TAB_SCAN_MRR, then it calls the init method of the parent class.
+    
+  RETURN VALUE   
+    0   initialization with buffer allocations has succeeded
+    1   otherwise
+*/
+
+int JOIN_CACHE_BKA::init()
+{
+  int res;
   bool check_only_first_match= join_tab->check_only_first_match();
-  bool no_association= test(mrr_mode &  HA_MRR_NO_ASSOCIATION);
 
-  /* Set functions to iterate over keys in the join buffer */
-  RANGE_SEQ_IF seq_funcs= { bka_unique_range_seq_init,
-                            bka_unique_range_seq_next,
-                            check_only_first_match && !no_association ?
-                              bka_unique_range_seq_skip_record : 0,
-                            join_tab->cache_idx_cond ?
-                              bka_unique_skip_index_tuple : 0  };
+  RANGE_SEQ_IF rs_funcs= { bka_range_seq_key_info,
+                           bka_range_seq_init, 
+                           bka_range_seq_next,
+                           check_only_first_match ?
+                             bka_range_seq_skip_record : 0,
+                           bka_skip_index_tuple };
 
-  /* The value of skip_last must be always FALSE when this function is called */
-  DBUG_ASSERT(!skip_last);
+  DBUG_ENTER("JOIN_CACHE_BKA::init");
 
-  /* Return at once if there are no records in the join buffer */
-  if (!records)
-    return NESTED_LOOP_OK;  
-                   
-  rc= init_join_matching_records(&seq_funcs, key_entries);
-  if (rc != NESTED_LOOP_OK)
-    goto finish;
+  JOIN_TAB_SCAN_MRR *jsm;
+  if (!(join_tab_scan= jsm= new JOIN_TAB_SCAN_MRR(join, join_tab, 
+                                                  mrr_mode, rs_funcs)))
+    DBUG_RETURN(1);
 
-  while (!(error= file->multi_range_read_next((char **) &key_chain_ptr)))
-  {
-    if (no_association)
-    {
-      uchar *key_ref_ptr;
-      TABLE *table= join_tab->table;
-      TABLE_REF *ref= &join_tab->ref;
-      KEY *keyinfo= table->key_info+ref->key;
-      /* 
-        Build the key value out of  the record returned by the call of
-        multi_range_read_next in the record buffer
-      */ 
-      key_copy(ref->key_buff, table->record[0], keyinfo, ref->key_length);
-      /* Look for this key in the join buffer */
-      if (!key_search(ref->key_buff, ref->key_length, &key_ref_ptr))
-	continue;
-      key_chain_ptr= key_ref_ptr+get_size_of_key_offset();
-    } 
+  if ((res= JOIN_CACHE::init()))
+    DBUG_RETURN(res);
 
-    uchar *last_rec_ref_ptr= get_next_rec_ref(key_chain_ptr);
-    uchar *next_rec_ref_ptr= last_rec_ref_ptr;
-    do
-    {
-      next_rec_ref_ptr= get_next_rec_ref(next_rec_ref_ptr);
-      uchar *rec_ptr= next_rec_ref_ptr+rec_fields_offset;
+  if (use_emb_key)
+    jsm->mrr_mode |= HA_MRR_MATERIALIZED_KEYS;
 
-      if (join->thd->killed)
-      {
-        /* The user has aborted the execution of the query */
-        join->thd->send_kill_message();
-        rc= NESTED_LOOP_KILLED; 
-        goto finish;
-      }
-      /* 
-        If only the first match is needed and it has been already found
-        for the associated partial join record then the returned candidate
-        is discarded.
-      */
-      if (rc == NESTED_LOOP_OK &&
-          (!check_only_first_match || !get_match_flag_by_pos(rec_ptr)))
-      {
-        get_record_by_pos(rec_ptr);
-        update_virtual_fields(join->thd, join_tab->table);
-        rc= generate_full_extensions(rec_ptr);
-        if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
-	  goto finish;   
+  DBUG_RETURN(0);
+}
+
+
+/* 
+  Get the key built over the next record from BKA join buffer
+
+  SYNOPSIS
+    get_next_key()
+      key    pointer to the buffer where the key value is to be placed
+
+  DESCRIPTION
+    The function reads key fields from the current record in the join buffer
+    and builds the key value out of these fields that will be used to access
+    the 'join_tab' table. Some of key fields may belong to previous caches.
+    They are accessed via record references to the record parts stored in the
+    previous join buffers. The other key fields always are placed right after
+    the flag fields of the record.
+    If the key is embedded, which means that its value can be read directly
+    from the join buffer, then *key is set to the beginning of the key in
+    this buffer. Otherwise the key is built in the join_tab->ref->key_buff.
+    The function returns the length of the key if it succeeds in reading it.
+    It is assumed that the function starts reading at the position of
+    the record length which is provided for each record in a BKA cache.
+    After the key is built the 'pos' value points to the first position after
+    the current record.
+    The function just skips the records with MATCH_IMPOSSIBLE in the
+    match flag field if there is any. 
+    The function returns 0 if the initial position is after the beginning
+    of the record fields for last record from the join buffer. 
+
+  RETURN VALUE
+    length of the key value - if the starting value of 'pos' points to
+    the position before the fields for the last record,
+    0 - otherwise.     
+*/
+
+uint JOIN_CACHE_BKA::get_next_key(uchar ** key)
+{
+  uint len;
+  uint32 rec_len;
+  uchar *init_pos;
+  JOIN_CACHE *cache;
+  
+start:
+
+  /* Any record in a BKA cache is prepended with its length */
+  DBUG_ASSERT(with_length);
+   
+  if ((pos+size_of_rec_len) > last_rec_pos || !records)
+    return 0;
+
+  /* Read the length of the record */
+  rec_len= get_rec_length(pos);
+  pos+= size_of_rec_len; 
+  init_pos= pos;
+
+  /* Read a reference to the previous cache if any */
+  if (prev_cache)
+    pos+= prev_cache->get_size_of_rec_offset();
+
+  curr_rec_pos= pos;
+
+  /* Read all flag fields of the record */
+  read_flag_fields();
+
+  if (with_match_flag && 
+      (Match_flag) curr_rec_pos[0] == MATCH_IMPOSSIBLE )
+  {
+    pos= init_pos+rec_len;
+    goto start;
+  }
+ 
+  if (use_emb_key)
+  {
+    /* An embedded key is taken directly from the join buffer */
+    *key= pos;
+    len= emb_key_length;
+  }
+  else
+  {
+    /* Read key arguments from previous caches if there are any such fields */
+    if (external_key_arg_fields)
+    {
+      uchar *rec_ptr= curr_rec_pos;
+      uint key_arg_count= external_key_arg_fields;
+      CACHE_FIELD **copy_ptr= blob_ptr-key_arg_count;
+      for (cache= prev_cache; key_arg_count; cache= cache->prev_cache)
+      { 
+        uint len= 0;
+        DBUG_ASSERT(cache);
+        rec_ptr= cache->get_rec_ref(rec_ptr);
+        while (!cache->referenced_fields)
+        {
+          cache= cache->prev_cache;
+          DBUG_ASSERT(cache);
+          rec_ptr= cache->get_rec_ref(rec_ptr);
+        }
+        while (key_arg_count && 
+               cache->read_referenced_field(*copy_ptr, rec_ptr, &len))
+        {
+          copy_ptr++;
+          --key_arg_count;
+        }
       }
     }
-    while (next_rec_ref_ptr != last_rec_ref_ptr); 
+    
+    /* 
+      Read the other key arguments from the current record. The fields for
+      these arguments are always first in the sequence of the record's fields.
+    */     
+    CACHE_FIELD *copy= field_descr+flag_fields;
+    CACHE_FIELD *copy_end= copy+local_key_arg_fields;
+    bool blob_in_rec_buff= blob_data_is_in_rec_buff(curr_rec_pos);
+    for ( ; copy < copy_end; copy++)
+      read_record_field(copy, blob_in_rec_buff);
+    
+    /* Build the key over the fields read into the record buffers */ 
+    TABLE_REF *ref= &join_tab->ref;
+    cp_buffer_from_ref(join->thd, join_tab->table, ref);
+    *key= ref->key_buff;
+    len= ref->key_length;
   }
 
-  if (error > 0 && error != HA_ERR_END_OF_FILE)	   
-    return NESTED_LOOP_ERROR; 
-finish:                  
-  return end_join_matching_records(rc);
+  pos= init_pos+rec_len;
+
+  return len;
+} 
+
+
+/*
+  Check the index condition of the joined table for a record from the BKA cache
+
+  SYNOPSIS
+    skip_index_tuple()
+      range_info       pointer to the record returned by MRR 
+    
+  DESCRIPTION
+    This function is invoked from MRR implementation to check if an index
+    tuple matches the index condition. It is used in the case where the index
+    condition actually depends on both columns of the used index and columns
+    from previous tables.
+   
+  NOTES 
+    Accessing columns of the previous tables requires special handling with
+    BKA. The idea of BKA is to collect record combinations in a buffer and 
+    then do a batch of ref access lookups, i.e. by the time we're doing a
+    lookup its previous-records-combination is not in prev_table->record[0]
+    but somewhere in the join buffer.    
+    We need to get it from there back into prev_table(s)->record[0] before we
+    can evaluate the index condition, and that's why we need this function
+    instead of regular IndexConditionPushdown.
+
+  NOTES
+    Possible optimization:
+    Before we unpack the record from a previous table
+    check if this table is used in the condition.
+    If so then unpack the record otherwise skip the unpacking.
+    This should be done by a special virtual method
+    get_partial_record_by_pos().
+
+  RETURN VALUE
+    1    the record combination does not satisfy the index condition
+    0    otherwise
+*/
+
+bool JOIN_CACHE_BKA::skip_index_tuple(range_id_t range_info)
+{
+  DBUG_ENTER("JOIN_CACHE_BKA::skip_index_tuple");
+  get_record_by_pos((uchar*)range_info);
+  DBUG_RETURN(!join_tab->cache_idx_cond->val_int());
 }
 
 
+
 /*
-  Check whether all records in a key chain have their match flags set on   
+  Initialize retrieval of range sequence for the BKAH join algorithm
+    
+  SYNOPSIS
+    bkah_range_seq_init()
+      init_param    pointer to the BKAH join cache object
+      n_ranges      the number of ranges obtained 
+      flags         combination of MRR flags
+
+  DESCRIPTION
+    The function interprets init_param as a pointer to a JOIN_CACHE_BKAH
+    object. The function prepares for an iteration over distinct join keys
+    built over the records from the cache join buffer.
 
+  NOTE
+    This function is used only as a callback function.
+
+  RETURN VALUE
+    init_param    value that is to be used as a parameter of 
+                  bkah_range_seq_next()
+*/    
+
+static 
+range_seq_t bkah_range_seq_init(void *init_param, uint n_ranges, uint flags)
+{
+  DBUG_ENTER("bkah_range_seq_init");
+  JOIN_CACHE_BKAH *cache= (JOIN_CACHE_BKAH *) init_param;
+  cache->reset(0);
+  DBUG_RETURN((range_seq_t) init_param);
+}
+
+
+/*
+  Get the next range/key over records from the join buffer of a BKAH cache  
+    
   SYNOPSIS
-    check_all_match_flags_for_key()
-      key_chain_ptr     
+    bkah_range_seq_next()
+      seq        value returned by  bkah_range_seq_init()
+      range  OUT reference to the next range
+  
+  DESCRIPTION
+    The function interprets seq as a pointer to a JOIN_CACHE_BKAH 
+    object. The function returns a pointer to the range descriptor
+    for the next unique key built over records from the join buffer.
+
+  NOTE
+    This function is used only as a callback function.
+   
+  RETURN VALUE
+    FALSE  ok, the range structure filled with info about the next range/key
+    TRUE   no more ranges
+*/    
 
+static 
+bool bkah_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
+{
+  DBUG_ENTER("bkah_range_seq_next");
+  JOIN_CACHE_BKAH *cache= (JOIN_CACHE_BKAH *) rseq;
+  TABLE_REF *ref= &cache->join_tab->ref;
+  key_range *start_key= &range->start_key;
+  if ((start_key->length= cache->get_next_key((uchar **) &start_key->key)))
+  {
+    start_key->keypart_map= (1 << ref->key_parts) - 1;
+    start_key->flag= HA_READ_KEY_EXACT;
+    range->end_key= *start_key;
+    range->end_key.flag= HA_READ_AFTER_KEY;
+    range->ptr= (char *) cache->get_curr_key_chain();
+    range->range_flag= EQ_RANGE;
+    DBUG_RETURN(0);
+  } 
+  DBUG_RETURN(1);
+}
+
+
+/*
+  Check whether range_info orders to skip the next record from BKAH join buffer
+
+  SYNOPSIS
+    bkah_range_seq_skip_record()
+      seq              value returned by bkah_range_seq_init()
+      range_info       information about the next range/key returned by MRR
+      rowid [NOT USED] rowid of the record to be checked (not used)
+    
   DESCRIPTION
-    This function retrieves records in the given circular chain and checks
-    whether their match flags are set on. The parameter key_chain_ptr shall
-    point to the position in the join buffer storing the reference to the
-    last element of this chain. 
-            
-  RETURN
-    TRUE   if each retrieved record has its match flag set on
-    FALSE  otherwise 
+    The function interprets seq as a pointer to a JOIN_CACHE_BKAH
+    object. The function returns TRUE if the record with this range_info
+    is to be filtered out from the stream of records returned by
+    multi_range_read_next(). 
+
+  NOTE
+    This function is used only as a callback function.
+
+  RETURN VALUE
+    1    record with this range_info is to be filtered out from the stream
+         of records returned by multi_range_read_next()
+    0    the record is to be left in the stream
+*/ 
+
+static 
+bool bkah_range_seq_skip_record(range_seq_t rseq, range_id_t range_info,
+                                uchar *rowid)
+{
+  DBUG_ENTER("bkah_range_seq_skip_record");
+  JOIN_CACHE_BKAH *cache= (JOIN_CACHE_BKAH *) rseq;
+  bool res= cache->check_all_match_flags_for_key((uchar *) range_info);
+  DBUG_RETURN(res);
+}
+
+ 
+/*
+  Check if the record combination from BKAH cache matches the index condition
+
+  SYNOPSIS
+    bkah_skip_index_tuple()
+      rseq             value returned by bkah_range_seq_init()
+      range_info       record chain for the next range/key returned by MRR
+    
+  DESCRIPTION
+    This is a wrapper for the JOIN_CACHE_BKAH::skip_index_tuple method,
+    see comments there.
+
+  NOTE
+    This function is used as a RANGE_SEQ_IF::skip_index_tuple callback.
+ 
+  RETURN VALUE
+    0    some records from the chain satisfy the index condition
+    1    otherwise
 */
 
-bool JOIN_CACHE_BKA_UNIQUE::check_all_match_flags_for_key(uchar *key_chain_ptr)
+static 
+bool bkah_skip_index_tuple(range_seq_t rseq, range_id_t range_info)
 {
-  uchar *last_rec_ref_ptr= get_next_rec_ref(key_chain_ptr);
-  uchar *next_rec_ref_ptr= last_rec_ref_ptr;
-  do
-  {
-    next_rec_ref_ptr= get_next_rec_ref(next_rec_ref_ptr);
-    uchar *rec_ptr= next_rec_ref_ptr+rec_fields_offset;
-    if (!get_match_flag_by_pos(rec_ptr))
-      return FALSE;
-  }
-  while (next_rec_ref_ptr != last_rec_ref_ptr);
-  return TRUE;
+  DBUG_ENTER("bkah_skip_index_tuple");
+  JOIN_CACHE_BKAH *cache= (JOIN_CACHE_BKAH *) rseq;
+  DBUG_RETURN(cache->skip_index_tuple(range_info));
 }
-  
 
-/* 
-  Get the next key built for the records from BKA_UNIQUE join buffer
+
+/*
+  Prepare to read record from BKAH cache matching the current joined record   
 
   SYNOPSIS
-    get_next_key()
-      key    pointer to the buffer where the key value is to be placed
+    prepare_look_for_matches()
+      skip_last <-> ignore the last record in the buffer (always unused here)
 
   DESCRIPTION
-    The function reads the next key value stored in the hash table of the
-    join buffer. Depending on the value of the use_emb_key flag of the
-    join cache the value is read either from the table itself or from
-    the record field where it occurs. 
+    The function prepares to iterate over records in the join cache buffer
+    matching the record loaded into the record buffer for join_tab when
+    performing join operation by BKAH join algorithm. With BKAH algorithm, if
+    association labels are used, then record loaded into the record buffer 
+    for join_tab always has a direct reference to the chain of the matching
+    records from the join buffer. If association labels are not used
+    then the chain of the matching records is obtained by the call of the
+    get_matching_chain_by_join_key function.
+    
+  RETURN VALUE   
+    TRUE    there are no records in the buffer to iterate over 
+    FALSE   otherwise
+*/
+    
+bool JOIN_CACHE_BKAH::prepare_look_for_matches(bool skip_last)
+{
+  last_matching_rec_ref_ptr= next_matching_rec_ref_ptr= 0;
+  if (no_association &&      /* no label from MRR: look the chain up by key */
+      !(curr_matching_chain= get_matching_chain_by_join_key()))
+    return 1;                /* no chain for this key: nothing to iterate over */
+  last_matching_rec_ref_ptr= get_next_rec_ref(curr_matching_chain);
+  return 0;
+}
 
-  RETURN
-    length of the key value - if the starting value of 'cur_key_entry' refers
-    to the position after that referred by the the value of 'last_key_entry'    
-    0 - otherwise.     
+/*
+  Initialize the BKAH join cache 
+
+  SYNOPSIS
+    init
+
+  DESCRIPTION
+    The function initializes the cache structure. It is supposed to be called
+    right after a constructor for the JOIN_CACHE_BKAH.
+
+  NOTES
+    The function first constructs a companion object of the type 
+    JOIN_TAB_SCAN_MRR, then it calls the init method of the parent class.
+    
+  RETURN VALUE   
+    0   initialization with buffer allocations has succeeded
+    1   otherwise
 */
 
-uint JOIN_CACHE_BKA_UNIQUE::get_next_key(uchar ** key)
-{  
-  if (curr_key_entry == last_key_entry)
-    return 0;
+int JOIN_CACHE_BKAH::init()
+{
+  bool check_only_first_match= join_tab->check_only_first_match();
 
-  curr_key_entry-= key_entry_length;
+  no_association= test(mrr_mode & HA_MRR_NO_ASSOCIATION);
 
-  *key = use_emb_key ? get_emb_key(curr_key_entry) : curr_key_entry;
+  RANGE_SEQ_IF rs_funcs= { bka_range_seq_key_info,
+                           bkah_range_seq_init,
+                           bkah_range_seq_next,
+                           check_only_first_match && !no_association ?
+                             bkah_range_seq_skip_record : 0,
+                           bkah_skip_index_tuple };
 
-  DBUG_ASSERT(*key >= buff && *key < hash_table);
+  DBUG_ENTER("JOIN_CACHE_BKAH::init");
 
-  return key_length;
+  if (!(join_tab_scan= new JOIN_TAB_SCAN_MRR(join, join_tab, 
+                                             mrr_mode, rs_funcs)))
+    DBUG_RETURN(1);
+
+  DBUG_RETURN(JOIN_CACHE_HASHED::init());
 }
 
 
-/****************************************************************************
- * Join cache module end
- ****************************************************************************/
+/*
+  Check the index condition of the joined table for records from a BKAH cache
+
+  SYNOPSIS
+    skip_index_tuple()
+      range_info       record chain returned by MRR 
+    
+  DESCRIPTION
+    See JOIN_CACHE_BKA::skip_index_tuple().
+    This function is the variant for use with the class JOIN_CACHE_BKAH.
+    The difference from JOIN_CACHE_BKA case is that there may be multiple
+    previous table record combinations that share the same key(MRR range).
+    As a consequence, we need to loop through the chain of all table record
+    combinations that match the given MRR range key range_info until we find
+    one that satisfies the index condition.
+
+  NOTE
+    Possible optimization:
+    Before we unpack the record from a previous table
+    check if this table is used in the condition.
+    If so then unpack the record otherwise skip the unpacking.
+    This should be done by a special virtual method
+    get_partial_record_by_pos().
+
+  RETURN VALUE
+    1    no record combination from the chain referred to by range_info
+         satisfies the index condition
+    0    otherwise
+
+
+*/
+
+bool JOIN_CACHE_BKAH::skip_index_tuple(range_id_t range_info)
+{
+  uchar *last_rec_ref_ptr= get_next_rec_ref((uchar*) range_info);
+  uchar *next_rec_ref_ptr= last_rec_ref_ptr;
+  DBUG_ENTER("JOIN_CACHE_BKAH::skip_index_tuple");
+  do
+  {
+    next_rec_ref_ptr= get_next_rec_ref(next_rec_ref_ptr);
+    uchar *rec_ptr= next_rec_ref_ptr + rec_fields_offset;
+    get_record_by_pos(rec_ptr);
+    if (join_tab->cache_idx_cond->val_int())
+      DBUG_RETURN(FALSE);
+  } while(next_rec_ref_ptr != last_rec_ref_ptr);
+  DBUG_RETURN(TRUE);
+}
diff --git a/sql/sql_join_cache.h b/sql/sql_join_cache.h
new file mode 100644
index 00000000000..c153689bb99
--- /dev/null
+++ b/sql/sql_join_cache.h
@@ -0,0 +1,1412 @@
+/*
+  This file contains declarations for implementations
+  of block based join algorithms
+*/
+
+#define JOIN_CACHE_INCREMENTAL_BIT           1
+#define JOIN_CACHE_HASHED_BIT                2
+#define JOIN_CACHE_BKA_BIT                   4
+
+/* 
+  Categories of data fields of variable length written into join cache buffers.
+  The value of any of these fields is written into cache together with the
+  prepended length of the value.     
+*/
+#define CACHE_BLOB      1        /* blob field  */
+#define CACHE_STRIPPED  2        /* field stripped of trailing spaces */
+#define CACHE_VARSTR1   3        /* short string value (length takes 1 byte) */ 
+#define CACHE_VARSTR2   4        /* long string value (length takes 2 bytes) */
+#define CACHE_ROWID     5        /* ROWID field */
+
+/*
+  The CACHE_FIELD structure used to describe fields of records that
+  are written into a join cache buffer from record buffers and backward.
+*/
+typedef struct st_cache_field {
+  uchar *str;   /**< buffer from/to where the field is to be copied */ 
+  uint length;  /**< maximal number of bytes to be copied from/to str */
+  /* 
+    Field object for the moved field
+    (0 - for a flag field, see JOIN_CACHE::create_flag_fields).
+  */
+  Field *field;
+  uint type;    /**< category of the copied field (CACHE_BLOB et al.) */
+  /* 
+    The number of the record offset value for the field in the sequence
+    of offsets placed after the last field of the record. These
+    offset values are used to access fields referred to from other caches.
+    If the value is 0 then no offset for the field is saved in the
+    trailing sequence of offsets.
+  */ 
+  uint referenced_field_no; 
+  /* The remaining structure fields are used as containers for temp values */
+  uint blob_length; /**< length of the blob to be copied */
+  uint offset;      /**< field offset to be saved in cache buffer */
+} CACHE_FIELD;
+
+
+class JOIN_TAB_SCAN;
+
+
+/*
+  JOIN_CACHE is the base class to support the implementations of 
+  - Block Nested Loop (BNL) Join Algorithm,
+  - Block Nested Loop Hash (BNLH) Join Algorithm,
+  - Batched Key Access (BKA) Join Algorithm.
+  The first algorithm is supported by the derived class JOIN_CACHE_BNL,
+  the second algorithm is supported by the derived class JOIN_CACHE_BNLH,
+  while the third algorithm is implemented in two variants supported by
+  the classes JOIN_CACHE_BKA and JOIN_CACHE_BKAH.
+  These three algorithms have a lot in common. Each of them first accumulates
+  the records of the left join operand in a join buffer and then searches for
+  matching rows of the second operand for all accumulated records.
+  For the first two algorithms this strategy saves on logical I/O operations:
+  the entire set of records from the join buffer requires only one look-through
+  of the records provided by the second operand. 
+  For the third algorithm the accumulation of records allows to optimize
+  fetching rows of the second operand from disk for some engines (MyISAM, 
+  InnoDB), or to minimize the number of round-trips between the Server and
+  the engine nodes (NDB Cluster).        
+*/ 
+
+class JOIN_CACHE :public Sql_alloc
+{
+
+private:
+
+  /* Size of the offset of a record from the cache */   
+  uint size_of_rec_ofs;    
+  /* Size of the length of a record in the cache */
+  uint size_of_rec_len;
+  /* Size of the offset of a field within a record in the cache */   
+  uint size_of_fld_ofs;
+
+protected:
+       
+  /* 3 functions below actually do not use the hidden parameter 'this' */ 
+
+  /* Calculate the number of bytes used to store an offset value */
+  uint offset_size(uint len)
+  { return (len < 256 ? 1 : len < 256*256 ? 2 : 4); }
+
+  /* Get the offset value that takes ofs_sz bytes at the position ptr */
+  ulong get_offset(uint ofs_sz, uchar *ptr)
+  {
+    switch (ofs_sz) {
+    case 1: return uint(*ptr);
+    case 2: return uint2korr(ptr);
+    case 4: return uint4korr(ptr);
+    }
+    return 0;
+  }
+
+  /* Set the offset value ofs that takes ofs_sz bytes at the position ptr */ 
+  void store_offset(uint ofs_sz, uchar *ptr, ulong ofs)
+  {
+    switch (ofs_sz) {
+    case 1: *ptr= (uchar) ofs; return;
+    case 2: int2store(ptr, (uint16) ofs); return;
+    case 4: int4store(ptr, (uint32) ofs); return;
+    }
+  }
+  
+  /* 
+    The maximum total length of the fields stored for a record in the cache.
+    For blob fields only the sizes of the blob lengths are taken into account. 
+  */
+  uint length;
+
+  /* 
+    Representation of the executed multi-way join through which all needed
+    context can be accessed.  
+  */   
+  JOIN *join;  
+
+  /*
+    JOIN_TAB of the first table that can have its fields in the join cache.
+    That is, tables in the [start_tab, tab) range can have their fields in the
+    join cache. 
+    If a join tab in the range represents an SJM-nest, then all tables from the
+    nest can have their fields in the join cache, too.
+  */
+  JOIN_TAB *start_tab;
+
+  /* 
+    The total number of flag and data fields that can appear in a record
+    written into the cache. Fields with null values are always skipped 
+    to save space. 
+  */
+  uint fields;
+
+  /* 
+    The total number of flag fields in a record put into the cache. They are
+    used for table null bitmaps, table null row flags, and an optional match
+    flag. Flag fields go before other fields in a cache record with the match
+    flag field placed always at the very beginning of the record.
+  */
+  uint flag_fields;
+
+  /* The total number of blob fields that are written into the cache */ 
+  uint blobs;
+
+  /* 
+    The total number of fields referenced from field descriptors for other join
+    caches. These fields are used to construct key values.
+    When BKA join algorithm is employed the constructed key values serve to
+    access matching rows with index lookups.
+    The key values are put into a hash table when the BNLH join algorithm
+    is employed and when BKAH is used for the join operation. 
+  */   
+  uint referenced_fields;
+   
+  /* 
+    The current number of already created data field descriptors.
+    This number can be useful for implementations of the init methods.  
+  */
+  uint data_field_count; 
+
+  /* 
+    The current number of already created pointers to the data field
+    descriptors. This number can be useful for implementations of
+    the init methods.  
+  */
+  uint data_field_ptr_count;
+ 
+  /* 
+    Array of the descriptors of fields containing 'fields' elements.
+    These are all fields that are stored for a record in the cache. 
+  */
+  CACHE_FIELD *field_descr;
+
+  /* 
+    Array of pointers to the blob descriptors that contains 'blobs' elements.
+  */
+  CACHE_FIELD **blob_ptr;
+
+  /* 
+    This flag indicates that records written into the join buffer contain
+    a match flag field. The flag must be set by the init method. 
+  */
+  bool with_match_flag; 
+  /*
+    This flag indicates that any record is prepended with the length of the
+    record which allows us to skip the record or part of it without reading.
+  */
+  bool with_length;
+
+  /* 
+    The maximal number of bytes used for a record representation in
+    the cache excluding the space for blob data. 
+    For future derived classes this representation may contain some
+    redundant info such as a key value associated with the record.     
+  */
+  uint pack_length;
+  /* 
+    The value of pack_length incremented by the total size of all 
+    pointers of a record in the cache to the blob data. 
+  */
+  uint pack_length_with_blob_ptrs;
+
+  /* 
+    The total size of the record base prefix. The base prefix of record may
+    include the following components:
+     - the length of the record
+     - the link to a record in a previous buffer.
+    Each record in the buffer is supplied with the same set of the components.
+  */
+  uint base_prefix_length;
+
+  /*
+    The expected length of a record in the join buffer together with     
+    all prefixes and postfixes
+  */
+  size_t avg_record_length;
+
+  /* The expected size of the space per record in the auxiliary buffer */
+  size_t avg_aux_buffer_incr;
+
+  /* Expected join buffer space used for one record */
+  size_t space_per_record; 
+
+  /* Pointer to the beginning of the join buffer */
+  uchar *buff;         
+  /* 
+    Size of the entire memory allocated for the join buffer.
+    Part of this memory may be reserved for the auxiliary buffer.
+  */ 
+  size_t buff_size;
+  /* The minimal join buffer size when join buffer still makes sense to use */
+  size_t min_buff_size;
+  /* The maximum expected size of the join buffer to be used */
+  size_t max_buff_size;
+  /* Size of the auxiliary buffer */ 
+  size_t aux_buff_size;
+
+  /* The number of records put into the join buffer */ 
+  size_t records;
+  /* 
+    The number of records in the fully refilled join buffer of
+    the minimal size equal to min_buff_size
+  */
+  size_t min_records;
+  /*
+    The maximum expected number of records to be put in the join buffer
+    at one refill 
+  */
+  size_t max_records;
+
+  /* 
+    Pointer to the current position in the join buffer.
+    This member is used both when writing to buffer and
+    when reading from it.
+  */
+  uchar *pos;
+  /* 
+    Pointer to the first free position in the join buffer,
+    right after the last record written into it.
+  */
+  uchar *end_pos; 
+
+  /* 
+    Pointer to the beginning of the first field of the current read/write
+    record from the join buffer. The value is adjusted by the 
+    get_record/put_record functions.
+  */
+  uchar *curr_rec_pos;
+  /* 
+    Pointer to the beginning of the first field of the last record
+    from the join buffer.
+  */
+  uchar *last_rec_pos;
+
+  /* 
+    Flag is set if the blob data for the last record in the join buffer
+    is in record buffers rather than in the join cache.
+  */
+  bool last_rec_blob_data_is_in_rec_buff;
+
+  /* 
+    Pointer to the position to the current record link. 
+    Record links are used only with linked caches. Record links allow to set
+    connections between parts of one join record that are stored in different
+    join buffers.
+    In the simplest case a record link is just a pointer to the beginning of
+    the record stored in the buffer.
+    In a more general case a link could be a reference to an array of pointers
+    to records in the buffer.
+  */
+  uchar *curr_rec_link;
+
+  /* 
+    This flag is set to TRUE if join_tab is the first inner table of an outer
+    join and  the latest record written to the join buffer is detected to be
+    null complemented after checking on conditions over the outer tables for
+    this outer join operation
+  */ 
+  bool last_written_is_null_compl;
+
+  /*
+    The number of fields put in the join buffer of the join cache that are
+    used in building keys to access the table join_tab
+  */
+  uint local_key_arg_fields;
+  /* 
+    The total number of the fields in the previous caches that are used
+    in building keys to access the table join_tab
+  */
+  uint external_key_arg_fields;
+
+  /* 
+    This flag indicates that the key values will be read directly from the join
+    buffer. It will save us building key values in the key buffer.
+  */
+  bool use_emb_key;
+  /* The length of an embedded key value */ 
+  uint emb_key_length;
+
+  /*
+    This object provides the methods to iterate over records of
+    the joined table join_tab when looking for join matches between
+    records from join buffer and records from join_tab.
+    BNL and BNLH join algorithms retrieve all records from join_tab,
+    while BKA/BKAH algorithm iterates only over those records from
+    join_tab that can be accessed by look-ups with join keys built
+    from records in join buffer.  
+  */
+  JOIN_TAB_SCAN *join_tab_scan;
+
+  void calc_record_fields();     
+  void collect_info_on_key_args();
+  int alloc_fields();
+  void create_flag_fields();
+  void create_key_arg_fields();
+  void create_remaining_fields();
+  void set_constants();
+  int alloc_buffer();
+
+  /* Shall reallocate the join buffer */
+  virtual int realloc_buffer();
+  
+  /* Check the possibility to read the access keys directly from join buffer */ 
+  bool check_emb_key_usage();
+
+  /*
+    Accessors for the sizes that are passed to get_offset/store_offset when
+    record offsets, record lengths and field offsets are read from or
+    written to the join buffer.
+  */
+  uint get_size_of_rec_offset() { return size_of_rec_ofs; }
+  uint get_size_of_rec_length() { return size_of_rec_len; }
+  uint get_size_of_fld_offset() { return size_of_fld_ofs; }
+
+  /*
+    Return the position in the join buffer referred to by the record
+    reference stored immediately before ptr (as an offset from buff).
+  */
+  uchar *get_rec_ref(uchar *ptr)
+  {
+    uchar *ref_pos= ptr-size_of_rec_ofs;
+    return buff+get_offset(size_of_rec_ofs, ref_pos);
+  }
+  /* Read the record length stored at ptr with the size size_of_rec_len */
+  ulong get_rec_length(uchar *ptr)
+  {
+    ulong rec_len= (ulong) get_offset(size_of_rec_len, ptr);
+    return rec_len;
+  }
+  /* Read the field offset stored at ptr with the size size_of_fld_ofs */
+  ulong get_fld_offset(uchar *ptr)
+  {
+    ulong fld_ofs= (ulong) get_offset(size_of_fld_ofs, ptr);
+    return fld_ofs;
+  }
+
+  /*
+    Store a reference to the position ref immediately before ptr.
+    The reference is kept as the offset of ref from buff.
+  */
+  void store_rec_ref(uchar *ptr, uchar* ref)
+  {
+    ulong ref_ofs= (ulong) (ref-buff);
+    uchar *ref_pos= ptr-size_of_rec_ofs;
+    store_offset(size_of_rec_ofs, ref_pos, ref_ofs);
+  }
+  /* Store the record length len at ptr using the size size_of_rec_len */
+  void store_rec_length(uchar *ptr, ulong len)
+  {
+    store_offset(size_of_rec_len, ptr, len);
+  }
+  /* Store the field offset ofs at ptr using the size size_of_fld_ofs */
+  void store_fld_offset(uchar *ptr, ulong ofs)
+  {
+    store_offset(size_of_fld_ofs, ptr, ofs);
+  }
+
+  /* Write record fields and their required offsets into the join buffer */ 
+  uint write_record_data(uchar *link, bool *is_full);
+
+  /* Get the total length of all prefixes of a record in the join buffer */ 
+  virtual uint get_prefix_length()
+  {
+    /* Here the record prefix consists of the base prefix only */
+    return base_prefix_length;
+  }
+  /* Get maximum total length of all affixes of a record in the join buffer */
+  virtual uint get_record_max_affix_length(); 
+
+  /* 
+    Shall get maximum size of the additional space per record used for
+    record keys
+  */
+  virtual uint get_max_key_addon_space_per_record()
+  {
+    /* By default no additional per-record space for keys is reported */
+    return 0;
+  }
+
+  /* 
+    This method must determine for how much the auxiliary buffer should be
+    incremented when a new record is added to the join buffer.
+    If no auxiliary buffer is needed the function should return 0.
+  */
+  virtual uint aux_buffer_incr(ulong recno);
+
+  /* Shall calculate how much space is remaining in the join buffer */ 
+  virtual size_t rem_space() 
+  { 
+    /*
+      buff_size and aux_buff_size are size_t, so the former expression
+      max(buff_size-(end_pos-buff)-aux_buff_size,0) was evaluated in unsigned
+      arithmetic and could never be clamped to 0: as soon as the used space
+      exceeded buff_size the result wrapped around to a huge value.
+      Compare before subtracting so that 0 is returned in that case.
+    */
+    size_t used_space= (size_t) (end_pos-buff) + aux_buff_size;
+    return buff_size > used_space ? buff_size-used_space : 0;
+  }
+
+  /* 
+    Shall calculate how much space is taken by allocation of the key
+    for a record in the join buffer
+  */
+  virtual uint extra_key_length()
+  {
+    /* By default no key space is taken per record */
+    return 0;
+  }
+
+  /*  Read all flag and data fields of a record from the join buffer */
+  uint read_all_record_fields();
+  
+  /* Read all flag fields of a record from the join buffer */
+  uint read_flag_fields();
+
+  /* Read a data record field from the join buffer */
+  uint read_record_field(CACHE_FIELD *copy, bool last_record);
+
+  /* Read a referenced field from the join buffer */
+  bool read_referenced_field(CACHE_FIELD *copy, uchar *rec_ptr, uint *len);
+
+  /* 
+    Shall skip record from the join buffer if its match flag
+    is set to MATCH_FOUND
+ */
+  virtual bool skip_if_matched();
+
+  /* 
+    Shall skip record from the join buffer if its match flag
+    commands to do so
+  */
+  virtual bool skip_if_not_needed_match();
+
+  /* 
+    True if rec_ptr points to the record whose blob data stay in
+    record buffers
+  */
+  bool blob_data_is_in_rec_buff(uchar *rec_ptr)
+  {
+    /* Only the last record of the buffer can have the flag applied to it */
+    if (rec_ptr != last_rec_pos)
+      return false;
+    return last_rec_blob_data_is_in_rec_buff;
+  }
+
+  /* Find matches from the next table for records from the join buffer */
+  virtual enum_nested_loop_state join_matching_records(bool skip_last);
+
+  /* Shall set an auxiliary buffer up (currently used only by BKA joins) */
+  virtual int setup_aux_buffer(HANDLER_BUFFER &aux_buff) 
+  {
+    /*
+      This default implementation is not expected to be called: it hits
+      DBUG_ASSERT(0) and otherwise returns 0 without touching aux_buff.
+    */
+    DBUG_ASSERT(0);
+    return 0;
+  }
+
+  /*
+    Shall get the number of ranges in the cache buffer passed
+    to the MRR interface
+  */  
+  virtual uint get_number_of_ranges_for_mrr()
+  {
+    /* By default no ranges are reported */
+    return 0;
+  }
+
+  /* 
+    Shall prepare to look for records from the join cache buffer that would
+    match the record of the joined table read into the record buffer
+  */ 
+  virtual bool prepare_look_for_matches(bool skip_last)= 0;
+  /* 
+    Shall return a pointer to the record from join buffer that is checked
+    as the next candidate for a match with the current record from join_tab.
+    Each implementation of this virtual function should bear in mind
+    that the record position it returns shall be exactly the position
+    passed as the parameter to the implementations of the virtual functions 
+    skip_next_candidate_for_match and read_next_candidate_for_match.
+  */   
+  virtual uchar *get_next_candidate_for_match()= 0;
+  /*
+    Shall check whether the match flag settings of the given record from
+    the join buffer command to skip the record in the buffer.
+  */
+  virtual bool skip_next_candidate_for_match(uchar *rec_ptr)= 0;
+  /*
+    Shall read the given record from the join buffer into
+    the corresponding record buffer
+  */
+  virtual void read_next_candidate_for_match(uchar *rec_ptr)= 0;
+
+  /* 
+    Shall return the location of the association label returned by 
+    the multi_read_range_next function for the current record loaded
+    into join_tab's record buffer
+  */
+  virtual uchar **get_curr_association_ptr()
+  {
+    /* By default there is no association label location to return */
+    return 0;
+  }
+
+  /* Add null complements for unmatched outer records from the join buffer */
+  virtual enum_nested_loop_state join_null_complements(bool skip_last);
+
+  /* Restore the fields of the last record from the join buffer */
+  virtual void restore_last_record();
+
+  /* Set match flag for a record in join buffer if it has not been set yet */
+  bool set_match_flag_if_none(JOIN_TAB *first_inner, uchar *rec_ptr);
+
+  enum_nested_loop_state generate_full_extensions(uchar *rec_ptr);
+
+  /* Check matching to a partial join record from the join buffer */
+  bool check_match(uchar *rec_ptr);
+
+  /* 
+    This constructor creates an unlinked join cache. The cache is to be
+    used to join table 'tab' to the result of joining the previous tables 
+    specified by the 'j' parameter.
+  */   
+  JOIN_CACHE(JOIN *j, JOIN_TAB *tab)
+  {
+    /* The buffer pointer and both cache links start out as null */
+    buff= 0;
+    next_cache= 0;
+    prev_cache= 0;
+    join_tab= tab;
+    join= j;
+  }
+
+  /* 
+    This constructor creates a linked join cache. The cache is to be
+    used to join table 'tab' to the result of joining the previous tables 
+    specified by the 'j' parameter. The parameter 'prev' specifies the previous
+    cache object to which this cache is linked.
+  */   
+  JOIN_CACHE(JOIN *j, JOIN_TAB *tab, JOIN_CACHE *prev)
+  {
+    buff= 0;
+    join= j;
+    join_tab= tab;
+    prev_cache= prev;
+    next_cache= 0;
+    /* Make the previous cache, if there is one, refer forward to this one */
+    if (prev_cache)
+      prev_cache->next_cache= this;
+  }
+
+public:
+ 
+  /*
+    The enumeration type Join_algorithm includes a mnemonic constant for
+    each join algorithm that employs join buffers
+  */
+
+  enum Join_algorithm
+  { 
+    BNL_JOIN_ALG,     /* Block Nested Loop Join algorithm                  */
+    BNLH_JOIN_ALG,    /* Block Nested Loop Hash Join algorithm             */
+    BKA_JOIN_ALG,     /* Batched Key Access Join algorithm                 */
+    BKAH_JOIN_ALG,    /* Batched Key Access with Hash Table Join algorithm */
+  };
+
+  /* 
+    The enumeration type Match_flag describes possible states of the match flag
+    field  stored for the records of the first inner tables of outer joins and
+    semi-joins in the cases when the first match strategy is used for them.
+    When a record with match flag field is written into the join buffer the
+    state of the field usually is MATCH_NOT_FOUND unless this is a record of the
+    first inner table of the outer join for which the on precondition (the
+    condition from on expression over outer tables)  has turned out not to be 
+    true. In the last case the state of the match flag is MATCH_IMPOSSIBLE.
+    The state of the match flag field is changed to MATCH_FOUND as soon as
+    the first full matching combination of inner tables of the outer join or
+    the semi-join is discovered. 
+  */
+  /* No explicit values are given, so the constants are numbered 0, 1 and 2 */
+  enum Match_flag { MATCH_NOT_FOUND, MATCH_FOUND, MATCH_IMPOSSIBLE };
+
+  /* Table to be joined with the partial join records from the cache */ 
+  JOIN_TAB *join_tab;
+
+  /* Pointer to the previous join cache if there is any */
+  JOIN_CACHE *prev_cache;
+  /* Pointer to the next join cache if there is any */
+  JOIN_CACHE *next_cache;
+
+  /* Shall initialize the join cache structure */ 
+  virtual int init();
+
+  /* Get the current size of the cache join buffer */ 
+  size_t get_join_buffer_size()
+  {
+    return buff_size;
+  }
+  /* Set the size of the cache join buffer to a new value */
+  void set_join_buffer_size(size_t sz)
+  {
+    /* Only the recorded size is changed here; buff itself is not touched */
+    buff_size= sz;
+  }
+
+  /* Get the minimum possible size of the cache join buffer */
+  virtual ulong get_min_join_buffer_size();
+  /* Get the maximum possible size of the cache join buffer */ 
+  virtual ulong get_max_join_buffer_size(bool optimize_buff_size);
+
+  /* Shrink the size of the cache join buffer in a given ratio */
+  bool shrink_join_buffer_in_ratio(ulonglong n, ulonglong d);
+
+  /*  Shall return the type of the employed join algorithm */
+  virtual enum Join_algorithm get_join_alg()= 0;
+
+  /* 
+    The function shall return TRUE only when there is a key access
+    to the join table
+  */
+  virtual bool is_key_access()= 0;
+
+  /* Shall reset the join buffer for reading/writing */
+  virtual void reset(bool for_writing);
+
+  /* 
+    This function shall add a record into the join buffer and return TRUE
+    if it has been decided that it should be the last record in the buffer.
+  */ 
+  virtual bool put_record();
+
+  /* 
+    This function shall read the next record from the join buffer and return
+    TRUE if there are no more records.
+  */ 
+  virtual bool get_record();
+
+  /* 
+    This function shall read the record at the position rec_ptr
+    in the join buffer
+  */ 
+  virtual void get_record_by_pos(uchar *rec_ptr);
+
+  /* Shall return the value of the match flag for the positioned record */
+  virtual enum Match_flag get_match_flag_by_pos(uchar *rec_ptr);
+
+  /* Shall return the position of the current record */
+  virtual uchar *get_curr_rec()
+  {
+    return curr_rec_pos;
+  }
+
+  /* Shall set the current record link */
+  virtual void set_curr_rec_link(uchar *link)
+  {
+    curr_rec_link= link;
+  }
+
+  /* Shall return the current record link */
+  virtual uchar *get_curr_rec_link()
+  {
+    /* When no link is set, the position of the current record is used */
+    if (curr_rec_link)
+      return curr_rec_link;
+    return get_curr_rec();
+  }
+     
+  /* Join records from the join buffer with records from the next join table */ 
+  enum_nested_loop_state join_records(bool skip_last);
+
+  /* Add a comment on the join algorithm employed by the join cache */
+  virtual void print_explain_comment(String *str);
+
+  /* The destructor body is empty; the join buffer is released by free() */
+  virtual ~JOIN_CACHE() {}
+  /* Make the cache refer to the join object j */
+  void reset_join(JOIN *j) { join= j; }
+  /* Release the join buffer with my_free and reset the buffer pointer */
+  void free()
+  { 
+    my_free(buff);
+    buff= 0;
+  }   
+  
+  friend class JOIN_CACHE_HASHED;
+  friend class JOIN_CACHE_BNL;
+  friend class JOIN_CACHE_BKA;
+  friend class JOIN_TAB_SCAN;
+  friend class JOIN_TAB_SCAN_MRR;
+
+};
+
+
+/*
+  The class JOIN_CACHE_HASHED is the base class for the classes
+  JOIN_CACHE_BNLH and JOIN_CACHE_BKAH. The first of them supports
+  an implementation of Block Nested Loop Hash (BNLH) Join Algorithm,
+  while the second is used for a variant of the BKA Join algorithm that performs
+  only one lookup for any records from join buffer with the same key value. 
+  For a join cache of this class the records from the join buffer that have
+  the same access key are linked into a chain attached to a key entry structure
+  that either itself contains the key value, or, in the case when the keys are
+  embedded, refers to its occurrence in one of the records from the chain.
+  To build the chains with the same keys a hash table is employed. It is placed
+  at the very end of the join buffer. The array of hash entries is allocated
+  first at the very bottom of the join buffer, while key entries are placed
+  before this array.
+  A hash entry contains a header of the list of the key entries with the same
+  hash value. 
+  Each key entry is a structure of the following type:
+    struct st_join_cache_key_entry {
+      union { 
+        uchar[] value;
+        cache_ref *value_ref; // offset from the beginning of the buffer
+      } hash_table_key;
+      key_ref next_key; // offset backward from the beginning of hash table
+      cache_ref *last_rec // offset from the beginning of the buffer
+    }
+  The references linking the records in a chain are always placed at the very
+  beginning of the record info stored in the join buffer. The records are 
+  linked in a circular list. A new record is always added to the end of this 
+  list.
+
+  The following picture represents a typical layout for the info stored in the
+  join buffer of a join cache object of the JOIN_CACHE_HASHED class.
+    
+  buff
+  V
+  +----------------------------------------------------------------------------+
+  |     |[*]record_1_1|                                                        |
+  |     ^ |                                                                    |
+  |     | +--------------------------------------------------+                 |
+  |     |                           |[*]record_2_1|          |                 |
+  |     |                           ^ |                      V                 |
+  |     |                           | +------------------+   |[*]record_1_2|   |
+  |     |                           +--------------------+-+   |               |
+  |+--+ +---------------------+                          | |   +-------------+ |
+  ||  |                       |                          V |                 | |
+  |||[*]record_3_1|         |[*]record_1_3|              |[*]record_2_2|     | |
+  ||^                       ^                            ^                   | |
+  ||+----------+            |                            |                   | |
+  ||^          |            |<---------------------------+-------------------+ |
+  |++          | | ... mrr  |   buffer ...           ... |     |               |
+  |            |            |                            |                     |
+  |      +-----+--------+   |                      +-----|-------+             |
+  |      V     |        |   |                      V     |       |             |
+  ||key_3|[/]|[*]|      |   |                |key_2|[/]|[*]|     |             |
+  |                   +-+---|-----------------------+            |             |
+  |                   V |   |                       |            |             |
+  |             |key_1|[*]|[*]|         |   | ... |[*]|   ...  |[*]|  ...  |   |
+  +----------------------------------------------------------------------------+
+                                        ^           ^            ^
+                                        |           i-th entry   j-th entry
+                                        hash table
+
+  i-th hash entry:
+    circular record chain for key_1:
+      record_1_1
+      record_1_2
+      record_1_3 (points to record_1_1)
+    circular record chain for key_3:
+      record_3_1 (points to itself)
+
+  j-th hash entry:
+    circular record chain for key_2:
+      record_2_1
+      record_2_2 (points to record_2_1)
+
+*/
+
+class JOIN_CACHE_HASHED: public JOIN_CACHE
+{
+
+  /*
+    Pointer-to-member types of the hash function and of the key comparison
+    function that are stored in hash_func and hash_cmp_func
+  */
+  typedef uint (JOIN_CACHE_HASHED::*Hash_func) (uchar *key, uint key_len);
+  typedef bool (JOIN_CACHE_HASHED::*Hash_cmp_func) (uchar *key1, uchar *key2,
+                                                    uint key_len);
+  
+private:
+
+  /* Size of the offset of a key entry in the hash table */
+  uint size_of_key_ofs;
+
+  /* 
+    Length of the key entry in the hash table.
+    A key entry either contains the key value, or it contains a reference
+    to the key value if use_emb_key flag is set for the cache.
+  */ 
+  uint key_entry_length;
+ 
+  /* The beginning of the hash table in the join buffer */
+  uchar *hash_table;
+  /* Number of hash entries in the hash table */
+  uint hash_entries;
+
+
+  /* The position of the currently retrieved key entry in the hash table */
+  uchar *curr_key_entry;
+
+  /* The offset of the data fields from the beginning of the record fields */
+  uint data_fields_offset;
+
+  /* Candidate implementations matching the Hash_func signature */
+  inline uint get_hash_idx_simple(uchar *key, uint key_len);
+  inline uint get_hash_idx_complex(uchar *key, uint key_len);
+
+  /* Candidate implementations matching the Hash_cmp_func signature */
+  inline bool equal_keys_simple(uchar *key1, uchar *key2, uint key_len);
+  inline bool equal_keys_complex(uchar *key1, uchar *key2, uint key_len);
+
+  int init_hash_table();
+  void cleanup_hash_table();
+  
+protected:
+
+  /* 
+    Index info on the TABLE_REF object used by the hash join
+    to look for matching records
+  */    
+  KEY *ref_key_info;
+  /* 
+    Number of the key parts the TABLE_REF object used by the hash join
+    to look for matching records
+  */    
+  uint ref_used_key_parts;
+
+  /*
+    The hash function used in the hash table,
+    usually set by the init() method
+  */ 
+  Hash_func hash_func;
+  /*
+    The function to check whether two key entries in the hash table
+    are equal or not, usually set by the init() method
+  */ 
+  Hash_cmp_func hash_cmp_func;
+
+  /* 
+    Length of a key value.
+    It is assumed that all key values have the same length.
+  */
+  uint key_length;
+  /* Buffer to store key values for probing */
+  uchar *key_buff;
+
+  /* Number of key entries in the hash table (number of distinct keys) */
+  uint key_entries;
+
+  /* The position of the last key entry in the hash table */
+  uchar *last_key_entry;
+
+  /* 
+    The offset of the record fields from the beginning of the record
+    representation. The record representation starts with a reference to
+    the next record in the key record chain followed by the length of
+    the trailing record data followed by a reference to the record segment
+    in the previous cache, if any, followed by the record fields.
+  */ 
+  uint rec_fields_offset;
+
+  uint get_size_of_key_offset() { return size_of_key_ofs; }
+
+  /* 
+    Get the position of the next_key_ptr field pointed to by 
+    a linking reference stored at the position key_ref_ptr. 
+    This reference is actually the offset backward from the
+    beginning of hash table.
+  */  
+  uchar *get_next_key_ref(uchar *key_ref_ptr)
+  {
+    return hash_table-get_offset(size_of_key_ofs, key_ref_ptr);
+  }
+
+  /* 
+    Store the linking reference to the next_key_ptr field at 
+    the position key_ref_ptr. The position of the next_key_ptr
+    field is pointed to by ref. The stored reference is actually
+    the offset backward from the beginning of the hash table.
+  */  
+  void store_next_key_ref(uchar *key_ref_ptr, uchar *ref)
+  {
+    store_offset(size_of_key_ofs, key_ref_ptr, (ulong) (hash_table-ref));
+  }     
+  
+  /* 
+    Check whether the reference to the next_key_ptr field at the position
+    key_ref_ptr contains a nil value.
+    The first size_of_key_ofs bytes of a zero ulong are compared, so
+    size_of_key_ofs must not exceed sizeof(ulong).
+  */
+  bool is_null_key_ref(uchar *key_ref_ptr)
+  {
+    ulong nil= 0;
+    return memcmp(key_ref_ptr, &nil, size_of_key_ofs ) == 0;
+  } 
+
+  /* 
+    Set the reference to the next_key_ptr field at the position
+    key_ref_ptr equal to nil.
+  */
+  void store_null_key_ref(uchar *key_ref_ptr)
+  {
+    ulong nil= 0;
+    store_offset(size_of_key_ofs, key_ref_ptr, nil);
+  } 
+
+  /*
+    Get the position referred to by the record reference stored at ref_ptr.
+    The stored reference is the offset from the beginning of the join buffer.
+  */
+  uchar *get_next_rec_ref(uchar *ref_ptr)
+  {
+    return buff+get_offset(get_size_of_rec_offset(), ref_ptr);
+  }
+
+  /*
+    Store at ref_ptr a record reference to the position ref.
+    The stored reference is the offset from the beginning of the join buffer.
+  */
+  void store_next_rec_ref(uchar *ref_ptr, uchar *ref)
+  {
+    store_offset(get_size_of_rec_offset(), ref_ptr, (ulong) (ref-buff));
+  } 
+
+  /*
+    Get the position of the embedded key value for the current
+    record pointed to by get_curr_rec().
+  */ 
+  uchar *get_curr_emb_key()
+  {
+    return get_curr_rec()+data_fields_offset;
+  }
+
+  /*
+    Get the position of the embedded key value pointed to by a reference
+    stored at ref_ptr. The stored reference is actually the offset from
+    the beginning of the join buffer.
+  */  
+  uchar *get_emb_key(uchar *ref_ptr)
+  {
+    return buff+get_offset(get_size_of_rec_offset(), ref_ptr);
+  }
+
+  /* 
+    Store the reference to an embedded key at the position key_ref_ptr.
+    The position of the embedded key is pointed to by ref. The stored
+    reference is actually the offset from the beginning of the join buffer.
+  */  
+  void store_emb_key_ref(uchar *ref_ptr, uchar *ref)
+  {
+    store_offset(get_size_of_rec_offset(), ref_ptr, (ulong) (ref-buff));
+  }
+  
+  /* Get the total length of all prefixes of a record in hashed join buffer */ 
+  uint get_prefix_length() 
+  { 
+    return base_prefix_length + get_size_of_rec_offset();
+  }
+
+  /* 
+    Get maximum size of the additional space per record used for
+    the hash table with record keys
+  */
+  uint get_max_key_addon_space_per_record();
+
+  /* 
+    Calculate how much space in the buffer would not be occupied by
+    records, key entries and additional memory for the MRR buffer.
+    Returns 0 when no such space is left.
+  */ 
+  size_t rem_space() 
+  { 
+    /*
+      aux_buff_size is a size_t, so the former expression
+      max(last_key_entry-end_pos-aux_buff_size,0) was evaluated in unsigned
+      arithmetic and could never be clamped to 0: a shortage of space
+      wrapped around to a huge value. Check every step before subtracting.
+    */
+    if (last_key_entry <= end_pos)
+      return 0;
+    size_t gap= (size_t) (last_key_entry-end_pos);
+    return gap > aux_buff_size ? gap-aux_buff_size : 0;
+  }
+
+  /* 
+    Calculate how much space is taken by allocation of the key
+    entry for a record in the join buffer
+  */
+  uint extra_key_length() { return key_entry_length; }
+
+  /* 
+    Skip record from a hashed join buffer if its match flag
+    is set to MATCH_FOUND
+  */
+  bool skip_if_matched();
+
+  /*
+    Skip record from a hashed join buffer if its match flag setting 
+    commands to do so
+  */
+  bool skip_if_not_needed_match();
+
+  /* Search for a key in the hash table of the join buffer */
+  bool key_search(uchar *key, uint key_len, uchar **key_ref_ptr);
+
+  /* Reallocate the join buffer of a hashed join cache */
+  int realloc_buffer();
+
+  /* 
+    This constructor creates an unlinked hashed join cache. The cache is to be
+    used to join table 'tab' to the result of joining the previous tables 
+    specified by the 'j' parameter.
+  */   
+  JOIN_CACHE_HASHED(JOIN *j, JOIN_TAB *tab) :JOIN_CACHE(j, tab) {}
+
+  /* 
+    This constructor creates a linked hashed join cache. The cache is to be
+    used to join table 'tab' to the result of joining the previous tables 
+    specified by the 'j' parameter. The parameter 'prev' specifies the previous
+    cache object to which this cache is linked.
+  */   
+  JOIN_CACHE_HASHED(JOIN *j, JOIN_TAB *tab, JOIN_CACHE *prev) 
+    :JOIN_CACHE(j, tab, prev) {}
+
+public:
+
+  /* Initialize a hashed join cache */       
+  int init();
+
+  /* Reset the buffer of a hashed join cache for reading/writing */
+  void reset(bool for_writing);
+
+  /* Add a record into the buffer of a hashed join cache */
+  bool put_record();
+
+  /* Read the next record from the buffer of a hashed join cache */
+  bool get_record();
+
+  /*
+    Shall check whether all records in a key chain have 
+    their match flags set on
+  */   
+  virtual bool check_all_match_flags_for_key(uchar *key_chain_ptr);
+
+  uint get_next_key(uchar **key); 
+  
+  /*
+    Get the head of the record chain attached to the current key entry.
+    The record reference is read from the last get_size_of_rec_offset()
+    bytes of the key entry.
+  */ 
+  uchar *get_curr_key_chain()
+  {
+    return get_next_rec_ref(curr_key_entry+key_entry_length-
+                            get_size_of_rec_offset());
+  }
+  
+};
+
+
+/*
+  The class JOIN_TAB_SCAN is a companion class for the classes JOIN_CACHE_BNL
+  and JOIN_CACHE_BNLH. Actually the class implements the iterator over the
+  table joined by BNL/BNLH join algorithm.
+  The virtual functions open, next and close are called for any iteration over
+  the table. The function open is called to initiate the process of the 
+  iteration. The function next shall read the next record from the joined
+  table. The record is read into the record buffer of the joined table.
+  The record is to be matched with records from the join cache buffer. 
+  The function close shall perform the finalizing actions for the iteration.
+*/
+   
+class JOIN_TAB_SCAN: public Sql_alloc
+{
+
+private:
+  /* TRUE while the record to iterate over is the first one from the table */
+  bool is_first_record;
+
+protected:
+
+  /* Table whose records are iterated over */
+  JOIN_TAB *join_tab;
+  /* Join cache used for join_tab; taken from join_tab->cache */
+  JOIN_CACHE *cache;
+  /* 
+    The executed multi-way join; all needed context can be
+    accessed through it.
+  */   
+  JOIN *join;
+
+public:
+  
+  /* Bind the scan to the join 'j', the table 'tab' and the cache of 'tab' */
+  JOIN_TAB_SCAN(JOIN *j, JOIN_TAB *tab)
+    :join_tab(tab), cache(tab->cache), join(j)
+  {}
+
+  virtual ~JOIN_TAB_SCAN() {}
+ 
+  /* 
+    Shall return the increment of the auxiliary buffer needed for a record
+    write if the table scan object uses such a buffer. This default
+    implementation always returns 0.
+  */
+  virtual uint aux_buffer_incr(ulong recno)
+  {
+    return 0;
+  }
+
+  /* Start the iteration over the joined table */
+  virtual int open();
+  /* 
+    Shall read the next candidate to be matched against the records
+    from the join buffer.
+  */
+  virtual int next();
+  /* 
+    Finalize the iteration over the joined table.
+  */ 
+  virtual void close();
+
+};
+
+/*
+  The class JOIN_CACHE_BNL is used when the BNL join algorithm is
+  employed to perform a join operation   
+*/
+
+class JOIN_CACHE_BNL :public JOIN_CACHE
+{
+private:
+  /* 
+    The number of the records in the join buffer that have to be
+    checked yet for a match with the current record of join_tab 
+    read into the record buffer.
+  */
+  uint rem_records;
+
+protected:
+
+  /*
+    The four functions below implement the pure virtual functions that
+    JOIN_CACHE declares for iterating over match candidates in the buffer.
+  */
+
+  /* Prepare to look for records in the join buffer matching the table record */
+  bool prepare_look_for_matches(bool skip_last);
+
+  /* Return the position of the next candidate record from the join buffer */
+  uchar *get_next_candidate_for_match();
+
+  /* Check whether the candidate record at rec_ptr is to be skipped */
+  bool skip_next_candidate_for_match(uchar *rec_ptr);
+
+  /* Read the candidate record at rec_ptr from the join buffer */
+  void read_next_candidate_for_match(uchar *rec_ptr);
+
+public:
+
+  /* 
+    This constructor creates an unlinked BNL join cache. The cache is to be
+    used to join table 'tab' to the result of joining the previous tables 
+    specified by the 'j' parameter.
+  */   
+  JOIN_CACHE_BNL(JOIN *j, JOIN_TAB *tab) :JOIN_CACHE(j, tab) {}
+
+  /* 
+    This constructor creates a linked BNL join cache. The cache is to be 
+    used to join table 'tab' to the result of joining the previous tables 
+    specified by the 'j' parameter. The parameter 'prev' specifies the previous
+    cache object to which this cache is linked.
+  */   
+  JOIN_CACHE_BNL(JOIN *j, JOIN_TAB *tab, JOIN_CACHE *prev) 
+    :JOIN_CACHE(j, tab, prev) {}
+
+  /* Initialize the BNL cache */       
+  int init();
+
+  /* Report the BNL algorithm as the one employed by this cache */
+  enum Join_algorithm get_join_alg() { return BNL_JOIN_ALG; }
+
+  /* This cache reports that there is no key access to the joined table */
+  bool is_key_access() { return FALSE; }
+
+};
+
+
+/*
+  The class JOIN_CACHE_BNLH is used when the BNLH join algorithm is
+  employed to perform a join operation   
+*/
+
+class JOIN_CACHE_BNLH :public JOIN_CACHE_HASHED
+{
+
+protected:
+
+  /* 
+    The pointer to the last record from the circular list of the records
+    that match the join key built out of the record in the join buffer for
+    the join_tab table
+  */
+  uchar *last_matching_rec_ref_ptr;
+  /*
+    The pointer to the next record from the circular list of the
+    records that match the join key built out of the record in the join buffer
+    for the join_tab table. This pointer is used by the class method 
+    get_next_candidate_for_match to iterate over records from the circular
+    list.
+  */
+  uchar *next_matching_rec_ref_ptr;
+
+  /*
+    Get the chain of records from buffer matching the current candidate
+    record for join
+  */
+  uchar *get_matching_chain_by_join_key();
+
+  /*
+    The four functions below implement the pure virtual functions that
+    JOIN_CACHE declares for iterating over match candidates in the buffer.
+  */
+
+  /* Prepare to look for records in the join buffer matching the table record */
+  bool prepare_look_for_matches(bool skip_last);
+
+  /* Return the position of the next candidate record from the join buffer */
+  uchar *get_next_candidate_for_match();
+
+  /* Check whether the candidate record at rec_ptr is to be skipped */
+  bool skip_next_candidate_for_match(uchar *rec_ptr);
+
+  /* Read the candidate record at rec_ptr from the join buffer */
+  void read_next_candidate_for_match(uchar *rec_ptr);
+
+public:
+
+  /* 
+    This constructor creates an unlinked BNLH join cache. The cache is to be
+    used to join table 'tab' to the result of joining the previous tables 
+    specified by the 'j' parameter.
+  */   
+  JOIN_CACHE_BNLH(JOIN *j, JOIN_TAB *tab) : JOIN_CACHE_HASHED(j, tab) {}
+
+  /* 
+    This constructor creates a linked BNLH join cache. The cache is to be 
+    used to join table 'tab' to the result of joining the previous tables 
+    specified by the 'j' parameter. The parameter 'prev' specifies the previous
+    cache object to which this cache is linked.
+  */   
+  JOIN_CACHE_BNLH(JOIN *j, JOIN_TAB *tab, JOIN_CACHE *prev) 
+    : JOIN_CACHE_HASHED(j, tab, prev) {}
+
+  /* Initialize the BNLH cache */       
+  int init();
+
+  /* Report the BNLH algorithm as the one employed by this cache */
+  enum Join_algorithm get_join_alg() { return BNLH_JOIN_ALG; }
+
+  /* This cache reports that there is a key access to the joined table */
+  bool is_key_access() { return TRUE; }
+
+};
+
+
+/*
+  The class JOIN_TAB_SCAN_MRR is a companion class for the classes
+  JOIN_CACHE_BKA and JOIN_CACHE_BKAH. Actually the class implements the
+  iterator over the records from join_tab selected by BKA/BKAH join
+  algorithm as the candidates to be joined. 
+  The virtual functions open, next and close are called for any iteration over
+  join_tab record candidates. The function open is called to initiate the
+  process of the iteration. The function next shall read the next record from
+  the set of the record candidates. The record is read into the record buffer
+  of the joined table. The function close shall perform the finalizing actions
+  for the iteration.
+*/
+   
+class JOIN_TAB_SCAN_MRR: public JOIN_TAB_SCAN
+{
+  /* Interface object to generate key ranges for MRR */
+  RANGE_SEQ_IF range_seq_funcs;
+
+  /* Number of ranges to be processed by the MRR interface */
+  uint ranges;
+
+  /* Flag to be passed to the MRR interface */
+  uint mrr_mode;
+
+  /* MRR buffer associated with this join cache */
+  HANDLER_BUFFER mrr_buff;
+
+  /* Shall initialize the MRR buffer; done by cache->setup_aux_buffer() */
+  virtual void init_mrr_buff()
+  {
+    cache->setup_aux_buffer(mrr_buff);
+  }
+
+public:
+  /* Stores 'rs_funcs' and 'flags'; 'ranges' and 'mrr_buff' are not set here */
+  JOIN_TAB_SCAN_MRR(JOIN *j, JOIN_TAB *tab, uint flags, RANGE_SEQ_IF rs_funcs)
+    :JOIN_TAB_SCAN(j, tab), range_seq_funcs(rs_funcs), mrr_mode(flags) {}
+
+  uint aux_buffer_incr(ulong recno);
+
+  int open();
+ 
+  int next();
+
+  friend class JOIN_CACHE_BKA; /* it needs to add an mrr_mode flag after JOIN_CACHE::init() call */
+};
+
+/*
+  The class JOIN_CACHE_BKA is used when the BKA join algorithm is
+  employed to perform a join operation   
+*/
+
+class JOIN_CACHE_BKA :public JOIN_CACHE
+{
+private:
+
+  /* Flag to be passed to the companion JOIN_TAB_SCAN_MRR object */
+  uint mrr_mode;
+
+  /*
+    This value is set to 1 by the class prepare_look_for_matches method
+    and back to 0 by the class get_next_candidate_for_match method
+  */
+  uint rem_records;
+
+  /*
+    This field contains the current association label set by a call of
+    the multi_range_read_next handler function.
+    See the function JOIN_CACHE_BKA::get_curr_key_association()
+  */
+  uchar *curr_association;
+
+protected:
+
+  /*
+    Get the number of ranges in the cache buffer passed to the MRR
+    interface. For each record its own range is passed.
+  */
+  uint get_number_of_ranges_for_mrr() { return (uint)records; }
+
+  /*
+    Setup the MRR buffer as the space between the last record put into the
+    join buffer (end_pos) and the very end of it (buff+buff_size). Returns 0.
+  */
+  int setup_aux_buffer(HANDLER_BUFFER &aux_buff)
+  {
+    aux_buff.buffer= end_pos;
+    aux_buff.buffer_end= buff+buff_size;
+    return 0;
+  }
+
+  bool prepare_look_for_matches(bool skip_last);
+
+  uchar *get_next_candidate_for_match();
+
+  bool skip_next_candidate_for_match(uchar *rec_ptr);
+
+  void read_next_candidate_for_match(uchar *rec_ptr);
+
+public:
+
+  /*
+    This constructor creates an unlinked BKA join cache. The cache is to be
+    used to join table 'tab' to the result of joining the previous tables
+    specified by the 'j' parameter.
+    The MRR mode initially is set to 'flags'.
+  */
+  JOIN_CACHE_BKA(JOIN *j, JOIN_TAB *tab, uint flags)
+    :JOIN_CACHE(j, tab), mrr_mode(flags) {}
+  /*
+    This constructor creates a linked BKA join cache. The cache is to be
+    used to join table 'tab' to the result of joining the previous tables
+    specified by the 'j' parameter. The parameter 'prev' specifies the previous
+    cache object to which this cache is linked.
+    The MRR mode initially is set to 'flags'.
+  */
+  JOIN_CACHE_BKA(JOIN *j, JOIN_TAB *tab, uint flags, JOIN_CACHE *prev)
+    :JOIN_CACHE(j, tab, prev), mrr_mode(flags) {}
+  /* Returns the address of the curr_association field */
+  uchar **get_curr_association_ptr() { return &curr_association; }
+
+  /* Initialize the BKA cache */
+  int init();
+
+  enum Join_algorithm get_join_alg() { return BKA_JOIN_ALG; }
+
+  bool is_key_access() { return TRUE; }
+
+  /* Get the key built over the next record from the join buffer */
+  uint get_next_key(uchar **key);
+
+  /* Check index condition of the joined table for a record from BKA cache */
+  bool skip_index_tuple(range_id_t range_info);
+
+  void print_explain_comment(String *str);
+};
+
+
+
+/*
+  The class JOIN_CACHE_BKAH is used when the BKAH join algorithm is
+  employed to perform a join operation   
+*/
+
+class JOIN_CACHE_BKAH :public JOIN_CACHE_BNLH
+{
+
+private:
+  /* Flag to be passed to the companion JOIN_TAB_SCAN_MRR object */
+  uint mrr_mode;
+
+  /*
+    This flag is set to TRUE if the implementation of the MRR interface cannot
+    handle range association labels and does not return them to the caller of
+    multi_range_read_next (e.g. the MRR implementation for the Falcon engine).
+    The flag is set by JOIN_CACHE_BKA::init() and is not ever changed.
+    NOTE(review): the flag is a member of JOIN_CACHE_BKAH, so
+    JOIN_CACHE_BKAH::init() is presumably what is meant above - confirm.
+  */
+  bool no_association;
+
+  /*
+    This field contains the association label returned by the
+    multi_range_read_next function.
+    See the function JOIN_CACHE_BKAH::get_curr_key_association()
+  */
+  uchar *curr_matching_chain;
+
+protected:
+
+  uint get_number_of_ranges_for_mrr() { return key_entries; }
+
+  /*
+    Initialize the MRR buffer allocating some space within the join buffer.
+    The entire space between the last record put into the join buffer and the
+    last key entry added to the hash table is used for the MRR buffer.
+  */
+  int setup_aux_buffer(HANDLER_BUFFER &aux_buff)
+  {
+    aux_buff.buffer= end_pos;
+    aux_buff.buffer_end= last_key_entry;
+    return 0;
+  }
+
+  bool prepare_look_for_matches(bool skip_last);
+
+  /*
+    The implementations of the methods
+    - get_next_candidate_for_match
+    - skip_next_candidate_for_match
+    - read_next_candidate_for_match
+    are inherited from the JOIN_CACHE_BNLH class
+  */
+
+public:
+
+  /*
+    This constructor creates an unlinked BKAH join cache. The cache is to be
+    used to join table 'tab' to the result of joining the previous tables
+    specified by the 'j' parameter.
+    The MRR mode initially is set to 'flags'.
+  */
+  JOIN_CACHE_BKAH(JOIN *j, JOIN_TAB *tab, uint flags) 
+    :JOIN_CACHE_BNLH(j, tab), mrr_mode(flags) {}
+
+  /*
+    This constructor creates a linked BKAH join cache. The cache is to be
+    used to join table 'tab' to the result of joining the previous tables
+    specified by the 'j' parameter. The parameter 'prev' specifies the previous
+    cache object to which this cache is linked.
+    The MRR mode initially is set to 'flags'.
+  */
+  JOIN_CACHE_BKAH(JOIN *j, JOIN_TAB *tab, uint flags, JOIN_CACHE *prev)
+    :JOIN_CACHE_BNLH(j, tab, prev), mrr_mode(flags)  {}
+
+  uchar **get_curr_association_ptr() { return &curr_matching_chain; }
+
+  /* Initialize the BKAH cache */
+  int init();
+
+  enum Join_algorithm get_join_alg() { return BKAH_JOIN_ALG; }
+
+  /* Check index condition of the joined table for a record from BKAH cache */
+  bool skip_index_tuple(range_id_t range_info);
+
+  void print_explain_comment(String *str);
+};
diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc
index bd08e6c4f63..984b4c998b4 100644
--- a/sql/sql_lex.cc
+++ b/sql/sql_lex.cc
@@ -27,6 +27,7 @@
 #include <hash.h>
 #include "sp.h"
 #include "sp_head.h"
+#include "sql_select.h"
 
 static int lex_one_token(void *arg, void *yythd);
 
@@ -180,6 +181,7 @@ init_lex_with_single_table(THD *thd, TABLE *table, LEX *lex)
     return TRUE;
   context->resolve_in_table_list_only(table_list);
   lex->use_only_table_context= TRUE;
+  lex->context_analysis_only|= CONTEXT_ANALYSIS_ONLY_VCOL_EXPR;
   select_lex->cur_pos_in_select_list= UNDEF_POS;
   table->map= 1; //To ensure correct calculation of const item
   table->get_fields_in_item_tree= TRUE;
@@ -466,7 +468,6 @@ void lex_start(THD *thd)
   lex->context_analysis_only= 0;
   lex->derived_tables= 0;
   lex->safe_to_cache_query= 1;
-  lex->leaf_tables_insert= 0;
   lex->parsing_options.reset();
   lex->empty_field_list_on_rset= 0;
   lex->select_lex.select_number= 1;
@@ -1036,45 +1037,48 @@ int lex_one_token(void *arg, void *yythd)
 	yylval->lex_str.length=2;
 	return NULL_SYM;
       }
+      /* Fall through */
     case MY_LEX_CHAR:			// Unknown or single char token
     case MY_LEX_SKIP:			// This should not happen
-      if (c == '-' && lip->yyPeek() == '-' &&
+      if (c != ')')
+	lip->next_state= MY_LEX_START;	// Allow signed numbers
+      return((int) c);
+
+    case MY_LEX_MINUS_OR_COMMENT:
+      if (lip->yyPeek() == '-' &&
           (my_isspace(cs,lip->yyPeekn(1)) ||
            my_iscntrl(cs,lip->yyPeekn(1))))
       {
         state=MY_LEX_COMMENT;
         break;
       }
+      lip->next_state= MY_LEX_START;	// Allow signed numbers
+      return((int) c);
 
-      if (c != ')')
-	lip->next_state= MY_LEX_START;	// Allow signed numbers
-
-      if (c == ',')
-      {
-        /*
-          Warning:
-          This is a work around, to make the "remember_name" rule in
-          sql/sql_yacc.yy work properly.
-          The problem is that, when parsing "select expr1, expr2",
-          the code generated by bison executes the *pre* action
-          remember_name (see select_item) *before* actually parsing the
-          first token of expr2.
-        */
-        lip->restart_token();
-      }
-      else
-      {
-        /*
-          Check for a placeholder: it should not precede a possible identifier
-          because of binlogging: when a placeholder is replaced with
-          its value in a query for the binlog, the query must stay
-          grammatically correct.
-        */
-        if (c == '?' && lip->stmt_prepare_mode &&
-            !ident_map[(uchar) lip->yyPeek()])
+    case MY_LEX_PLACEHOLDER:
+      /*
+        Check for a placeholder: it should not precede a possible identifier
+        because of binlogging: when a placeholder is replaced with
+        its value in a query for the binlog, the query must stay
+        grammatically correct.
+      */
+      lip->next_state= MY_LEX_START;	// Allow signed numbers
+      if (lip->stmt_prepare_mode && !ident_map[(uchar) lip->yyPeek()])
         return(PARAM_MARKER);
-      }
+      return((int) c);
 
+    case MY_LEX_COMMA:
+      lip->next_state= MY_LEX_START;	// Allow signed numbers
+      /*
+        Warning:
+        This is a work around, to make the "remember_name" rule in
+        sql/sql_yacc.yy work properly.
+        The problem is that, when parsing "select expr1, expr2",
+        the code generated by bison executes the *pre* action
+        remember_name (see select_item) *before* actually parsing the
+        first token of expr2.
+      */
+      lip->restart_token();
       return((int) c);
 
     case MY_LEX_IDENT_OR_NCHAR:
@@ -1809,6 +1813,7 @@ void st_select_lex_unit::init_query()
   describe= 0;
   found_rows_for_union= 0;
   insert_table_with_stored_vcol= 0;
+  derived= 0;
 }
 
 void st_select_lex::init_query()
@@ -1817,7 +1822,9 @@ void st_select_lex::init_query()
   table_list.empty();
   top_join_list.empty();
   join_list= &top_join_list;
-  embedding= leaf_tables= 0;
+  embedding= 0;
+  leaf_tables_prep.empty();
+  leaf_tables.empty();
   item_list.empty();
   join= 0;
   having= prep_having= where= prep_where= 0;
@@ -1849,6 +1856,7 @@ void st_select_lex::init_query()
   exclude_from_table_unique_test= no_wrap_view_item= FALSE;
   nest_level= 0;
   link_next= 0;
+  is_prep_leaf_list_saved= FALSE;
 
   bzero((char*) expr_cache_may_be_used, sizeof(expr_cache_may_be_used));
 }
@@ -1857,6 +1865,7 @@ void st_select_lex::init_select()
 {
   st_select_lex_node::init_select();
   sj_nests.empty();
+  sj_subselects.empty();
   group_list.empty();
   type= db= 0;
   having= 0;
@@ -1883,6 +1892,8 @@ void st_select_lex::init_select()
   cond_value= having_value= Item::COND_UNDEF;
   inner_refs_list.empty();
   full_group_by_flag= 0;
+  insert_tables= 0;
+  merged_into= 0;
 }
 
 /*
@@ -1900,6 +1911,31 @@ void st_select_lex_node::include_down(st_select_lex_node *upper)
   slave= 0;
 }
 
+
+/* Link slave_arg under this node as a slave unless it already is one. */
+void st_select_lex_node::add_slave(st_select_lex_node *slave_arg)
+{
+  /* Search with a local cursor: stepping the 'slave' member itself would
+     lose the existing slaves and make the branch below unreachable. */
+  for (st_select_lex_node *cur= slave; cur; cur= cur->next)
+    if (cur == slave_arg)
+      return;
+  if (slave)
+  {
+    st_select_lex_node *slave_arg_slave= slave_arg->slave;
+    /* Link slave_arg in as a neighbour of the current first slave. */
+    slave_arg->include_neighbour(slave);
+    /* include_neighbour() sets slave_arg->slave=0, restore it. */
+    slave_arg->slave= slave_arg_slave;
+    /* Count on include_neighbour() setting the master. */
+    DBUG_ASSERT(slave_arg->master == this);
+    return;
+  }
+  slave= slave_arg;
+  slave_arg->master= this;
+}
+
+
 /*
   include on level down (but do not link)
 
@@ -1951,17 +1987,29 @@ void st_select_lex_node::fast_exclude()
   
 }
 
+
+/*
+  Unlink this node from its chain of neighbours: *prev is redirected to
+  'next' and next->prev to 'prev'. No other link of the node is touched.
+*/
+
+void st_select_lex_node::exclude_from_tree()
+{
+  *prev= next;
+  if (next) next->prev= prev;
+}
+
+
 /*
-  excluding select_lex structure (except first (first select can't be
+  Exclude select_lex structure (except first (first select can't be
   deleted, because it is most upper select))
 */
 void st_select_lex_node::exclude()
 {
-  //exclude from global list
+  /* exclude from global list */
   fast_exclude();
-  //exclude from other structures
-  if ((*prev= next))
-    next->prev= prev;
+  /* exclude from other structures */
+  exclude_from_tree();
   /* 
      We do not need following statements, because prev pointer of first 
      list element point to master->slave
@@ -2048,55 +2096,6 @@ void st_select_lex_unit::exclude_tree()
 }
 
 
-/**
-  Register reference to an item which the subqueries depends on
-
-  @param def_sel         select against which the item is resolved
-  @param dependency      reference to the item
-
-  @details
-  This function puts the reference dependency to an item that is either an
-  outer field or an aggregate function resolved against an outer select into
-  the list 'depends_on'. It adds it to the 'depends_on' lists for each
-  subquery between this one and 'def_sel' - the subquery against which the
-  item is resolved.
-*/
-
-void st_select_lex::register_dependency_item(st_select_lex *def_sel,
-                                             Item **dependency)
-{
-  SELECT_LEX *s= this;
-  DBUG_ENTER("st_select_lex::register_dependency_item");
-  DBUG_ASSERT(this != def_sel);
-  DBUG_ASSERT(*dependency);
-  do
-  {
-    /* check duplicates */
-    List_iterator_fast<Item*> li(s->master_unit()->item->depends_on);
-    Item **dep;
-    while ((dep= li++))
-    {
-      if ((*dep)->eq(*dependency, FALSE))
-      {
-         DBUG_PRINT("info", ("dependency %s already present",
-                             ((*dependency)->name ?
-                              (*dependency)->name :
-                              "<no name>")));
-         DBUG_VOID_RETURN;
-      }
-    }
-
-    s->master_unit()->item->depends_on.push_back(dependency);
-    DBUG_PRINT("info", ("depends_on: Select: %d  added: %s",
-                        s->select_number,
-                        ((*dependency)->name ?
-                         (*dependency)->name :
-                         "<no name>")));
-  } while ((s= s->outer_select()) != def_sel);
-  DBUG_VOID_RETURN;
-}
-
-
 /*
   st_select_lex_node::mark_as_dependent mark all st_select_lex struct from 
   this to 'last' as dependent
@@ -2121,18 +2120,19 @@ bool st_select_lex::mark_as_dependent(THD *thd, st_select_lex *last, Item *depen
   SELECT_LEX *s= this;
   do
   {
-    if (!(s->uncacheable & UNCACHEABLE_DEPENDENT))
+    if (!(s->uncacheable & UNCACHEABLE_DEPENDENT_GENERATED))
     {
       // Select is dependent of outer select
       s->uncacheable= (s->uncacheable & ~UNCACHEABLE_UNITED) |
-                       UNCACHEABLE_DEPENDENT;
+                       UNCACHEABLE_DEPENDENT_GENERATED;
       SELECT_LEX_UNIT *munit= s->master_unit();
       munit->uncacheable= (munit->uncacheable & ~UNCACHEABLE_UNITED) |
-                       UNCACHEABLE_DEPENDENT;
+                       UNCACHEABLE_DEPENDENT_GENERATED;
       for (SELECT_LEX *sl= munit->first_select(); sl ; sl= sl->next_select())
       {
         if (sl != s &&
-            !(sl->uncacheable & (UNCACHEABLE_DEPENDENT | UNCACHEABLE_UNITED)))
+            !(sl->uncacheable & (UNCACHEABLE_DEPENDENT_GENERATED |
+                                 UNCACHEABLE_UNITED)))
           sl->uncacheable|= UNCACHEABLE_UNITED;
       }
     }
@@ -2270,20 +2270,22 @@ ulong st_select_lex::get_table_join_options()
 
 bool st_select_lex::setup_ref_array(THD *thd, uint order_group_num)
 {
+  DBUG_ENTER("st_select_lex::setup_ref_array");
+
   if (ref_pointer_array)
-    return 0;
+    DBUG_RETURN(0);
 
   /*
-    We have to create array in prepared statement memory if it is
+    We have to create array in prepared statement memory if it is a
     prepared statement
   */
-  Query_arena *arena= thd->stmt_arena;
-  return (ref_pointer_array=
-          (Item **)arena->alloc(sizeof(Item*) * (n_child_sum_items +
-                                                 item_list.elements +
-                                                 select_n_having_items +
-                                                 select_n_where_fields +
-                                                 order_group_num)*5)) == 0;
+  ref_pointer_array=
+    (Item **)thd->stmt_arena->alloc(sizeof(Item*) * (n_child_sum_items +
+                                                     item_list.elements +
+                                                     select_n_having_items +
+                                                     select_n_where_fields +
+                                                     order_group_num)*5);
+  DBUG_RETURN(ref_pointer_array == 0);
 }
 
 
@@ -2328,9 +2330,27 @@ void st_select_lex::print_order(String *str,
   {
     if (order->counter_used)
     {
-      char buffer[20];
-      size_t length= my_snprintf(buffer, 20, "%d", order->counter);
-      str->append(buffer, (uint) length);
+      if (query_type != QT_VIEW_INTERNAL)
+      {
+        char buffer[20];
+        size_t length= my_snprintf(buffer, 20, "%d", order->counter);
+        str->append(buffer, (uint) length);
+      }
+      else
+      {
+        /* replace numeric reference with expression */
+        if (order->item[0]->type() == Item::INT_ITEM &&
+            order->item[0]->basic_const_item())
+        {
+          char buffer[20];
+          size_t length= my_snprintf(buffer, 20, "%d", order->counter);
+          str->append(buffer, (uint) length);
+          /* make it expression instead of integer constant */
+          str->append(STRING_WITH_LEN("+0"));
+        }
+        else
+          (*order->item)->print(str, query_type);
+      }
     }
     else
       (*order->item)->print(str, query_type);
@@ -2356,17 +2376,6 @@ void st_select_lex::print_limit(THD *thd,
         subs_type == Item_subselect::IN_SUBS ||
         subs_type == Item_subselect::ALL_SUBS)
     {
-      DBUG_ASSERT(!item->fixed ||
-                  /*
-                    If not using materialization both:
-                    select_limit == 1, and there should be no offset_limit.
-                  */
-                  (((subs_type == Item_subselect::IN_SUBS) &&
-                    ((Item_in_subselect*)item)->exec_method ==
-                    Item_in_subselect::MATERIALIZATION) ?
-                   TRUE :
-                   (select_limit->val_int() == 1LL) &&
-                   offset_limit == 0));
       return;
     }
   }
@@ -3179,10 +3188,14 @@ static void fix_prepare_info_in_table_list(THD *thd, TABLE_LIST *tbl)
   {
     if (tbl->on_expr)
     {
-      tbl->prep_on_expr= tbl->on_expr;
+      thd->check_and_register_item_tree(&tbl->prep_on_expr, &tbl->on_expr);
       tbl->on_expr= tbl->on_expr->copy_andor_structure(thd);
     }
-    fix_prepare_info_in_table_list(thd, tbl->merge_underlying_list);
+    if (tbl->is_view_or_derived() && tbl->is_merged_derived())
+    {
+      SELECT_LEX *sel= tbl->get_single_select();
+      fix_prepare_info_in_table_list(thd, sel->get_table_list());
+    }
   }
 }
 
@@ -3213,12 +3226,12 @@ void st_select_lex::fix_prepare_information(THD *thd, Item **conds,
     first_execution= 0;
     if (*conds)
     {
-      prep_where= *conds;
+      thd->check_and_register_item_tree(&prep_where, conds);
       *conds= where= prep_where->copy_andor_structure(thd);
     }
     if (*having_conds)
     {
-      prep_having= *having_conds;
+      thd->check_and_register_item_tree(&prep_having, having_conds);
       *having_conds= having= prep_having->copy_andor_structure(thd);
     }
     fix_prepare_info_in_table_list(thd, table_list.first);
@@ -3297,6 +3310,545 @@ bool st_select_lex::add_index_hint (THD *thd, char *str, uint length)
                                             str, length));
 }
 
+
+/*
+  Optimize the JOINs of the selects of every inner unit that has a subquery
+  item, except units of IN subqueries with is_jtbm_merged set.
+  Returns TRUE as soon as an optimize() call returns non-zero, else FALSE.
+*/
+bool st_select_lex::optimize_unflattened_subqueries()
+{
+  for (SELECT_LEX_UNIT *unit= first_inner_unit(); unit;
+       unit= unit->next_unit())
+  {
+    Item_subselect *subq= unit->item;
+    if (!subq)
+      continue;                           /* the unit has no subquery item */
+    if (subq->substype() == Item_subselect::IN_SUBS &&
+        ((Item_in_subselect*) subq)->is_jtbm_merged)
+      continue;
+
+    for (SELECT_LEX *sel= unit->first_select(); sel; sel= sel->next_select())
+    {
+      JOIN *subq_join= sel->join;
+      if (!subq_join)
+        continue;
+      SELECT_LEX *saved_select= unit->thd->lex->current_select;
+      /* We need only 1 row to determine existence */
+      unit->set_limit(unit->global_parameters);
+      unit->thd->lex->current_select= sel;
+      ulonglong saved_options= subq_join->select_options;
+      if (options & SELECT_DESCRIBE)
+      {
+        /* Optimize the subquery in the context of EXPLAIN. */
+        sel->set_explain_type();
+        sel->options|= SELECT_DESCRIBE;
+        subq_join->select_options|= SELECT_DESCRIBE;
+      }
+      int rc= subq_join->optimize();
+      /* Put back the saved options and current select before testing rc. */
+      subq_join->select_options= saved_options;
+      unit->thd->lex->current_select= saved_select;
+      if (rc)
+        return TRUE;
+    }
+  }
+  return FALSE;
+}
+
+
+
+/**
+  @brief Run the given phases on every view/derived table of this SELECT.
+
+  @param lex     LEX passed through to TABLE_LIST::handle_derived()
+  @param phases  phases to run derived tables/views through
+
+  @details
+  Walks table_list along next_local; tables for which is_view_or_derived()
+  is false are skipped. The walk stops at the first failure.
+
+  @return FALSE ok.
+  @return TRUE  a table's handle_derived() reported an error.
+*/
+
+bool st_select_lex::handle_derived(LEX *lex, uint phases)
+{
+  TABLE_LIST *tbl= (TABLE_LIST*) table_list.first;
+  while (tbl)
+  {
+    if (tbl->is_view_or_derived() && tbl->handle_derived(lex, phases))
+      return TRUE;
+    tbl= tbl->next_local;
+  }
+  return FALSE;
+}
+
+
+/**
+  @brief
+  Compute a table map and a table number above those of all leaf tables
+
+  @param map     [out] the largest map found in leaf_tables, shifted left by 1
+  @param tablenr [out] the largest tablenr found in leaf_tables, plus one
+
+  @details
+  Scans leaf_tables for the largest table map and table number in use and
+  returns the next ones in *'map' and *'tablenr' accordingly.
+
+  @return TRUE  the resulting table number is not below MAX_TABLES
+  @return FALSE found free table map/table number
+*/
+
+bool st_select_lex::get_free_table_map(table_map *map, uint *tablenr)
+{
+  table_map max_map= 0;
+  uint max_tablenr= 0;
+  List_iterator<TABLE_LIST> leaf_it(leaf_tables);
+  TABLE_LIST *leaf;
+  while ((leaf= leaf_it++))
+  {
+    if (leaf->table->map > max_map)
+      max_map= leaf->table->map;
+    if (leaf->table->tablenr > max_tablenr)
+      max_tablenr= leaf->table->tablenr;
+  }
+  *map= max_map << 1;
+  *tablenr= max_tablenr + 1;
+  if (*tablenr >= MAX_TABLES)
+    return TRUE;
+  return FALSE;
+}
+
+
+/**
+  @brief
+  Attach a table to the end of a chain that starts at the first leaf table.
+
+  @param link  Pointer-to-member selecting which chain of TABLE_LIST to use
+  @param table Table to append
+
+  @details
+  Starting from leaf_tables.head(), follows 'link' to the last element of
+  the chain and makes that element's 'link' point to 'table'. Whatever is
+  already chained behind 'table' through 'link' is attached along with it.
+*/
+
+void st_select_lex::append_table_to_list(TABLE_LIST *TABLE_LIST::*link,
+                                         TABLE_LIST *table)
+{
+  TABLE_LIST *last= leaf_tables.head();
+  while (last->*link) last= last->*link;
+  last->*link= table;
+}
+
+/**
+  @brief
+  Remove given table from the leaf_tables list.
+
+  @param table Table to remove
+
+  @details
+  Removes the first element of leaf_tables that is 'table', if there is
+  one; otherwise the list is left as it is.
+*/
+
+void st_select_lex::remove_table_from_list(TABLE_LIST *table)
+{
+  List_iterator<TABLE_LIST> leaf_it(leaf_tables);
+  TABLE_LIST *leaf;
+  /* Only the first match is removed; the scan ends right after it. */
+  while ((leaf= leaf_it++))
+  {
+    if (leaf != table)
+      continue;
+    leaf_it.remove();
+    return;
+  }
+}
+
+
+/**
+  @brief
+  Assigns new table maps to tables in the leaf_tables list
+
+  @param derived    Derived table to take initial table map from
+  @param map        table map to begin with
+  @param tablenr    table number to begin with
+  @param parent_lex new parent select_lex
+
+  @details
+  Assign new table maps/table numbers to all tables in the leaf_tables list.
+  If 'derived' is given and has a TABLE attached, the first leaf table gets
+  the map/number of that TABLE and 'map'/'tablenr' go to the second leaf.
+  Otherwise the first leaf table gets 'map'/'tablenr' itself. For each
+  subsequent leaf the map is shifted left by one and the number increased.
+  'parent_lex' becomes the select_lex of every leaf table and of those
+  enclosing nests (followed through 'embedding') that shared the leaf's
+  previous select_lex.
+*/
+
+void st_select_lex::remap_tables(TABLE_LIST *derived, table_map map,
+                                 uint tablenr, SELECT_LEX *parent_lex)
+{
+  table_map head_map;
+  uint head_tablenr;
+
+  if (derived && derived->table)
+  {
+    /* The first leaf takes the map/number of the derived table's TABLE. */
+    head_map= derived->table->map;
+    head_tablenr= derived->table->tablenr;
+  }
+  else
+  {
+    /* Otherwise the first leaf consumes 'map'/'tablenr' themselves. */
+    head_map= map;
+    head_tablenr= tablenr;
+    map<<= 1;
+    tablenr++;
+  }
+
+  List_iterator<TABLE_LIST> leaf_it(leaf_tables);
+  TABLE_LIST *leaf;
+  for (bool is_head= TRUE; (leaf= leaf_it++); is_head= FALSE)
+  {
+    if (is_head)
+    {
+      leaf->table->set_table_map(head_map, head_tablenr);
+    }
+    else
+    {
+      /* Later leaves get consecutive bits/numbers. */
+      leaf->table->set_table_map(map, tablenr);
+      map<<= 1;
+      tablenr++;
+    }
+    /* Rebind the leaf and the nests embedding it to the new parent. */
+    SELECT_LEX *prev_parent= leaf->select_lex;
+    leaf->select_lex= parent_lex;
+    TABLE_LIST *nest= leaf->embedding;
+    while (nest && nest->select_lex == prev_parent)
+    {
+      nest->select_lex= parent_lex;
+      nest= nest->embedding;
+    }
+  }
+}
+
+/**
+  @brief
+  Merge a subquery into this select.
+
+  @param thd         thread handle (its lex->sql_command is consulted)
+  @param derived     derived table of the subquery to be merged
+  @param subq_select select_lex of the subquery
+  @param table_no    table number for assigning to merged tables from subquery
+  @param map         table map for assigning to merged tables from subquery
+
+  @details
+  This function merges a subquery into its parent select. In short the
+  merge operation appends the subquery FROM table list to the parent's
+  FROM table list. In more details:
+    .) the top_join_list of the subquery is wrapped into a join_nest
+       and attached to 'derived'
+    .) subquery's leaf_tables list is merged with the leaf_tables
+       list of this select_lex
+    .) the table maps and table numbers of the tables merged from
+       the subquery are adjusted to reflect their new binding to
+       this select
+
+  @return FALSE always; the code below has no failure path
+*/
+
+bool SELECT_LEX::merge_subquery(THD *thd, TABLE_LIST *derived,
+                                SELECT_LEX *subq_select,
+                                uint table_no, table_map map)
+{
+  derived->wrap_into_nested_join(subq_select->top_join_list);
+  /* Append the subquery's leaf tables to the leaf tables of this select. */
+  leaf_tables.concat(&subq_select->leaf_tables);
+
+  ftfunc_list->concat(subq_select->ftfunc_list);
+  if (join ||
+      thd->lex->sql_command == SQLCOM_UPDATE_MULTI ||
+      thd->lex->sql_command == SQLCOM_DELETE_MULTI)
+  {
+    List_iterator_fast<Item_in_subselect> li(subq_select->sj_subselects);
+    Item_in_subselect *in_subq;
+    while ((in_subq= li++))
+    {
+      sj_subselects.push_back(in_subq);   /* NOTE(review): result unchecked */
+      if (in_subq->emb_on_expr_nest == NO_JOIN_NEST)
+         in_subq->emb_on_expr_nest= derived;
+    }
+  }
+  /*
+    Remove merged table from chain.
+    When merge_subquery is called at a subquery-to-semijoin transformation
+    the derived isn't in the leaf_tables list, so in this case the call of
+    remove_table_from_list does not cause any actions.
+  */
+  remove_table_from_list(derived);
+
+  /* Walk through child's tables and adjust table map, tablenr,
+   * parent_lex. Note the argument order: map first, then table_no. */
+  subq_select->remap_tables(derived, map, table_no, this);
+  subq_select->merged_into= this;
+  return FALSE;
+}
+
+
+/**
+  @brief
+  Mark the tables of the leaf_tables list as belonging to a derived table.
+
+  @param derived   stored in belong_to_derived of every leaf table
+
+  @details
+  Every leaf table also gets open_type set to OT_BASE_ONLY.
+*/
+
+void SELECT_LEX::mark_as_belong_to_derived(TABLE_LIST *derived)
+{
+  /* Visit each element of leaf_tables exactly once. */
+  List_iterator<TABLE_LIST> leaf_it(leaf_tables);
+  for (TABLE_LIST *leaf= leaf_it++; leaf; leaf= leaf_it++)
+  {
+    /* Tie the leaf to the derived table and restrict its open type. */
+    leaf->belong_to_derived= derived;
+    leaf->open_type= OT_BASE_ONLY;
+  }
+}
+
+
+/**
+  @brief
+  Update used_tables cache for this select
+  @details
+  Refreshes used_tables of: ON expressions of the leaf tables and of their
+  embedding nests, join->conds, join->having, join->fields_list, the outer
+  refs of inner_refs_list, GROUP BY and (unless in a union) ORDER BY items.
+*/
+
+void SELECT_LEX::update_used_tables()
+{
+  TABLE_LIST *tl;
+  List_iterator<TABLE_LIST> ti(leaf_tables);
+  while ((tl= ti++))
+  {
+    TABLE_LIST *embedding;
+    embedding= tl;
+    do                        /* checks tl itself first, then each nest */
+    {
+      bool maybe_null;
+      if ((maybe_null= test(embedding->outer_join)))
+      {
+	tl->table->maybe_null= maybe_null;
+        break;
+      }
+    }
+    while ((embedding= embedding->embedding));
+    if (tl->on_expr)
+    {
+      tl->on_expr->update_used_tables();
+      tl->on_expr->walk(&Item::eval_not_null_tables, 0, NULL);
+    }
+    embedding= tl->embedding;
+    while (embedding)
+    {
+      if (embedding->on_expr && 
+          embedding->nested_join->join_list.head() == tl)
+      {
+        embedding->on_expr->update_used_tables();
+        embedding->on_expr->walk(&Item::eval_not_null_tables, 0, NULL);
+      }
+      tl= embedding;          /* tl is reused as the cursor climbing nests */
+      embedding= tl->embedding;
+    }
+  }
+  if (join->conds)            /* note: 'join' is dereferenced unchecked */
+  {
+    join->conds->update_used_tables();
+    join->conds->walk(&Item::eval_not_null_tables, 0, NULL);
+  }
+  if (join->having)
+  {
+    join->having->update_used_tables();
+  }
+
+  Item *item;
+  List_iterator_fast<Item> it(join->fields_list);
+  while ((item= it++))
+  {
+    item->update_used_tables();
+  }
+  Item_outer_ref *ref;
+  List_iterator_fast<Item_outer_ref> ref_it(inner_refs_list);
+  while ((ref= ref_it++))
+  {
+    item= ref->outer_ref;
+    item->update_used_tables();
+  }
+  for (ORDER *order= group_list.first; order; order= order->next)
+    (*order->item)->update_used_tables();
+  if (!master_unit()->is_union())
+  {
+    for (ORDER *order= order_list.first; order; order= order->next)
+      (*order->item)->update_used_tables();
+  }      
+}
+
+
+/**
+  Set the EXPLAIN type for this subquery.
+
+  Stores one of the strings "PRIMARY", "SIMPLE", "DERIVED", "[DEPENDENT |
+  UNCACHEABLE] SUBQUERY" or "[DEPENDENT | UNCACHEABLE] UNION" in 'type' and
+  sets SELECT_DESCRIBE in 'options'.
+*/
+
+void st_select_lex::set_explain_type()
+{
+  /* A select that has a next select in its unit counts as primary. */
+  bool is_primary= next_select() ? TRUE : FALSE;
+  if (!is_primary)
+  {
+    /*
+      If there is at least one unit that is not a derived table/view or is
+      a materialized one then it's a PRIMARY select. Otherwise all derived
+      tables/views were merged and this select is a SIMPLE one.
+    */
+    for (SELECT_LEX_UNIT *un= first_inner_unit();
+         un && !is_primary;
+         un= un->next_unit())
+      is_primary= (!un->derived || un->derived->is_materialized_derived());
+  }
+
+  SELECT_LEX *first= master_unit()->first_select();
+  /* drop UNCACHEABLE_EXPLAIN, because it is for internal usage only */
+  uint8 is_uncacheable= (uncacheable & ~UNCACHEABLE_EXPLAIN);
+  /* DEPENDENT is tested before plain UNCACHEABLE in the labels below */
+  bool is_dependent= (is_uncacheable & UNCACHEABLE_DEPENDENT) != 0;
+
+  /* The checks are ordered: lex->select_lex, later unit member, first. */
+  if (&master_unit()->thd->lex->select_lex == this)
+    type= is_primary ? "PRIMARY" : "SIMPLE";
+  else if (this != first)
+    type= is_dependent ? "DEPENDENT UNION" :
+          is_uncacheable ? "UNCACHEABLE UNION" : "UNION";
+  else if (linkage == DERIVED_TABLE_TYPE)
+    type= "DERIVED";
+  else
+    type= is_dependent ? "DEPENDENT SUBQUERY" :
+          is_uncacheable ? "UNCACHEABLE SUBQUERY" : "SUBQUERY";
+
+  options|= SELECT_DESCRIBE;
+}
+
+
+/**
+  @brief
+  Add to the record estimate kept for this select's derived table/view
+
+  @param records  amount to add to the estimate
+
+  @details
+  The estimate is the 'records' member of the master unit's result object,
+  which is accessed as a select_union. The unit must have 'derived' set.
+*/
+
+void SELECT_LEX::increase_derived_records(ha_rows records)
+{
+  SELECT_LEX_UNIT *owner= master_unit();
+  DBUG_ASSERT(owner->derived);
+  /* The unit's result object is treated as a select_union here. */
+  select_union *sink= (select_union*) owner->result;
+  sink->records= sink->records + records;
+}
+
+
+/**
+  @brief
+  Mark select's derived table as a const one.
+
+  @param empty Whether select has an empty result set
+
+  @details
+  Does nothing when lex->describe is set or the master unit has no derived
+  table. Otherwise the record estimate is increased by 1 unless 'empty',
+  and fill_me is set on the derived table unless the unit is a union or
+  the derived table is a merged one.
+*/
+
+void SELECT_LEX::mark_const_derived(bool empty)
+{
+  TABLE_LIST *derived_tbl= master_unit()->derived;
+  /* Nothing to do when lex->describe is set or there is no derived table. */
+  if (join->thd->lex->describe || !derived_tbl)
+    return;
+  if (!empty)
+    increase_derived_records(1);
+  if (!(master_unit()->is_union() || derived_tbl->is_merged_derived()))
+    derived_tbl->fill_me= TRUE;
+}
+
+
+/* Append each leaf table to leaf_tables_exec and record its current tablenr,
+   map and maybe_null in the *_exec fields. Returns TRUE if push_back fails. */
+bool st_select_lex::save_leaf_tables(THD *thd)
+{
+  bool error= FALSE;
+  Query_arena *arena= thd->stmt_arena, backup;
+  if (arena->is_conventional())
+    arena= 0;
+  else
+    thd->set_n_backup_active_arena(arena, &backup);
+
+  List_iterator_fast<TABLE_LIST> li(leaf_tables);
+  TABLE_LIST *table;
+  while ((table= li++))
+  {
+    if ((error= leaf_tables_exec.push_back(table)))
+      break;              /* do not return here: the arena must be restored */
+    table->tablenr_exec= table->table->tablenr;
+    table->map_exec= table->table->map;
+    table->maybe_null_exec= (join && (join->select_options & SELECT_DESCRIBE)) ?
+                            0 : table->table->maybe_null;
+  }
+  if (arena)
+    thd->restore_active_arena(arena, &backup);
+  return error;
+}
+
+
+/* Copy leaf_tables into leaf_tables_prep when thd->save_prep_leaf_list is
+   set. Returns TRUE if a push_back fails, FALSE otherwise. */
+bool st_select_lex::save_prep_leaf_tables(THD *thd)
+{
+  if (!thd->save_prep_leaf_list)
+    return 0;
+  bool error= FALSE;
+  Query_arena *arena= thd->stmt_arena, backup;
+  if (arena->is_conventional())
+    arena= 0;
+  else
+    thd->set_n_backup_active_arena(arena, &backup);
+  List_iterator_fast<TABLE_LIST> li(leaf_tables);
+  TABLE_LIST *table;
+  while (!error && (table= li++))
+    error= leaf_tables_prep.push_back(table);
+  if (!error)
+  {
+    thd->lex->select_lex.is_prep_leaf_list_saved= TRUE;
+    thd->save_prep_leaf_list= FALSE;
+  }
+  if (arena)                      /* restored on the failure path as well */
+    thd->restore_active_arena(arena, &backup);
+  return error;
+}
+
+
 /**
   A routine used by the parser to decide whether we are specifying a full
   partitioning or if only partitions to add or to split.
diff --git a/sql/sql_lex.h b/sql/sql_lex.h
index 480fcf87a15..1c72586e5f7 100644
--- a/sql/sql_lex.h
+++ b/sql/sql_lex.h
@@ -515,7 +515,8 @@ public:
 
   /*
     result of this query can't be cached, bit field, can be :
-      UNCACHEABLE_DEPENDENT
+      UNCACHEABLE_DEPENDENT_GENERATED
+      UNCACHEABLE_DEPENDENT_INJECTED
       UNCACHEABLE_RAND
       UNCACHEABLE_SIDEEFFECT
       UNCACHEABLE_EXPLAIN
@@ -549,10 +550,12 @@ public:
   virtual void init_query();
   virtual void init_select();
   void include_down(st_select_lex_node *upper);
+  void add_slave(st_select_lex_node *slave_arg);
   void include_neighbour(st_select_lex_node *before);
   void include_standalone(st_select_lex_node *sel, st_select_lex_node **ref);
   void include_global(st_select_lex_node **plink);
   void exclude();
+  void exclude_from_tree();
 
   virtual st_select_lex_unit* master_unit()= 0;
   virtual st_select_lex* outer_select()= 0;
@@ -577,6 +580,11 @@ public:
   friend bool mysql_new_select(LEX *lex, bool move_down);
   friend bool mysql_make_view(THD *thd, File_parser *parser,
                               TABLE_LIST *table, uint flags);
+  friend bool mysql_derived_prepare(THD *thd, LEX *lex,
+                                  TABLE_LIST *orig_table_list);
+  friend bool mysql_derived_merge(THD *thd, LEX *lex,
+                                  TABLE_LIST *orig_table_list);
+  friend bool TABLE_LIST::init_derived(THD *thd, bool init_view);
 private:
   void fast_exclude();
 };
@@ -597,9 +605,6 @@ class st_select_lex_unit: public st_select_lex_node {
 protected:
   TABLE_LIST result_table_list;
   select_union *union_result;
-  TABLE *table; /* temporary table using for appending UNION results */
-
-  select_result *result;
   ulonglong found_rows_for_union;
   bool saved_error;
 
@@ -612,6 +617,9 @@ public:
   {
   }
 
+
+  TABLE *table; /* temporary table using for appending UNION results */
+  select_result *result;
   bool  prepared, // prepare phase already performed for UNION (unit)
     optimized, // optimize phase already performed for UNION (unit)
     executed, // already executed
@@ -638,6 +646,11 @@ public:
   ha_rows select_limit_cnt, offset_limit_cnt;
   /* not NULL if unit used in subselect, point to subselect item */
   Item_subselect *item;
+  /*
+    TABLE_LIST representing this union in the embedding select. Used for
+    derived tables/views handling.
+  */
+  TABLE_LIST *derived;
   /* thread handler */
   THD *thd;
   /*
@@ -674,6 +687,7 @@ public:
 
   /* UNION methods */
   bool prepare(THD *thd, select_result *result, ulong additional_options);
+  bool optimize();
   bool exec();
   bool cleanup();
   inline void unclean() { cleaned= 0; }
@@ -690,6 +704,8 @@ public:
   void set_thd(THD *thd_arg) { thd= thd_arg; }
   inline bool is_union (); 
 
+  void set_unique_exclude();
+
   friend void lex_start(THD *thd);
   friend int subselect_union_engine::exec();
 
@@ -735,8 +751,25 @@ public:
     Beginning of the list of leaves in a FROM clause, where the leaves
     inlcude all base tables including view tables. The tables are connected
     by TABLE_LIST::next_leaf, so leaf_tables points to the left-most leaf.
+
+    List of all base tables local to a subquery including all view
+    tables. Unlike 'next_local', in this list views are *not*
+    leaves. Created in setup_tables() -> make_leaves_list().
   */
-  TABLE_LIST *leaf_tables;
+  /* 
+    Subqueries that will need to be converted to semi-join nests, including
+    those converted to jtbm nests. The list is emptied when conversion is done.
+  */
+  List<Item_in_subselect> sj_subselects;
+
+  List<TABLE_LIST> leaf_tables;
+  List<TABLE_LIST> leaf_tables_exec;
+  List<TABLE_LIST> leaf_tables_prep;
+  bool is_prep_leaf_list_saved;
+  uint insert_tables;
+  st_select_lex *merged_into; /* select which this select is merged into */
+                              /* (not 0 only for views/derived tables)   */
+
   const char *type;               /* type of select for EXPLAIN          */
 
   SQL_I_List<ORDER> order_list;   /* ORDER clause */
@@ -866,7 +899,6 @@ public:
   inline bool is_subquery_function() { return master_unit()->item != 0; }
 
   bool mark_as_dependent(THD *thd, st_select_lex *last, Item *dependency);
-  void register_dependency_item(st_select_lex *last, Item **dependency);
 
   bool set_braces(bool value);
   bool inc_in_sum_expr();
@@ -952,6 +984,37 @@ public:
 
   void clear_index_hints(void) { index_hints= NULL; }
   bool is_part_of_union() { return master_unit()->is_union(); }
+  /*
+    Optimize all subqueries that have not been flattened into semi-joins.
+    This functionality is a method of SELECT_LEX instead of JOIN because
+    some SQL statements as DELETE do not have a corresponding JOIN object.
+  */
+  bool optimize_unflattened_subqueries();
+  /* Set the EXPLAIN type for this subquery. */
+  void set_explain_type();
+  bool handle_derived(LEX *lex, uint phases);
+  void append_table_to_list(TABLE_LIST *TABLE_LIST::*link, TABLE_LIST *table);
+  bool get_free_table_map(table_map *map, uint *tablenr);
+  void remove_table_from_list(TABLE_LIST *table);
+  void remap_tables(TABLE_LIST *derived, table_map map,
+                    uint tablenr, st_select_lex *parent_lex);
+  bool merge_subquery(THD *thd, TABLE_LIST *derived, st_select_lex *subq_lex,
+                      uint tablenr, table_map map);
+  inline bool is_mergeable()
+  {
+    /* TRUE only if every one of the conditions below holds for this select */
+    return (!next_select() && !group_list.elements && !having &&
+            !with_sum_func && table_list.elements >= 1 &&
+            !(options & SELECT_DISTINCT) && !select_limit);
+  }
+  void mark_as_belong_to_derived(TABLE_LIST *derived);
+  void increase_derived_records(ha_rows records);
+  void update_used_tables();
+  void mark_const_derived(bool empty);
+
+  bool save_leaf_tables(THD *thd);
+  bool save_prep_leaf_tables(THD *thd);
+
 private:  
   /* current index hint kind. used in filling up index_hints */
   enum index_hint_type current_index_hint_type;
@@ -2205,8 +2268,6 @@ struct LEX: public Query_tables_list
 
   CHARSET_INFO *charset;
   bool text_string_is_7bit;
-  /* store original leaf_tables for INSERT SELECT and PS/SP */
-  TABLE_LIST *leaf_tables_insert;
 
   /** SELECT of CREATE VIEW statement */
   LEX_STRING create_view_select;
@@ -2326,7 +2387,7 @@ struct LEX: public Query_tables_list
     DERIVED_SUBQUERY and DERIVED_VIEW).
   */
   uint8 derived_tables;
-  uint8 create_view_algorithm;
+  uint16 create_view_algorithm;
   uint8 create_view_check;
   uint8 context_analysis_only;
   bool drop_if_exists, drop_temporary, local_file, one_shot_set;
@@ -2335,7 +2396,7 @@ struct LEX: public Query_tables_list
 
   enum enum_yes_no_unknown tx_chain, tx_release;
   bool safe_to_cache_query;
-  bool subqueries, ignore;
+  bool subqueries, ignore, online;
   st_parsing_options parsing_options;
   Alter_info alter_info;
   /*
@@ -2447,9 +2508,15 @@ struct LEX: public Query_tables_list
   {
     return (context_analysis_only &
             (CONTEXT_ANALYSIS_ONLY_PREPARE |
+             CONTEXT_ANALYSIS_ONLY_VCOL_EXPR |
              CONTEXT_ANALYSIS_ONLY_VIEW));
   }
 
+  /* TRUE iff the CONTEXT_ANALYSIS_ONLY_VIEW bit of context_analysis_only */
+  /* is set.                                                              */
+  inline bool is_view_context_analysis()
+  { return (context_analysis_only & CONTEXT_ANALYSIS_ONLY_VIEW) != 0; }
+
   inline void uncacheable(uint8 cause)
   {
     safe_to_cache_query= 0;
@@ -2496,6 +2563,8 @@ struct LEX: public Query_tables_list
     switch (sql_command) {
     case SQLCOM_UPDATE:
     case SQLCOM_UPDATE_MULTI:
+    case SQLCOM_DELETE:
+    case SQLCOM_DELETE_MULTI:
     case SQLCOM_INSERT:
     case SQLCOM_INSERT_SELECT:
     case SQLCOM_REPLACE:
diff --git a/sql/sql_lifo_buffer.h b/sql/sql_lifo_buffer.h
new file mode 100644
index 00000000000..34f9624436d
--- /dev/null
+++ b/sql/sql_lifo_buffer.h
@@ -0,0 +1,342 @@
+/**
+  @defgroup Bi-directional LIFO buffers used by DS-MRR implementation
+  @{
+*/
+
+class Forward_lifo_buffer;
+class Backward_lifo_buffer;
+
+
+/*
+  A base class for in-memory buffer used by DS-MRR implementation. Common
+  properties:
+  - The buffer is last-in-first-out, i.e. elements that are written last are
+    read first.
+  - The buffer contains fixed-size elements. The elements are either atomic
+    byte sequences or pairs of them.
+  - The buffer resides in the memory provided by the user. It is possible to
+     = dynamically (i.e. between write operations) add adjacent memory space to
+       the buffer
+     = dynamically remove unused space from the buffer.
+    The intent of this is to allow to have two buffers on adjacent memory
+    space, one is being read from (and so its space shrinks), while the other 
+    is being written to (and so it needs more and more space).
+
+  There are two concrete classes, Forward_lifo_buffer and Backward_lifo_buffer.
+*/
+
+class Lifo_buffer 
+{
+protected:
+  size_t size1; /**< bytes in the first part of an element (see write()) */
+  size_t size2; /**< bytes in the second part of an element; 0 if unused */
+
+public:
+  /**
+    write() will put into buffer size1 bytes pointed by write_ptr1. If
+    size2!=0, then they will be accompanied by size2 bytes pointed by
+    write_ptr2.
+  */
+  uchar *write_ptr1;
+  uchar *write_ptr2;
+
+  /**
+    read() will do reading by storing pointers to read data into read_ptr1 or
+    into (read_ptr1, read_ptr2), depending on whether the buffer was set to
+    store single objects or pairs.
+  */
+  uchar *read_ptr1;
+  uchar *read_ptr2;
+
+protected:
+  uchar *start; /**< points to start of buffer space */
+  uchar *end;   /**< points to just beyond the end of buffer space */
+public:
+
+  enum enum_direction {
+    BACKWARD=-1, /**< buffer is filled from bigger to smaller memory addresses */
+    FORWARD=1  /**< buffer is filled from smaller to bigger memory addresses */
+  };
+
+  virtual enum_direction type() = 0;
+
+  /* Buffer space control functions */
+
+  /** Use [start_arg, end_arg) as buffer space; the space is TRASH()ed and reset(). */
+  void set_buffer_space(uchar *start_arg, uchar *end_arg) 
+  {
+    start= start_arg;
+    end= end_arg;
+    TRASH(start, end - start);
+    reset();
+  }
+  
+  /** 
+    Set the sizes of the two parts of an element: write() copies len1 bytes
+    from write_ptr1 and, when len2 != 0, len2 bytes from write_ptr2.
+  */
+  void setup_writing(size_t len1, size_t len2)
+  {
+    size1= len1;
+    size2= len2;
+  }
+
+  /** 
+    Only asserts (DBUG_ASSERT) that the sizes the reader expects are the same
+    as those passed to setup_writing(); it does not store anything.
+  */
+  void setup_reading(size_t len1, size_t len2)
+  {
+    DBUG_ASSERT(len1 == size1);
+    DBUG_ASSERT(len2 == size2);
+  }
+  
+  bool can_write() /* TRUE if there is space for one more whole element */
+  {
+    return have_space_for(size1 + size2);
+  }
+  virtual void write() = 0;
+
+  bool is_empty() { return used_size() == 0; }
+  virtual bool read() = 0;
+  
+  void sort(qsort2_cmp cmp_func, void *cmp_func_arg) /* sorts used_area() */
+  {
+    size_t elem_size= size1 + size2; /* NOTE(review): must be non-zero here */
+    size_t n_elements= used_size() / elem_size;
+    my_qsort2(used_area(), n_elements, elem_size, cmp_func, cmp_func_arg);
+  }
+
+  virtual void reset() = 0;
+  virtual uchar *end_of_space() = 0;
+protected:
+  virtual size_t used_size() = 0;
+  
+  /* To be used only by iterator class: */
+  virtual uchar *get_pos()= 0;
+  virtual bool read(uchar **position, uchar **ptr1, uchar **ptr2)= 0;
+  friend class Lifo_buffer_iterator;
+public:
+  virtual bool have_space_for(size_t bytes) = 0;
+
+  virtual void remove_unused_space(uchar **unused_start, uchar **unused_end)=0;
+  virtual uchar *used_area() = 0; 
+  virtual ~Lifo_buffer() {};
+};
+
+
+/**
+  Forward LIFO buffer
+
+  The buffer that is being written to from start to end and read in the
+  reverse.  'pos' points to just beyond the end of used space.
+
+  It is possible to grow/shrink the buffer at the end bound.
+
+     used space      unused space  
+   *==============*-----------------*
+   ^              ^                 ^
+   |              |                 +--- end
+   |              +---- pos              
+   +--- start           
+*/
+
+class Forward_lifo_buffer: public Lifo_buffer
+{
+  uchar *pos;                    /* just beyond the end of the used space */
+public:
+  enum_direction type() { return FORWARD; }
+  size_t used_size()
+  {
+    return (size_t)(pos - start);
+  }
+  void reset()
+  {
+    pos= start;
+  }
+  uchar *end_of_space() { return pos; }
+  bool have_space_for(size_t bytes) /* TRUE if [pos, pos+bytes) fits before end */
+  {
+    return (pos <= end && (size_t)(end - pos) >= bytes);
+  }
+  /* Append one element: size1 bytes, then size2 bytes if size2 != 0. */
+  void write()
+  {
+    write_bytes(write_ptr1, size1);
+    if (size2)
+      write_bytes(write_ptr2, size2);
+  }
+  void write_bytes(const uchar *data, size_t bytes)
+  {
+    DBUG_ASSERT(have_space_for(bytes));
+    memcpy(pos, data, bytes);
+    pos += bytes;
+  }
+  bool have_data(uchar *position, size_t bytes) /* bytes available below position */
+  {
+    return ((position - start) >= (ptrdiff_t)bytes);
+  }
+  uchar *read_bytes(uchar **position, size_t bytes) /* steps *position back */
+  {
+    DBUG_ASSERT(have_data(*position, bytes));
+    *position= (*position) - bytes;
+    return *position;
+  }
+  bool read() { return read(&pos, &read_ptr1, &read_ptr2); }
+  bool read(uchar **position, uchar **ptr1, uchar **ptr2) /* TRUE means EOF */
+  {
+    if (!have_data(*position, size1 + size2))
+      return TRUE;
+    if (size2)                   /* second part was written last: read it first */
+      *ptr2= read_bytes(position, size2);
+    *ptr1= read_bytes(position, size1);
+    return FALSE;
+  }
+  void remove_unused_space(uchar **unused_start, uchar **unused_end)
+  {
+    DBUG_ASSERT(0); /* Don't need this yet */
+  }
+  /**
+    Add more space to the buffer. The caller is responsible that the space
+    being added is adjacent to the end of the buffer.
+
+    @param unused_start Start of space
+    @param unused_end   End of space
+  */
+  void grow(uchar *unused_start, uchar *unused_end)
+  {
+    DBUG_ASSERT(unused_end >= unused_start);
+    DBUG_ASSERT(end == unused_start);
+    TRASH(unused_start, unused_end - unused_start);
+    end= unused_end;
+  }
+  /* Return pointer to start of the memory area that is occupied by the data */
+  uchar *used_area() { return start; }
+  friend class Lifo_buffer_iterator;
+  uchar *get_pos() { return pos; }
+};
+
+
+
+/**
+  Backward LIFO buffer
+
+  The buffer that is being written to from end to start and read in the
+  reverse.  'pos' points to the start of used space.
+
+  It is possible to grow/shrink the buffer at the start.
+
+     unused space      used space  
+   *--------------*=================*
+   ^              ^                 ^
+   |              |                 +--- end
+   |              +---- pos              
+   +--- start           
+*/
+class Backward_lifo_buffer: public Lifo_buffer
+{
+  uchar *pos;                    /* start of the used space; used space ends at end */
+public:
+  enum_direction type() { return BACKWARD; }
+  /* Used space is [pos, end). */
+  size_t used_size()
+  {
+    return (size_t)(end - pos);
+  }
+  void reset()
+  {
+    pos= end;
+  }
+  uchar *end_of_space() { return end; }
+  bool have_space_for(size_t bytes) /* avoids forming a pointer below start */
+  {
+    return (pos >= start && (size_t)(pos - start) >= bytes);
+  }
+  void write() /* second part goes first, so part 1 ends up at the lower address */
+  {
+    if (size2)                   /* same test as read() and the forward buffer */
+      write_bytes(write_ptr2, size2);
+    write_bytes(write_ptr1, size1);
+  }
+  void write_bytes(const uchar *data, size_t bytes)
+  {
+    DBUG_ASSERT(have_space_for(bytes));
+    pos -= bytes;
+    memcpy(pos, data, bytes);
+  }
+  bool read()
+  {
+    return read(&pos, &read_ptr1, &read_ptr2);
+  }
+  bool read(uchar **position, uchar **ptr1, uchar **ptr2) /* TRUE means EOF */
+  {
+    if (!have_data(*position, size1 + size2))
+      return TRUE;
+    *ptr1= read_bytes(position, size1);
+    if (size2)
+      *ptr2= read_bytes(position, size2);
+    return FALSE;
+  }
+  bool have_data(uchar *position, size_t bytes) /* bytes available up to end */
+  {
+    return ((end - position) >= (ptrdiff_t)bytes);
+  }
+  uchar *read_bytes(uchar **position, size_t bytes) /* steps *position forward */
+  {
+    DBUG_ASSERT(have_data(*position, bytes));
+    uchar *ret= *position;
+    *position= *position + bytes;
+    return ret;
+  }
+  /**
+    Stop using/return the unused part of the space
+    @param unused_start  OUT Start of the unused space
+    @param unused_end    OUT End of the unused space
+  */
+  void remove_unused_space(uchar **unused_start, uchar **unused_end)
+  {
+    *unused_start= start;
+    *unused_end= pos;
+    start= pos;
+  }
+  void grow(uchar *unused_start, uchar *unused_end)
+  {
+    DBUG_ASSERT(0); /* Not used for backward buffers */
+  }
+  /* Return pointer to start of the memory area that is occupied by the data */
+  uchar *used_area() { return pos; }
+  friend class Lifo_buffer_iterator;
+  uchar *get_pos() { return pos; }
+};
+
+
+/** Iterator to walk over contents of the buffer without reading from it */
+class Lifo_buffer_iterator
+{
+  uchar *pos;                    /* private read position of this iterator */
+  Lifo_buffer *buf;              /* the buffer being walked */
+  
+public:
+  /* read() stores pointers to the data of the current element here */
+  uchar *read_ptr1;
+  uchar *read_ptr2;
+
+  void init(Lifo_buffer *buf_arg)
+  {
+    pos= buf_arg->get_pos();
+    buf= buf_arg;
+  }
+  /*
+    Fetch the next element using the iterator's own position; the position
+    passed to buf->read() is this->pos, not the one held by the buffer.
+
+    @retval FALSE - ok
+    @retval TRUE  - EOF, reached the end of the buffer
+  */
+  bool read() 
+  {
+    return buf->read(&this->pos, &this->read_ptr1, &this->read_ptr2);
+  }
+};
+
+
diff --git a/sql/sql_list.h b/sql/sql_list.h
index cf19cf82607..46e9923c51a 100644
--- a/sql/sql_list.h
+++ b/sql/sql_list.h
@@ -156,6 +156,7 @@ struct list_node :public Sql_alloc
   }
 };
 
+typedef bool List_eq(void *a, void *b);
 
 extern MYSQL_PLUGIN_IMPORT list_node end_of_list;
 
@@ -239,6 +240,11 @@ public:
   {
     if (!list->is_empty())
     {
+      if (is_empty())
+      {
+        *this= *list;
+        return;
+      }
       *last= list->first;
       last= list->last;
       elements+= list->elements;
@@ -259,11 +265,13 @@ public:
     list_node *node= first;
     list_node *list_first= list->first;
     elements=0;
-    while (node && node != list_first)
+    while (node != &end_of_list && node != list_first)
     {
       prev= &node->next;
       node= node->next;
       elements++;
+      if (node == &end_of_list)
+        return;
     }
     *prev= *last;
     last= prev;
@@ -292,6 +300,16 @@ public:
   inline void **head_ref() { return first != &end_of_list ? &first->info : 0; }
   inline bool is_empty() { return first == &end_of_list ; }
   inline list_node *last_ref() { return &end_of_list; }
+  inline bool add_unique(void *info, List_eq *eq)
+  {
+    /* Returns 1 if eq() matches an existing element, else push_back(info). */
+    list_node *cur= first;
+    while (cur != &end_of_list && !eq(cur->info, info))
+      cur= cur->next;
+    if (cur != &end_of_list)
+      return 1;
+    return push_back(info);
+  }
   friend class base_list_iterator;
   friend class error_list;
   friend class error_list_iterator;
@@ -462,6 +480,8 @@ public:
   inline void concat(List<T> *list) { base_list::concat(list); }
   inline void disjoin(List<T> *list) { base_list::disjoin(list); }
   inline void prepand(List<T> *list) { base_list::prepand(list); }
+  inline bool add_unique(T *a, bool (*eq)(T *a, T *b)) /* typed wrapper */
+  { return base_list::add_unique(a, (List_eq *)eq); } /* NOTE(review): eq is then called through the List_eq (void*, void*) type */
   void delete_elements(void)
   {
     list_node *element,*next;
@@ -514,36 +534,40 @@ public:
 
 
 /*
-  Exchange sort algorithm for List<T>.
+  Bubble sort algorithm for List<T>.
+  This sort function is supposed to be used only for very short list.
+  Currently it is used for the lists of Item_equal objects and
+  for some lists in the table elimination algorithms. In both
+  cases the sorted lists are very short.
 */
+
 template <class T> 
-inline void exchange_sort(List<T> *list_to_sort,
-                          int (*sort_func)(T *a, T *b, void *arg), void *arg)
+inline void bubble_sort(List<T> *list_to_sort,
+                        int (*sort_func)(T *a, T *b, void *arg), void *arg)
 {
   bool swap;
+  T **ref1= 0;
+  T **ref2= 0;
   List_iterator<T> it(*list_to_sort);
   do
   {
+    T **last_ref= ref1;
     T *item1= it++;
-    T **ref1= it.ref();
+    ref1= it.ref();
     T *item2;
 
     swap= FALSE;
-    while ((item2= it++))
+    while ((item2= it++) && (ref2= it.ref()) != last_ref)
     {
-      T **ref2= it.ref();
       if (sort_func(item1, item2, arg) < 0)
       {
-        T *item= *ref1;
-        *ref1= *ref2;
-        *ref2= item;
+        *ref1= item2;
+        *ref2= item1;
         swap= TRUE;
       }
       else
-      {
         item1= item2;
-        ref1= ref2;
-      }
+      ref1= ref2;
     }
     it.rewind();
   } while (swap);
diff --git a/sql/sql_load.cc b/sql/sql_load.cc
index 44c2339f462..5acddc19bed 100644
--- a/sql/sql_load.cc
+++ b/sql/sql_load.cc
@@ -37,6 +37,7 @@
 #include "sql_repl.h"
 #include "sp_head.h"
 #include "sql_trigger.h"
+#include "sql_derived.h"
 
 class XML_TAG {
 public:
@@ -102,6 +103,8 @@ public:
     ::end_io_cache(&cache);
     need_end_io_cache = 0;
   }
+  my_off_t file_length() { return cache.end_of_file; } /* callers in this file skip progress reports when this is ~(my_off_t) 0 */
+  my_off_t position()    { return my_b_tell(&cache); } /* my_b_tell() of cache; callers store it in thd->progress.counter */
 
   /*
     Either this method, or we need to make cache public
@@ -220,12 +223,15 @@ int mysql_load(THD *thd,sql_exchange *ex,TABLE_LIST *table_list,
 
   if (open_and_lock_tables(thd, table_list, TRUE, 0))
     DBUG_RETURN(TRUE);
+  if (mysql_handle_single_derived(thd->lex, table_list, DT_MERGE_FOR_INSERT) ||
+      mysql_handle_single_derived(thd->lex, table_list, DT_PREPARE))
+    DBUG_RETURN(TRUE);
   if (setup_tables_and_check_access(thd, &thd->lex->select_lex.context,
                                     &thd->lex->select_lex.top_join_list,
                                     table_list,
-                                    &thd->lex->select_lex.leaf_tables, FALSE,
+                                    thd->lex->select_lex.leaf_tables, FALSE,
                                     INSERT_ACL | UPDATE_ACL,
-                                    INSERT_ACL | UPDATE_ACL))
+                                    INSERT_ACL | UPDATE_ACL, FALSE))
      DBUG_RETURN(-1);
   if (!table_list->table ||               // do not suport join view
       !table_list->updatable ||           // and derived tables
@@ -462,9 +468,9 @@ int mysql_load(THD *thd,sql_exchange *ex,TABLE_LIST *table_list,
     }
   }
 
+  thd_proc_info(thd, "reading file");
   if (!(error=test(read_info.error)))
   {
-
     table->next_number_field=table->found_next_number_field;
     if (ignore ||
 	handle_duplicates == DUP_REPLACE)
@@ -482,6 +488,7 @@ int mysql_load(THD *thd,sql_exchange *ex,TABLE_LIST *table_list,
                              (MODE_STRICT_TRANS_TABLES |
                               MODE_STRICT_ALL_TABLES)));
 
+    thd_progress_init(thd, 2);
     if (ex->filetype == FILETYPE_XML) /* load xml */
       error= read_xml_field(thd, info, table_list, fields_vars,
                             set_fields, set_values, read_info,
@@ -494,6 +501,9 @@ int mysql_load(THD *thd,sql_exchange *ex,TABLE_LIST *table_list,
       error= read_sep_field(thd, info, table_list, fields_vars,
                             set_fields, set_values, read_info,
 			    *enclosed, skip_lines, ignore);
+
+    thd_proc_info(thd, "End bulk insert");
+    thd_progress_next_stage(thd);
     if (thd->locked_tables_mode <= LTM_LOCK_TABLES &&
         table->file->ha_end_bulk_insert() && !error)
     {
@@ -670,11 +680,15 @@ static bool write_execute_load_query_log_event(THD *thd, sql_exchange* ex,
   size_t               pl= 0;
   List<Item>           fv;
   Item                *item, *val;
-  String               pfield, pfields;
   int                  n;
   const char          *tbl= table_name_arg;
   const char          *tdb= (thd->db != NULL ? thd->db : db_arg);
-  String              string_buf;
+  char 		      name_buffer[SAFE_NAME_LEN*2];
+  char                command_buffer[1024];
+  String              string_buf(name_buffer, sizeof(name_buffer),
+                                 system_charset_info);
+  String              pfields(command_buffer, sizeof(command_buffer),
+                              system_charset_info);
 
   if (!thd->db || strcmp(db_arg, thd->db)) 
   {
@@ -683,7 +697,7 @@ static bool write_execute_load_query_log_event(THD *thd, sql_exchange* ex,
       prefix table name with database name so that it 
       becomes a FQ name.
      */
-    string_buf.set_charset(system_charset_info);
+    string_buf.length(0);
     string_buf.append(db_arg);
     string_buf.append("`");
     string_buf.append(".");
@@ -704,6 +718,7 @@ static bool write_execute_load_query_log_event(THD *thd, sql_exchange* ex,
   /*
     prepare fields-list and SET if needed; print_query won't do that for us.
   */
+  pfields.length(0);
   if (!thd->lex->field_list.is_empty())
   {
     List_iterator<Item>  li(thd->lex->field_list);
@@ -747,8 +762,8 @@ static bool write_execute_load_query_log_event(THD *thd, sql_exchange* ex,
     }
   }
 
-  p= pfields.c_ptr_safe();
-  pl= strlen(p);
+  p=  pfields.c_ptr_safe();
+  pl= pfields.length();
 
   if (!(load_data_query= (char *)thd->alloc(lle.get_query_buffer_length() + 1 + pl)))
     return TRUE;
@@ -785,9 +800,16 @@ read_fixed_length(THD *thd, COPY_INFO &info, TABLE_LIST *table_list,
   List_iterator_fast<Item> it(fields_vars);
   Item_field *sql_field;
   TABLE *table= table_list->table;
-  bool err;
+  bool err, progress_reports;
+  ulonglong counter, time_to_report_progress;
   DBUG_ENTER("read_fixed_length");
 
+  counter= 0;
+  time_to_report_progress= MY_HOW_OFTEN_TO_WRITE/10;
+  progress_reports= 1;
+  if ((thd->progress.max_counter= read_info.file_length()) == ~(my_off_t) 0)
+    progress_reports= 0;
+
   while (!read_info.read_fixed_length())
   {
     if (thd->killed)
@@ -795,6 +817,16 @@ read_fixed_length(THD *thd, COPY_INFO &info, TABLE_LIST *table_list,
       thd->send_kill_message();
       DBUG_RETURN(1);
     }
+    if (progress_reports)
+    {
+      thd->progress.counter= read_info.position();
+      if (++counter >= time_to_report_progress)
+      {
+        time_to_report_progress+= MY_HOW_OFTEN_TO_WRITE/10;
+        thd_progress_report(thd, thd->progress.counter,
+                            thd->progress.max_counter);
+      }
+    }
     if (skip_lines)
     {
       /*
@@ -916,11 +948,18 @@ read_sep_field(THD *thd, COPY_INFO &info, TABLE_LIST *table_list,
   Item *item;
   TABLE *table= table_list->table;
   uint enclosed_length;
-  bool err;
+  bool err, progress_reports;
+  ulonglong counter, time_to_report_progress;
   DBUG_ENTER("read_sep_field");
 
   enclosed_length=enclosed.length();
 
+  counter= 0;
+  time_to_report_progress= MY_HOW_OFTEN_TO_WRITE/10;
+  progress_reports= 1;
+  if ((thd->progress.max_counter= read_info.file_length()) == ~(my_off_t) 0)
+    progress_reports= 0;
+
   for (;;it.rewind())
   {
     if (thd->killed)
@@ -929,6 +968,16 @@ read_sep_field(THD *thd, COPY_INFO &info, TABLE_LIST *table_list,
       DBUG_RETURN(1);
     }
 
+    if (progress_reports)
+    {
+      thd->progress.counter= read_info.position();
+      if (++counter >= time_to_report_progress)
+      {
+        time_to_report_progress+= MY_HOW_OFTEN_TO_WRITE/10;
+        thd_progress_report(thd, thd->progress.counter,
+                            thd->progress.max_counter);
+      }
+    }
     restore_record(table, s->default_values);
 
     while ((item= it++))
@@ -1038,7 +1087,7 @@ read_sep_field(THD *thd, COPY_INFO &info, TABLE_LIST *table_list,
           if (!field->maybe_null() && field->type() == FIELD_TYPE_TIMESTAMP)
               ((Field_timestamp*) field)->set_time();
           /*
-            QQ: We probably should not throw warning for each field.
+            TODO: We probably should not throw warning for each field.
             But how about intention to always have the same number
             of warnings in THD::cuted_fields (and get rid of cuted_fields
             in the end ?)
@@ -1332,7 +1381,6 @@ READ_INFO::READ_INFO(File file_par, uint tot_length, CHARSET_INFO *cs,
   field_term_char= field_term_length ? (uchar) field_term_ptr[0] : INT_MAX;
   line_term_char= line_term_length ? (uchar) line_term_ptr[0] : INT_MAX;
 
-
   /* Set of a stack for unget if long terminators */
   uint length= max(cs->mbmaxlen, max(field_term_length, line_term_length)) + 1;
   set_if_bigger(length,line_start.length());
diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc
index 73f96fc3fc5..a952c596a8b 100644
--- a/sql/sql_parse.cc
+++ b/sql/sql_parse.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2011, Oracle and/or its affiliates.
    Copyright (c) 2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
@@ -264,22 +264,22 @@ void init_update_queries(void)
     the code, in particular in the Query_log_event's constructor.
   */
   sql_command_flags[SQLCOM_CREATE_TABLE]=   CF_CHANGES_DATA | CF_REEXECUTION_FRAGILE |
-                                            CF_AUTO_COMMIT_TRANS |
+                                            CF_AUTO_COMMIT_TRANS | CF_REPORT_PROGRESS |
                                             CF_CAN_GENERATE_ROW_EVENTS;
   sql_command_flags[SQLCOM_CREATE_INDEX]=   CF_CHANGES_DATA | CF_AUTO_COMMIT_TRANS;
   sql_command_flags[SQLCOM_ALTER_TABLE]=    CF_CHANGES_DATA | CF_WRITE_LOGS_COMMAND |
-                                            CF_AUTO_COMMIT_TRANS;
+                                            CF_AUTO_COMMIT_TRANS | CF_REPORT_PROGRESS ;
   sql_command_flags[SQLCOM_TRUNCATE]=       CF_CHANGES_DATA | CF_WRITE_LOGS_COMMAND |
                                             CF_AUTO_COMMIT_TRANS;
   sql_command_flags[SQLCOM_DROP_TABLE]=     CF_CHANGES_DATA | CF_AUTO_COMMIT_TRANS;
   sql_command_flags[SQLCOM_LOAD]=           CF_CHANGES_DATA | CF_REEXECUTION_FRAGILE |
-                                            CF_CAN_GENERATE_ROW_EVENTS;
+                                            CF_CAN_GENERATE_ROW_EVENTS | CF_REPORT_PROGRESS;
   sql_command_flags[SQLCOM_CREATE_DB]=      CF_CHANGES_DATA | CF_AUTO_COMMIT_TRANS;
   sql_command_flags[SQLCOM_DROP_DB]=        CF_CHANGES_DATA | CF_AUTO_COMMIT_TRANS;
   sql_command_flags[SQLCOM_ALTER_DB_UPGRADE]= CF_AUTO_COMMIT_TRANS;
   sql_command_flags[SQLCOM_ALTER_DB]=       CF_CHANGES_DATA | CF_AUTO_COMMIT_TRANS;
   sql_command_flags[SQLCOM_RENAME_TABLE]=   CF_CHANGES_DATA | CF_AUTO_COMMIT_TRANS;
-  sql_command_flags[SQLCOM_DROP_INDEX]=     CF_CHANGES_DATA | CF_AUTO_COMMIT_TRANS;
+  sql_command_flags[SQLCOM_DROP_INDEX]=     CF_CHANGES_DATA | CF_AUTO_COMMIT_TRANS | CF_REPORT_PROGRESS;
   sql_command_flags[SQLCOM_CREATE_VIEW]=    CF_CHANGES_DATA | CF_REEXECUTION_FRAGILE |
                                             CF_AUTO_COMMIT_TRANS;
   sql_command_flags[SQLCOM_DROP_VIEW]=      CF_CHANGES_DATA | CF_AUTO_COMMIT_TRANS;
@@ -389,10 +389,11 @@ void init_update_queries(void)
     The following admin table operations are allowed
     on log tables.
   */
-  sql_command_flags[SQLCOM_REPAIR]=    CF_WRITE_LOGS_COMMAND | CF_AUTO_COMMIT_TRANS;
-  sql_command_flags[SQLCOM_OPTIMIZE]|= CF_WRITE_LOGS_COMMAND | CF_AUTO_COMMIT_TRANS;
-  sql_command_flags[SQLCOM_ANALYZE]=   CF_WRITE_LOGS_COMMAND | CF_AUTO_COMMIT_TRANS;
-  sql_command_flags[SQLCOM_CHECK]=     CF_WRITE_LOGS_COMMAND | CF_AUTO_COMMIT_TRANS;
+  sql_command_flags[SQLCOM_REPAIR]=    CF_WRITE_LOGS_COMMAND | CF_AUTO_COMMIT_TRANS | CF_REPORT_PROGRESS;
+  sql_command_flags[SQLCOM_OPTIMIZE]|= CF_WRITE_LOGS_COMMAND | CF_AUTO_COMMIT_TRANS | CF_REPORT_PROGRESS;
+  sql_command_flags[SQLCOM_ANALYZE]=   CF_WRITE_LOGS_COMMAND | CF_AUTO_COMMIT_TRANS | CF_REPORT_PROGRESS;
+  sql_command_flags[SQLCOM_CHECK]=     CF_WRITE_LOGS_COMMAND | CF_AUTO_COMMIT_TRANS | CF_REPORT_PROGRESS;
+  sql_command_flags[SQLCOM_CHECKSUM]=  CF_REPORT_PROGRESS;
 
   sql_command_flags[SQLCOM_CREATE_USER]|=       CF_AUTO_COMMIT_TRANS;
   sql_command_flags[SQLCOM_DROP_USER]|=         CF_AUTO_COMMIT_TRANS;
@@ -895,6 +896,10 @@ bool dispatch_command(enum enum_server_command command, THD *thd,
                       &thd->security_ctx->priv_user[0],
                       (char *) thd->security_ctx->host_or_ip);
   
+  DBUG_EXECUTE_IF("crash_dispatch_command_before",
+                  { DBUG_PRINT("crash_dispatch_command_before", ("now"));
+                    DBUG_ABORT(); });
+
   thd->command=command;
   /*
     Commands which always take a long time are logged into
@@ -904,18 +909,6 @@ bool dispatch_command(enum enum_server_command command, THD *thd,
   thd->query_plan_flags= QPLAN_INIT;
   thd->lex->sql_command= SQLCOM_END; /* to avoid confusing VIEW detectors */
   thd->set_time();
-  if (!thd->is_valid_time())
-  {
-    /*
-     If the time has got past 2038 we need to shut this server down
-     We do this by making sure every command is a shutdown and we 
-     have enough privileges to shut the server down
-
-     TODO: remove this when we have full 64 bit my_time_t support
-    */
-    thd->security_ctx->master_access|= SHUTDOWN_ACL;
-    command= COM_SHUTDOWN;
-  }
   thd->set_query_id(get_query_id());
   if (!(server_command_flags[command] & CF_SKIP_QUERY_ID))
     next_query_id();
@@ -973,14 +966,19 @@ bool dispatch_command(enum enum_server_command command, THD *thd,
     CHARSET_INFO *save_character_set_results=
       thd->variables.character_set_results;
 
+    /* Ensure we don't free security_ctx->user in case we have to revert */
+    thd->security_ctx->user= 0;
+    thd->user_connect= 0;
+
     rc= acl_authenticate(thd, 0, packet_length);
     MYSQL_AUDIT_NOTIFY_CONNECTION_CHANGE_USER(thd);
     if (rc)
     {
-      /* authentication can fail before or after allocating new username */
-      if (thd->security_ctx->user != save_security_ctx.user)
-        my_free(thd->security_ctx->user);
+      /* Free user if allocated by acl_authenticate */
+      my_free(thd->security_ctx->user);
       *thd->security_ctx= save_security_ctx;
+      if (thd->user_connect)
+	decrease_user_connections(thd->user_connect);
       thd->user_connect= save_user_connect;
       thd->reset_db(save_db, save_db_length);
       thd->variables.character_set_client= save_character_set_client;
@@ -1289,10 +1287,7 @@ bool dispatch_command(enum enum_server_command command, THD *thd,
       packet[0].
     */
     enum mysql_enum_shutdown_level level;
-    if (!thd->is_valid_time())
-      level= SHUTDOWN_DEFAULT;
-    else
-      level= (enum mysql_enum_shutdown_level) (uchar) packet[0];
+    level= (enum mysql_enum_shutdown_level) (uchar) packet[0];
     if (level == SHUTDOWN_DEFAULT)
       level= SHUTDOWN_WAIT_ALL_BUFFERS; // soon default will be configurable
     else if (level != SHUTDOWN_WAIT_ALL_BUFFERS)
@@ -1424,6 +1419,8 @@ bool dispatch_command(enum enum_server_command command, THD *thd,
                       thd->stmt_da->is_error() ? thd->stmt_da->sql_errno() : 0,
                       command_name[command].str);
 
+  thd->update_all_stats();
+
   log_slow_statement(thd);
 
   thd_proc_info(thd, "cleaning up");
@@ -1455,8 +1452,6 @@ void log_slow_statement(THD *thd)
 {
   DBUG_ENTER("log_slow_statement");
 
-  thd->update_all_stats();
-
   /*
     The following should never be true with our current code base,
     but better to keep this here so we don't accidently try to log a
@@ -2059,6 +2054,8 @@ mysql_execute_command(THD *thd)
 #endif
 
   status_var_increment(thd->status_var.com_stat[lex->sql_command]);
+  thd->progress.report_to_client= test(sql_command_flags[lex->sql_command] &
+                                       CF_REPORT_PROGRESS);
 
   DBUG_ASSERT(thd->transaction.stmt.modified_non_trans_table == FALSE);
 
@@ -2199,11 +2196,7 @@ case SQLCOM_PREPARE:
       goto error;
     }
     it= new Item_func_unix_timestamp(it);
-    /*
-      it is OK only emulate fix_fieds, because we need only
-      value of constant
-    */
-    it->quick_fix_field();
+    it->fix_fields(thd, &it);
     res = purge_master_logs_before_date(thd, (ulong)it->val_int());
     break;
   }
@@ -2576,7 +2569,7 @@ end_with_restore_list:
 
     res= mysql_alter_table(thd, first_table->db, first_table->table_name,
                            &create_info, first_table, &alter_info,
-                           0, (ORDER*) 0, 0);
+                           0, (ORDER*) 0, 0, 0);
     break;
   }
 #ifdef HAVE_REPLICATION
@@ -2863,12 +2856,17 @@ end_with_restore_list:
 
     DBUG_EXECUTE_IF("after_mysql_insert",
                     {
-                      const char act[]=
+                      const char act1[]=
                         "now "
                         "wait_for signal.continue";
+                      const char act2[]=
+                        "now "
+                        "signal signal.continued";
                       DBUG_ASSERT(opt_debug_sync_timeout > 0);
-                      DBUG_ASSERT(!debug_sync_set_action(current_thd,
-                                                         STRING_WITH_LEN(act)));
+                      DBUG_ASSERT(!debug_sync_set_action(thd,
+                                                         STRING_WITH_LEN(act1)));
+                      DBUG_ASSERT(!debug_sync_set_action(thd,
+                                                         STRING_WITH_LEN(act2)));
                     };);
     break;
   }
@@ -2892,6 +2890,10 @@ end_with_restore_list:
     if (!(res= open_and_lock_tables(thd, all_tables, TRUE, 0)))
     {
       MYSQL_INSERT_SELECT_START(thd->query());
+      /*
+        Only the INSERT table should be merged. The other tables will be
+        handled by select.
+      */
       /* Skip first table, which is the table we are inserting in */
       TABLE_LIST *second_table= first_table->next_local;
       select_lex->table_list.first= second_table;
@@ -3169,7 +3171,7 @@ end_with_restore_list:
     {
 #ifdef HAVE_QUERY_CACHE
       if (thd->variables.query_cache_wlock_invalidate)
-        query_cache.invalidate_locked_for_write(first_table);
+	query_cache.invalidate_locked_for_write(thd, first_table);
 #endif /*HAVE_QUERY_CACHE*/
       my_ok(thd);
     }
@@ -4495,9 +4497,8 @@ static bool execute_sqlcom_select(THD *thd, TABLE_LIST *all_tables)
           mysqld_show_warnings().
         */
         thd->lex->unit.print(&str, QT_TO_SYSTEM_CHARSET);
-        str.append('\0');
         push_warning(thd, MYSQL_ERROR::WARN_LEVEL_NOTE,
-                     ER_YES, str.ptr());
+                     ER_YES, str.c_ptr_safe());
       }
       if (res)
         result->abort_result_set();
@@ -5326,6 +5327,7 @@ void THD::reset_for_next_command(bool calculate_userstat)
   thd->stmt_depends_on_first_successful_insert_id_in_prev_stmt= 0;
 
   thd->query_start_used= 0;
+  thd->query_start_sec_part_used= 0;
   thd->is_fatal_error= thd->time_zone_used= 0;
   /*
     Clear the status flag that are expected to be cleared at the
@@ -6348,6 +6350,28 @@ push_new_name_resolution_context(THD *thd,
 
 
 /**
+  Normalize a bare field/reference condition: f becomes f <> 0.
+
+  @param cond   The condition to normalize (may be NULL)
+
+  @return cond unchanged, or a new Item_func_ne(cond, 0) wrapping it
+*/
+
+Item *normalize_cond(Item *cond)
+{
+  if (cond)
+  {
+    Item::Type type= cond->type();
+    if (type == Item::FIELD_ITEM || type == Item::REF_ITEM)
+    {
+      cond= new Item_func_ne(cond, new Item_int(0));
+    }
+  }
+  return cond;
+}
+
+
+/**
   Add an ON condition to the second operand of a JOIN ... ON.
 
     Add an ON condition to the right operand of a JOIN ... ON clause.
@@ -6365,6 +6389,7 @@ void add_join_on(TABLE_LIST *b, Item *expr)
 {
   if (expr)
   {
+    expr= normalize_cond(expr);
     if (!b->on_expr)
       b->on_expr= expr;
     else
diff --git a/sql/sql_parse.h b/sql/sql_parse.h
index 650588c5cac..d5f90b03ebf 100644
--- a/sql/sql_parse.h
+++ b/sql/sql_parse.h
@@ -126,6 +126,7 @@ bool push_new_name_resolution_context(THD *thd,
 void store_position_for_column(const char *name);
 void init_update_queries(void);
 bool check_simple_select();
+Item *normalize_cond(Item *cond);
 Item *negate_expression(THD *thd, Item *expr);
 bool check_stack_overrun(THD *thd, long margin, uchar *dummy);
 
diff --git a/sql/sql_partition.cc b/sql/sql_partition.cc
index de4ee14b3c1..c75b20e577e 100644
--- a/sql/sql_partition.cc
+++ b/sql/sql_partition.cc
@@ -1,4 +1,5 @@
-/* Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2005, 2011, Oracle and/or its affiliates.
+   Copyright (c) 2009-2011, Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -2039,6 +2040,9 @@ static int add_partition_options(File fptr, partition_element *p_elem)
   }
   if (p_elem->part_comment)
     err+= add_keyword_string(fptr, "COMMENT", TRUE, p_elem->part_comment);
+  if (p_elem->connect_string.length)
+    err+= add_keyword_string(fptr, "CONNECTION", TRUE,
+                             p_elem->connect_string.str);
   return err + add_engine(fptr,p_elem->engine_type);
 }
 
@@ -6279,7 +6283,7 @@ static int alter_close_tables(ALTER_PARTITION_PARAM_TYPE *lpt, bool close_old)
   DBUG_ENTER("alter_close_tables");
   if (lpt->table->db_stat)
   {
-    lpt->table->file->close();
+    lpt->table->file->ha_close();
     lpt->table->db_stat= 0;                        // Mark file closed
   }
   if (close_old && lpt->old_table)
diff --git a/sql/sql_plugin.cc b/sql/sql_plugin.cc
index 5e46a5ebf82..b725f1c35fc 100644
--- a/sql/sql_plugin.cc
+++ b/sql/sql_plugin.cc
@@ -351,7 +351,7 @@ static const char *item_val_str(struct st_mysql_value *value,
     Lets be nice and create a temporary string since the
     buffer was too small
   */
-  return current_thd->strmake(res->c_ptr_quick(), res->length());
+  return current_thd->strmake(res->ptr(), res->length());
 }
 
 
@@ -1512,15 +1512,23 @@ int plugin_init(int *argc, char **argv, int flags)
       if (register_builtin(plugin, &tmp, &plugin_ptr))
         goto err_unlock;
 
-      /* only initialize MyISAM and CSV at this stage */
-      if (!(is_myisam=
-            !my_strcasecmp(&my_charset_latin1, plugin->name, "MyISAM")) &&
-          my_strcasecmp(&my_charset_latin1, plugin->name, "CSV"))
-        continue;
+      is_myisam= !my_strcasecmp(&my_charset_latin1, plugin->name, "MyISAM");
 
-      if (plugin_ptr->state != PLUGIN_IS_UNINITIALIZED ||
-          plugin_initialize(plugin_ptr))
-        goto err_unlock;
+      /*
+        strictly speaking, we should initialize all plugins,
+        even for mysqld --help, because important subsystems
+        may be disabled otherwise, and the help will be incomplete.
+        For example, if the mysql.plugin table is not MyISAM.
+        But for now it's an unlikely corner case, and to optimize
+        mysqld --help for all other users, we will only initialize
+        MyISAM here.
+      */
+      if (!(flags & PLUGIN_INIT_SKIP_INITIALIZATION) || is_myisam)
+      {
+        if (plugin_ptr->state == PLUGIN_IS_UNINITIALIZED &&
+            plugin_initialize(plugin_ptr))
+          goto err_unlock;
+      }
 
       /*
         initialize the global default storage engine so that it may
@@ -1672,8 +1680,11 @@ static void plugin_load(MEM_ROOT *tmp_root, int *argc, char **argv)
   if (result)
   {
     DBUG_PRINT("error",("Can't open plugin table"));
-    sql_print_error("Can't open the mysql.plugin table. Please "
-                    "run mysql_upgrade to create it.");
+    if (!opt_help)
+      sql_print_error("Can't open the mysql.plugin table. Please "
+                      "run mysql_upgrade to create it.");
+    else
+      sql_print_warning("Could not open mysql.plugin table. Some options may be missing from the help text");
     goto end;
   }
   table= tables.table;
@@ -1684,13 +1695,6 @@ static void plugin_load(MEM_ROOT *tmp_root, int *argc, char **argv)
     goto end;
   }
   table->use_all_columns();
-  /*
-    there're no other threads running yet, so we don't need a mutex.
-    but plugin_add() before is designed to work in multi-threaded
-    environment, and it uses mysql_mutex_assert_owner(), so we lock
-    the mutex here to satisfy the assert
-  */
-  mysql_mutex_lock(&LOCK_plugin);
   while (!(error= read_record_info.read_record(&read_record_info)))
   {
     DBUG_PRINT("info", ("init plugin record"));
@@ -1701,12 +1705,19 @@ static void plugin_load(MEM_ROOT *tmp_root, int *argc, char **argv)
     LEX_STRING name= {(char *)str_name.ptr(), str_name.length()};
     LEX_STRING dl= {(char *)str_dl.ptr(), str_dl.length()};
 
+    /*
+      there're no other threads running yet, so we don't need a mutex.
+      but plugin_add() before is designed to work in multi-threaded
+      environment, and it uses mysql_mutex_assert_owner(), so we lock
+      the mutex here to satisfy the assert
+    */
+    mysql_mutex_lock(&LOCK_plugin);
     if (plugin_add(tmp_root, &name, &dl, argc, argv, REPORT_TO_LOG))
       sql_print_warning("Couldn't load plugin named '%s' with soname '%s'.",
                         str_name.c_ptr(), str_dl.c_ptr());
     free_root(tmp_root, MYF(MY_MARK_BLOCKS_FREE));
+    mysql_mutex_unlock(&LOCK_plugin);
   }
-  mysql_mutex_unlock(&LOCK_plugin);
   if (error > 0)
     sql_print_error(ER(ER_GET_ERRNO), my_errno);
   end_read_record(&read_record_info);
diff --git a/sql/sql_plugin_services.h b/sql/sql_plugin_services.h
index f39e22f1e21..50c579b6c4c 100644
--- a/sql/sql_plugin_services.h
+++ b/sql/sql_plugin_services.h
@@ -46,13 +46,20 @@ static struct my_thread_scheduler_service my_thread_scheduler_handler= {
   my_thread_scheduler_reset,
 };
 
+static struct progress_report_service_st progress_report_handler= {
+  thd_progress_init,
+  thd_progress_report,
+  thd_progress_next_stage,
+  thd_progress_end,
+  set_thd_proc_info
+};
 
 static struct st_service_ref list_of_services[]=
 {
   { "my_snprintf_service", VERSION_my_snprintf, &my_snprintf_handler },
   { "thd_alloc_service",   VERSION_thd_alloc,   &thd_alloc_handler },
   { "thd_wait_service",    VERSION_thd_wait,    &thd_wait_handler },
-  { "my_thread_scheduler_service",
-    VERSION_my_thread_scheduler, &my_thread_scheduler_handler },
+  { "my_thread_scheduler_service", VERSION_my_thread_scheduler, &my_thread_scheduler_handler },
+  { "progress_report_service", VERSION_progress_report, &progress_report_handler }
 };
 
diff --git a/sql/sql_prepare.cc b/sql/sql_prepare.cc
index da1334ed0bf..7973713cb32 100644
--- a/sql/sql_prepare.cc
+++ b/sql/sql_prepare.cc
@@ -1,4 +1,5 @@
-/* Copyright (c) 2002, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2002, 2011, Oracle and/or its affiliates.
+   Copyright (c) 2009-2011, Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -113,6 +114,7 @@ When one supplies long data for a placeholder:
 #include <mysql_com.h>
 #endif
 #include "lock.h"                               // MYSQL_OPEN_FORCE_SHARED_MDL
+#include "sql_handler.h"
 
 /**
   A result class used to send cursor rows using the binary protocol.
@@ -124,7 +126,7 @@ class Select_fetch_protocol_binary: public select_send
 public:
   Select_fetch_protocol_binary(THD *thd);
   virtual bool send_result_set_metadata(List<Item> &list, uint flags);
-  virtual bool send_data(List<Item> &items);
+  virtual int send_data(List<Item> &items);
   virtual bool send_eof();
 #ifdef EMBEDDED_LIBRARY
   void begin_dataset()
@@ -238,9 +240,9 @@ protected:
   virtual bool store(const char *from, size_t length, CHARSET_INFO *cs);
   virtual bool store(const char *from, size_t length,
                      CHARSET_INFO *fromcs, CHARSET_INFO *tocs);
-  virtual bool store(MYSQL_TIME *time);
+  virtual bool store(MYSQL_TIME *time, int decimals);
   virtual bool store_date(MYSQL_TIME *time);
-  virtual bool store_time(MYSQL_TIME *time);
+  virtual bool store_time(MYSQL_TIME *time, int decimals);
   virtual bool store(float value, uint32 decimals, String *buffer);
   virtual bool store(double value, uint32 decimals, String *buffer);
   virtual bool store(Field *field);
@@ -333,6 +335,8 @@ static bool send_prep_stmt(Prepared_statement *stmt, uint columns)
   int error;
   THD *thd= stmt->thd;
   DBUG_ENTER("send_prep_stmt");
+  DBUG_PRINT("enter",("stmt->id: %lu  columns: %d  param_count: %d",
+                      stmt->id, columns, stmt->param_count));
 
   buff[0]= 0;                                   /* OK packet indicator */
   int4store(buff+1, stmt->id);
@@ -577,8 +581,7 @@ static void set_param_time(Item_param *param, uchar **pos, ulong len)
   }
   else
     set_zero_time(&tm, MYSQL_TIMESTAMP_TIME);
-  param->set_time(&tm, MYSQL_TIMESTAMP_TIME,
-                  MAX_TIME_WIDTH * MY_CHARSET_BIN_MB_MAXLEN);
+  param->set_time(&tm, MYSQL_TIMESTAMP_TIME, MAX_TIME_FULL_WIDTH);
   *pos+= length;
 }
 
@@ -1256,7 +1259,7 @@ static bool mysql_test_insert(Prepared_statement *stmt,
     TL_WRITE_DELAYED as having two such locks can cause table corruption.
   */
   if (open_normal_and_derived_tables(thd, table_list,
-                                     MYSQL_OPEN_FORCE_SHARED_MDL))
+                                     MYSQL_OPEN_FORCE_SHARED_MDL, DT_INIT))
     goto error;
 
   if ((values= its++))
@@ -1340,7 +1343,10 @@ static int mysql_test_update(Prepared_statement *stmt,
       open_tables(thd, &table_list, &table_count, MYSQL_OPEN_FORCE_SHARED_MDL))
     goto error;
 
-  if (table_list->multitable_view)
+  if (mysql_handle_derived(thd->lex, DT_INIT))
+    goto error;
+
+  if (table_list->is_multitable())
   {
     DBUG_ASSERT(table_list->view != 0);
     DBUG_PRINT("info", ("Switch to multi-update"));
@@ -1354,8 +1360,16 @@ static int mysql_test_update(Prepared_statement *stmt,
     thd->fill_derived_tables() is false here for sure (because it is
     preparation of PS, so we even do not check it).
   */
-  if (mysql_handle_derived(thd->lex, &mysql_derived_prepare))
+  if (table_list->handle_derived(thd->lex, DT_MERGE_FOR_INSERT))
+    goto error;
+  if (table_list->handle_derived(thd->lex, DT_PREPARE))
+    goto error;
+
+  if (!table_list->updatable)
+  {
+    my_error(ER_NON_UPDATABLE_TABLE, MYF(0), table_list->alias, "UPDATE");
     goto error;
+  }
 
 #ifndef NO_EMBEDDED_ACCESS_CHECKS
   /* Force privilege re-checking for views after they have been opened. */
@@ -1409,16 +1423,28 @@ error:
 static bool mysql_test_delete(Prepared_statement *stmt,
                               TABLE_LIST *table_list)
 {
+  uint table_count= 0;
   THD *thd= stmt->thd;
   LEX *lex= stmt->lex;
   DBUG_ENTER("mysql_test_delete");
 
   if (delete_precheck(thd, table_list) ||
-      open_normal_and_derived_tables(thd, table_list,
-                                     MYSQL_OPEN_FORCE_SHARED_MDL))
+      open_tables(thd, &table_list, &table_count, MYSQL_OPEN_FORCE_SHARED_MDL))
+    goto error;
+
+  if (mysql_handle_derived(thd->lex, DT_INIT))
+    goto error;
+  if (mysql_handle_derived(thd->lex, DT_MERGE_FOR_INSERT))
+    goto error;
+  if (mysql_handle_derived(thd->lex, DT_PREPARE))
     goto error;
 
-  if (!table_list->table)
+  if (!table_list->updatable)
+  {
+    my_error(ER_NON_UPDATABLE_TABLE, MYF(0), table_list->alias, "DELETE");
+    goto error;
+  }
+  if (!table_list->table || !table_list->table->created)
   {
     my_error(ER_VIEW_DELETE_MERGE_VIEW, MYF(0),
              table_list->view_db.str, table_list->view_name.str);
@@ -1473,7 +1499,8 @@ static int mysql_test_select(Prepared_statement *stmt,
     goto error;
   }
 
-  if (open_normal_and_derived_tables(thd, tables, MYSQL_OPEN_FORCE_SHARED_MDL))
+  if (open_normal_and_derived_tables(thd, tables,  MYSQL_OPEN_FORCE_SHARED_MDL,
+                                     DT_PREPARE | DT_CREATE))
     goto error;
 
   thd->used_tables= 0;                        // Updated by setup_fields
@@ -1535,7 +1562,8 @@ static bool mysql_test_do_fields(Prepared_statement *stmt,
                                    UINT_MAX, FALSE))
     DBUG_RETURN(TRUE);
 
-  if (open_normal_and_derived_tables(thd, tables, MYSQL_OPEN_FORCE_SHARED_MDL))
+  if (open_normal_and_derived_tables(thd, tables, MYSQL_OPEN_FORCE_SHARED_MDL,
+                                     DT_PREPARE | DT_CREATE))
     DBUG_RETURN(TRUE);
   DBUG_RETURN(setup_fields(thd, 0, *values, MARK_COLUMNS_NONE, 0, 0));
 }
@@ -1563,9 +1591,10 @@ static bool mysql_test_set_fields(Prepared_statement *stmt,
   THD *thd= stmt->thd;
   set_var_base *var;
 
-  if ((tables && check_table_access(thd, SELECT_ACL, tables, FALSE,
-                                    UINT_MAX, FALSE)) ||
-      open_normal_and_derived_tables(thd, tables, MYSQL_OPEN_FORCE_SHARED_MDL))
+  if ((tables &&
+       check_table_access(thd, SELECT_ACL, tables, FALSE, UINT_MAX, FALSE)) ||
+      open_normal_and_derived_tables(thd, tables, MYSQL_OPEN_FORCE_SHARED_MDL,
+                                     DT_PREPARE | DT_CREATE))
     goto error;
 
   while ((var= it++))
@@ -1600,9 +1629,9 @@ static bool mysql_test_call_fields(Prepared_statement *stmt,
   THD *thd= stmt->thd;
   Item *item;
 
-  if ((tables && check_table_access(thd, SELECT_ACL, tables, FALSE,
-                                    UINT_MAX, FALSE)) ||
-      open_normal_and_derived_tables(thd, tables, MYSQL_OPEN_FORCE_SHARED_MDL))
+  if ((tables &&
+       check_table_access(thd, SELECT_ACL, tables, FALSE, UINT_MAX, FALSE)) ||
+      open_normal_and_derived_tables(thd, tables, MYSQL_OPEN_FORCE_SHARED_MDL, DT_PREPARE))
     goto err;
 
   while ((item= it++))
@@ -1677,6 +1706,7 @@ select_like_stmt_test_with_open(Prepared_statement *stmt,
                                 int (*specific_prepare)(THD *thd),
                                 ulong setup_tables_done_option)
 {
+  uint table_count= 0;
   DBUG_ENTER("select_like_stmt_test_with_open");
 
   /*
@@ -1685,8 +1715,8 @@ select_like_stmt_test_with_open(Prepared_statement *stmt,
     prepared EXPLAIN yet so derived tables will clean up after
     themself.
   */
-  if (open_normal_and_derived_tables(stmt->thd, tables,
-                                     MYSQL_OPEN_FORCE_SHARED_MDL))
+  THD *thd= stmt->thd;
+  if (open_tables(thd, &tables, &table_count, MYSQL_OPEN_FORCE_SHARED_MDL))
     DBUG_RETURN(TRUE);
 
   DBUG_RETURN(select_like_stmt_test(stmt, specific_prepare,
@@ -1727,7 +1757,8 @@ static bool mysql_test_create_table(Prepared_statement *stmt)
       create_table->open_type= OT_BASE_ONLY;
 
     if (open_normal_and_derived_tables(stmt->thd, lex->query_tables,
-                                       MYSQL_OPEN_FORCE_SHARED_MDL))
+                                       MYSQL_OPEN_FORCE_SHARED_MDL,
+                                       DT_PREPARE | DT_CREATE))
       DBUG_RETURN(TRUE);
 
     select_lex->context.resolve_in_select_list= TRUE;
@@ -1747,7 +1778,8 @@ static bool mysql_test_create_table(Prepared_statement *stmt)
       which keeps metadata validation code simple.
     */
     if (open_normal_and_derived_tables(stmt->thd, lex->query_tables,
-                                       MYSQL_OPEN_FORCE_SHARED_MDL))
+                                       MYSQL_OPEN_FORCE_SHARED_MDL,
+                                       DT_PREPARE))
       DBUG_RETURN(TRUE);
   }
 
@@ -1780,7 +1812,8 @@ static bool mysql_test_create_view(Prepared_statement *stmt)
   if (create_view_precheck(thd, tables, view, lex->create_view_mode))
     goto err;
 
-  if (open_normal_and_derived_tables(thd, tables, MYSQL_OPEN_FORCE_SHARED_MDL))
+  if (open_normal_and_derived_tables(thd, tables, MYSQL_OPEN_FORCE_SHARED_MDL,
+                                     DT_PREPARE))
     goto err;
 
   lex->context_analysis_only|=  CONTEXT_ANALYSIS_ONLY_VIEW;
@@ -1925,6 +1958,56 @@ static bool mysql_test_insert_select(Prepared_statement *stmt,
   return res;
 }
 
+/**
+  Validate a HANDLER ... READ statement that is being prepared.
+
+    In case of success, if this is not an SQL PREPARE, send the statement
+    header and the column list info of the handler back to the client.
+
+  @param stmt               prepared statement
+  @param tables             table list of the HANDLER READ statement
+
+  @retval 0 success
+  @retval 1 error, error message is set in THD
+  @retval 2 success, and statement metadata has been sent
+*/
+
+static int mysql_test_handler_read(Prepared_statement *stmt,
+                                   TABLE_LIST *tables)
+{
+  THD *thd= stmt->thd;
+  LEX *lex= stmt->lex;
+  SQL_HANDLER *ha_table;
+  DBUG_ENTER("mysql_test_handler_read");
+
+  lex->select_lex.context.resolve_in_select_list= TRUE;
+
+  /*
+    We don't have to test for permissions as this is already done during
+    HANDLER OPEN
+  */
+  if (!(ha_table= mysql_ha_read_prepare(thd, tables, lex->ha_read_mode,
+                                        lex->ident.str,
+                                        lex->insert_list,
+                                        lex->select_lex.where)))
+    DBUG_RETURN(1);
+
+  if (!stmt->is_sql_prepare())
+  {
+    if (!lex->result && !(lex->result= new (stmt->mem_root) select_send))
+    {
+      my_error(ER_OUTOFMEMORY, MYF(0), (int) sizeof(select_send));
+      DBUG_RETURN(1);
+    }
+    if (send_prep_stmt(stmt, ha_table->fields.elements) ||
+        lex->result->send_result_set_metadata(ha_table->fields, Protocol::SEND_EOF) ||
+        thd->protocol->flush())
+      DBUG_RETURN(1);
+    DBUG_RETURN(2);
+  }
+  DBUG_RETURN(0);
+}
+
 
 /**
   Perform semantic analysis of the parsed tree and send a response packet
@@ -2043,6 +2126,11 @@ static bool check_prepared_statement(Prepared_statement *stmt)
     res= mysql_test_insert_select(stmt, tables);
     break;
 
+  case SQLCOM_HA_READ:
+    res= mysql_test_handler_read(stmt, tables);
+    /* Statement and field info has already been sent */
+    DBUG_RETURN(res == 1 ? TRUE : FALSE);
+
     /*
       Note that we don't need to have cases in this list if they are
       marked with CF_STATUS_COMMAND in sql_command_flags
@@ -2427,6 +2515,7 @@ void reinit_stmt_before_use(THD *thd, LEX *lex)
       /* Fix ORDER list */
       for (order= sl->order_list.first; order; order= order->next)
         order->item= &order->item_ptr;
+      sl->handle_derived(lex, DT_REINIT);
 
       /* clear the no_error flag for INSERT/UPDATE IGNORE */
       sl->no_error= FALSE;
@@ -2477,9 +2566,6 @@ void reinit_stmt_before_use(THD *thd, LEX *lex)
   }
   lex->current_select= &lex->select_lex;
 
-  /* restore original list used in INSERT ... SELECT */
-  if (lex->leaf_tables_insert)
-    lex->select_lex.leaf_tables= lex->leaf_tables_insert;
 
   if (lex->result)
   {
@@ -2912,11 +2998,11 @@ bool Select_fetch_protocol_binary::send_eof()
 }
 
 
-bool
+int
 Select_fetch_protocol_binary::send_data(List<Item> &fields)
 {
   Protocol *save_protocol= thd->protocol;
-  bool rc;
+  int rc;
 
   thd->protocol= &protocol;
   rc= select_send::send_data(fields);
@@ -3403,6 +3489,7 @@ Prepared_statement::execute_loop(String *expanded_query,
   Reprepare_observer reprepare_observer;
   bool error;
   int reprepare_attempt= 0;
+  bool need_set_parameters= true;
 
   /* Check if we got an error when sending long data */
   if (state == Query_arena::STMT_ERROR)
@@ -3411,10 +3498,18 @@ Prepared_statement::execute_loop(String *expanded_query,
     return TRUE;
   }
 
-  if (set_parameters(expanded_query, packet, packet_end))
+reexecute:
+  if (need_set_parameters &&
+      set_parameters(expanded_query, packet, packet_end))
     return TRUE;
 
-reexecute:
+  /*
+    if set_parameters() has generated warnings,
+    we need to repeat it when reexecuting, to recreate these
+    warnings.
+  */
+  need_set_parameters= thd->warning_info->statement_warn_count();
+
   reprepare_observer.reset_reprepare_observer();
 
   /*
@@ -4271,8 +4366,10 @@ bool Protocol_local::store(const char *str, size_t length,
 
 /* Store MYSQL_TIME (in binary format) */
 
-bool Protocol_local::store(MYSQL_TIME *time)
+bool Protocol_local::store(MYSQL_TIME *time, int decimals)
 {
+  if (decimals != AUTO_SEC_PART_DIGITS)
+    time->second_part= sec_part_truncate(time->second_part, decimals);
   return store_column(time, sizeof(MYSQL_TIME));
 }
 
@@ -4287,8 +4384,10 @@ bool Protocol_local::store_date(MYSQL_TIME *time)
 
 /** Store MYSQL_TIME (in binary format) */
 
-bool Protocol_local::store_time(MYSQL_TIME *time)
+bool Protocol_local::store_time(MYSQL_TIME *time, int decimals)
 {
+  if (decimals != AUTO_SEC_PART_DIGITS)
+    time->second_part= sec_part_truncate(time->second_part, decimals);
   return store_column(time, sizeof(MYSQL_TIME));
 }
 
diff --git a/sql/sql_priv.h b/sql/sql_priv.h
index a85dea8b273..674366be1b7 100644
--- a/sql/sql_priv.h
+++ b/sql/sql_priv.h
@@ -1,4 +1,5 @@
 /* Copyright 2000-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
+   Copyright (c) 2010-2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -159,39 +160,54 @@
 #define OPTIMIZER_SWITCH_INDEX_MERGE_UNION         (1ULL << 1)
 #define OPTIMIZER_SWITCH_INDEX_MERGE_SORT_UNION    (1ULL << 2)
 #define OPTIMIZER_SWITCH_INDEX_MERGE_INTERSECT     (1ULL << 3)
-#define OPTIMIZER_SWITCH_ENGINE_CONDITION_PUSHDOWN (1ULL << 4)
-#define OPTIMIZER_SWITCH_INDEX_COND_PUSHDOWN       (1ULL << 5)
-#define OPTIMIZER_SWITCH_FIRSTMATCH                (1ULL << 6)
-#define OPTIMIZER_SWITCH_LOOSE_SCAN                (1ULL << 7)
-#define OPTIMIZER_SWITCH_MATERIALIZATION           (1ULL << 8)
-#define OPTIMIZER_SWITCH_SEMIJOIN                  (1ULL << 9)
-#define OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE (1ULL <<10)
-#define OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN  (1ULL <<11)
-#define OPTIMIZER_SWITCH_SUBQUERY_CACHE            (1ULL <<12)
-#define OPTIMIZER_SWITCH_TABLE_ELIMINATION         (1ULL <<13)
-#define OPTIMIZER_SWITCH_LAST                      (1ULL <<14)
-
-#ifdef DBUG_OFF
-#define DBUG_ONLY_TABLE_ELIMINATION 0
-#else
-#define DBUG_ONLY_TABLE_ELIMINATION OPTIMIZER_SWITCH_TABLE_ELIMINATION
-#endif
+#define OPTIMIZER_SWITCH_INDEX_MERGE_SORT_INTERSECT (1ULL << 4)
+#define OPTIMIZER_SWITCH_ENGINE_CONDITION_PUSHDOWN (1ULL << 5)
+#define OPTIMIZER_SWITCH_INDEX_COND_PUSHDOWN       (1ULL << 6)
+#define OPTIMIZER_SWITCH_DERIVED_MERGE             (1ULL << 7)
+#define OPTIMIZER_SWITCH_DERIVED_WITH_KEYS         (1ULL << 8)
+#define OPTIMIZER_SWITCH_FIRSTMATCH                (1ULL << 9)
+#define OPTIMIZER_SWITCH_LOOSE_SCAN                (1ULL << 10)
+#define OPTIMIZER_SWITCH_MATERIALIZATION           (1ULL << 11)
+#define OPTIMIZER_SWITCH_IN_TO_EXISTS              (1ULL << 12)
+#define OPTIMIZER_SWITCH_SEMIJOIN                  (1ULL << 13)
+#define OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE (1ULL << 14)
+#define OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN  (1ULL << 15)
+#define OPTIMIZER_SWITCH_SUBQUERY_CACHE            (1ULL << 16)
+/** If this is off, MRR is never used. */
+#define OPTIMIZER_SWITCH_MRR                       (1ULL << 17)
+/**
+   If OPTIMIZER_SWITCH_MRR is on and this is on, MRR is used depending on a
+   cost-based choice ("automatic"). If OPTIMIZER_SWITCH_MRR is on and this is
+   off, MRR is "forced" (i.e. used as long as the storage engine is capable of
+   doing it).
+*/
+#define OPTIMIZER_SWITCH_MRR_COST_BASED            (1ULL << 18)
+#define OPTIMIZER_SWITCH_MRR_SORT_KEYS             (1ULL << 19)
+#define OPTIMIZER_SWITCH_OUTER_JOIN_WITH_CACHE     (1ULL << 20)
+#define OPTIMIZER_SWITCH_SEMIJOIN_WITH_CACHE       (1ULL << 21)
+#define OPTIMIZER_SWITCH_JOIN_CACHE_INCREMENTAL    (1ULL << 22)
+#define OPTIMIZER_SWITCH_JOIN_CACHE_HASHED         (1ULL << 23)
+#define OPTIMIZER_SWITCH_JOIN_CACHE_BKA            (1ULL << 24)
+#define OPTIMIZER_SWITCH_OPTIMIZE_JOIN_BUFFER_SIZE (1ULL << 25)
+#define OPTIMIZER_SWITCH_TABLE_ELIMINATION         (1ULL << 26)
+#define OPTIMIZER_SWITCH_LAST                      (1ULL << 26)
 
-#  define OPTIMIZER_SWITCH_DEFAULT (OPTIMIZER_SWITCH_INDEX_MERGE | \
+/*
+TODO: Materialization is off by default to mimic 5.1/5.2 behavior.
+Once the cost-based choice between materialization and in-to-exists is to be
+enabled by default, add OPTIMIZER_SWITCH_MATERIALIZATION here.
+*/
+#define OPTIMIZER_SWITCH_DEFAULT   (OPTIMIZER_SWITCH_INDEX_MERGE | \
                                     OPTIMIZER_SWITCH_INDEX_MERGE_UNION | \
                                     OPTIMIZER_SWITCH_INDEX_MERGE_SORT_UNION | \
                                     OPTIMIZER_SWITCH_INDEX_MERGE_INTERSECT | \
-                                    OPTIMIZER_SWITCH_ENGINE_CONDITION_PUSHDOWN |\
-                                    OPTIMIZER_SWITCH_INDEX_COND_PUSHDOWN | \
-                                    OPTIMIZER_SWITCH_FIRSTMATCH | \
-                                    OPTIMIZER_SWITCH_LOOSE_SCAN | \
-                                    OPTIMIZER_SWITCH_MATERIALIZATION | \
-                                    OPTIMIZER_SWITCH_SEMIJOIN | \
+                                    OPTIMIZER_SWITCH_TABLE_ELIMINATION | \
+                                    OPTIMIZER_SWITCH_IN_TO_EXISTS | \
                                     OPTIMIZER_SWITCH_PARTIAL_MATCH_ROWID_MERGE|\
                                     OPTIMIZER_SWITCH_PARTIAL_MATCH_TABLE_SCAN|\
-                                    OPTIMIZER_SWITCH_SUBQUERY_CACHE |\
-                                    DBUG_ONLY_TABLE_ELIMINATION)
-
+                                    OPTIMIZER_SWITCH_JOIN_CACHE_INCREMENTAL | \
+                                    OPTIMIZER_SWITCH_JOIN_CACHE_HASHED | \
+                                    OPTIMIZER_SWITCH_JOIN_CACHE_BKA)
 /*
   Replication uses 8 bytes to store SQL_MODE in the binary log. The day you
   use strictly more than 64 bits by adding one more define above, you should
@@ -230,16 +246,35 @@
   it's a constant one.
 */
 #define CONTEXT_ANALYSIS_ONLY_DERIVED 4
+/*
+  Don't evaluate constant sub-expressions of virtual column
+  expressions when opening tables
+*/ 
+#define CONTEXT_ANALYSIS_ONLY_VCOL_EXPR 8
 
-// uncachable cause
-#define UNCACHEABLE_DEPENDENT   1
+
+/*
+  Uncacheable causes:
+*/
+/* This subquery has fields from outer query (put by user) */
+#define UNCACHEABLE_DEPENDENT_GENERATED   1
+/* This subquery contains functions with random result */
 #define UNCACHEABLE_RAND        2
+/* This subquery contains functions with side effect */
 #define UNCACHEABLE_SIDEEFFECT	4
-/// forcing to save JOIN for explain
+/* Forcing to save JOIN tables for explain */
 #define UNCACHEABLE_EXPLAIN     8
 /* For uncorrelated SELECT in an UNION with some correlated SELECTs */
 #define UNCACHEABLE_UNITED     16
 #define UNCACHEABLE_CHECKOPTION 32
+/*
+  This subquery has fields from outer query injected during
+  transformation process
+*/
+#define UNCACHEABLE_DEPENDENT_INJECTED  64
+/* This subquery has fields from outer query (any nature) */
+#define UNCACHEABLE_DEPENDENT (UNCACHEABLE_DEPENDENT_GENERATED | \
+                               UNCACHEABLE_DEPENDENT_INJECTED)
 
 /* Used to check GROUP BY list in the MODE_ONLY_FULL_GROUP_BY mode */
 #define UNDEF_POS (-1)
@@ -247,6 +282,11 @@
 /* BINLOG_DUMP options */
 
 #define BINLOG_DUMP_NON_BLOCK   1
+#endif /* !MYSQL_CLIENT */
+
+#define BINLOG_SEND_ANNOTATE_ROWS_EVENT   2
+
+#ifndef MYSQL_CLIENT
 
 /*
   Some defines for exit codes for ::is_equal class functions.
diff --git a/sql/sql_profile.cc b/sql/sql_profile.cc
index dad290bdd3a..dffc03fbc55 100644
--- a/sql/sql_profile.cc
+++ b/sql/sql_profile.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2007, 2010 Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2007, 2010 Oracle and/or its affiliates.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -241,7 +241,7 @@ void PROF_MEASUREMENT::set_label(const char *status_arg,
 */
 void PROF_MEASUREMENT::collect()
 {
-  time_usecs= (double) my_getsystime() / 10.0;  /* 1 sec was 1e7, now is 1e6 */
+  time_usecs= my_interval_timer() / 1e3;  /* ns to us */
 #ifdef HAVE_GETRUSAGE
   getrusage(RUSAGE_SELF, &rusage);
 #elif defined(_WIN32)
diff --git a/sql/sql_reload.cc b/sql/sql_reload.cc
index 64d484c0390..7a30973699b 100644
--- a/sql/sql_reload.cc
+++ b/sql/sql_reload.cc
@@ -26,6 +26,7 @@
 #include "sql_repl.h"    // reset_master, reset_slave
 #include "debug_sync.h"
 
+static void disable_checkpoints(THD *thd);
 
 /**
   Reload/resets privileges and the different caches.
@@ -157,7 +158,7 @@ bool reload_acl_and_cache(THD *thd, unsigned long options,
 #ifdef HAVE_QUERY_CACHE
   if (options & REFRESH_QUERY_CACHE_FREE)
   {
-    query_cache.pack();				// FLUSH QUERY CACHE
+    query_cache.pack(thd);              // FLUSH QUERY CACHE
     options &= ~REFRESH_QUERY_CACHE;    // Don't flush cache, just free memory
   }
   if (options & (REFRESH_TABLES | REFRESH_QUERY_CACHE))
@@ -208,6 +209,8 @@ bool reload_acl_and_cache(THD *thd, unsigned long options,
         thd->global_read_lock.unlock_global_read_lock(thd);
         return 1;
       }
+      if (options & REFRESH_CHECKPOINT)
+        disable_checkpoints(thd);
     }
     else
     {
@@ -480,4 +483,18 @@ error:
 }
 
 
+/**
+   Disable checkpoints for all handlers, at most once per THD.
+   This is released in unlock_global_read_lock()
+*/
+
+static void disable_checkpoints(THD *thd)
+{
+  if (!thd->global_disable_checkpoint)          // not yet requested by this THD
+  {
+    thd->global_disable_checkpoint= 1;
+    if (!global_disable_checkpoint++)           // counter was 0 before this call
+      ha_checkpoint_state(1);                   // Disable checkpoints
+  }
+}
 
diff --git a/sql/sql_rename.cc b/sql/sql_rename.cc
index 6a7b0b0b3ad..787912cdc4a 100644
--- a/sql/sql_rename.cc
+++ b/sql/sql_rename.cc
@@ -239,7 +239,7 @@ do_rename(THD *thd, TABLE_LIST *ren_table, char *new_db, char *new_table_name,
           char *new_table_alias, bool skip_error)
 {
   int rc= 1;
-  char name[FN_REFLEN + 1];
+  char new_name[FN_REFLEN + 1], old_name[FN_REFLEN + 1];
   const char *new_alias, *old_alias;
   frm_type_enum frm_type;
   enum legacy_db_type table_type;
@@ -258,17 +258,17 @@ do_rename(THD *thd, TABLE_LIST *ren_table, char *new_db, char *new_table_name,
   }
   DBUG_ASSERT(new_alias);
 
-  build_table_filename(name, sizeof(name) - 1,
+  build_table_filename(new_name, sizeof(new_name) - 1,
                        new_db, new_alias, reg_ext, 0);
-  if (!access(name,F_OK))
+  build_table_filename(old_name, sizeof(old_name) - 1,
+                       ren_table->db, old_alias, reg_ext, 0);
+  if (check_table_file_presence(old_name,
+                                new_name, new_db, new_alias, new_alias, TRUE))
   {
-    my_error(ER_TABLE_EXISTS_ERROR, MYF(0), new_alias);
     DBUG_RETURN(1);			// This can't be skipped
   }
-  build_table_filename(name, sizeof(name) - 1,
-                       ren_table->db, old_alias, reg_ext, 0);
 
-  frm_type= dd_frm_type(thd, name, &table_type);
+  frm_type= dd_frm_type(thd, old_name, &table_type);
   switch (frm_type)
   {
     case FRMTYPE_TABLE:
@@ -314,7 +314,7 @@ do_rename(THD *thd, TABLE_LIST *ren_table, char *new_db, char *new_table_name,
     default:
       DBUG_ASSERT(0); // should never happen
     case FRMTYPE_ERROR:
-      my_error(ER_FILE_NOT_FOUND, MYF(0), name, my_errno);
+      my_error(ER_FILE_NOT_FOUND, MYF(0), old_name, my_errno);
       break;
   }
   if (rc && !skip_error)
diff --git a/sql/sql_repl.cc b/sql/sql_repl.cc
index 65f1d9af3cd..400ca28a277 100644
--- a/sql/sql_repl.cc
+++ b/sql/sql_repl.cc
@@ -1,4 +1,5 @@
-/* Copyright (C) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (C) 2000, 2011, Oracle and/or its affiliates.
+   Copyright (c) 2009-2011, Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -41,6 +42,8 @@ static int binlog_dump_count = 0;
 */
 uint sql_slave_skip_counter;
 
+extern TYPELIB binlog_checksum_typelib;
+
 /*
     fake_rotate_event() builds a fake (=which does not exist physically in any
     binlog) Rotate event, which contains the name of the binlog we are going to
@@ -60,10 +63,21 @@ uint sql_slave_skip_counter;
 */
 
 static int fake_rotate_event(NET* net, String* packet, char* log_file_name,
-                             ulonglong position, const char** errmsg)
+                             ulonglong position, const char** errmsg,
+                             uint8 checksum_alg_arg)
 {
   DBUG_ENTER("fake_rotate_event");
   char header[LOG_EVENT_HEADER_LEN], buf[ROTATE_HEADER_LEN+100];
+
+  /*
+    this Rotate is to be sent with checksum if and only if
+    slave's get_master_version_and_clock time handshake value 
+    of master's @@global.binlog_checksum was TRUE
+  */
+
+  my_bool do_checksum= checksum_alg_arg != BINLOG_CHECKSUM_ALG_OFF &&
+    checksum_alg_arg != BINLOG_CHECKSUM_ALG_UNDEF;
+
   /*
     'when' (the timestamp) is set to 0 so that slave could distinguish between
     real and fake Rotate events (if necessary)
@@ -73,7 +87,8 @@ static int fake_rotate_event(NET* net, String* packet, char* log_file_name,
 
   char* p = log_file_name+dirname_length(log_file_name);
   uint ident_len = (uint) strlen(p);
-  ulong event_len = ident_len + LOG_EVENT_HEADER_LEN + ROTATE_HEADER_LEN;
+  ulong event_len = ident_len + LOG_EVENT_HEADER_LEN + ROTATE_HEADER_LEN +
+    (do_checksum ? BINLOG_CHECKSUM_LEN : 0);
   int4store(header + SERVER_ID_OFFSET, server_id);
   int4store(header + EVENT_LEN_OFFSET, event_len);
   int2store(header + FLAGS_OFFSET, LOG_EVENT_ARTIFICIAL_F);
@@ -84,7 +99,19 @@ static int fake_rotate_event(NET* net, String* packet, char* log_file_name,
   packet->append(header, sizeof(header));
   int8store(buf+R_POS_OFFSET,position);
   packet->append(buf, ROTATE_HEADER_LEN);
-  packet->append(p,ident_len);
+  packet->append(p, ident_len);
+
+  if (do_checksum)
+  {
+    char b[BINLOG_CHECKSUM_LEN];
+    ha_checksum crc= my_checksum(0L, NULL, 0);
+    crc= my_checksum(crc, (uchar*)header, sizeof(header));
+    crc= my_checksum(crc, (uchar*)buf, ROTATE_HEADER_LEN);
+    crc= my_checksum(crc, (uchar*)p, ident_len);
+    int4store(b, crc);
+    packet->append(b, sizeof(b));
+  }
+
   if (my_net_write(net, (uchar*) packet->ptr(), packet->length()))
   {
     *errmsg = "failed on my_net_write()";
@@ -193,6 +220,86 @@ static int send_file(THD *thd)
 }
 
 
+/**
+   Helper for mysql_binlog_send(): recalculates the trailing checksum of a
+   Format_description event (length asserted) modified before being sent.
+*/
+inline void fix_checksum(String *packet, ulong ev_offset)
+{
+  /* crc covers the event except its trailing BINLOG_CHECKSUM_LEN bytes */
+  uint data_len = uint4korr(packet->ptr() + ev_offset + EVENT_LEN_OFFSET);
+  ha_checksum crc= my_checksum(0L, NULL, 0);
+  DBUG_ASSERT(data_len == 
+              LOG_EVENT_MINIMAL_HEADER_LEN + FORMAT_DESCRIPTION_HEADER_LEN +
+              BINLOG_CHECKSUM_ALG_DESC_LEN + BINLOG_CHECKSUM_LEN);
+  crc= my_checksum(crc, (uchar *)packet->ptr() + ev_offset, data_len -
+                   BINLOG_CHECKSUM_LEN);
+  int4store(packet->ptr() + ev_offset + data_len - BINLOG_CHECKSUM_LEN, crc);
+}
+
+/* Look up the user variable "master_binlog_checksum" in thd->user_vars. */
+static user_var_entry * get_binlog_checksum_uservar(THD * thd)
+{
+  LEX_STRING name=  { C_STRING_WITH_LEN("master_binlog_checksum")};
+  user_var_entry *entry= 
+    (user_var_entry*) my_hash_search(&thd->user_vars, (uchar*) name.str,
+                                  name.length);
+  return entry;
+}
+
+/**
+  Called from mysql_binlog_send to check whether the slave initiated the
+  checksum handshake, i.e. whether get_binlog_checksum_uservar() finds an entry.
+
+  @param[in]    thd  THD to access a user variable
+
+  @return        TRUE if handshake took place, FALSE otherwise
+*/
+
+static bool is_slave_checksum_aware(THD * thd)
+{
+  DBUG_ENTER("is_slave_checksum_aware");
+  user_var_entry *entry= get_binlog_checksum_uservar(thd);
+  DBUG_RETURN(entry? true  : false);
+}
+
+/**
+  Called from mysql_binlog_send to get the value of the master's
+  @@binlog_checksum as recorded at the time of the checksum handshake;
+  it is read from the user variable "master_binlog_checksum".
+
+  The value tells the master whether to compute or not, and the slave
+  to verify or not the first artificial Rotate event's checksum.
+
+  @param[in]    thd  THD to access a user variable
+
+  @return       @c enum enum_binlog_checksum_alg value of @@binlog_checksum,
+                or BINLOG_CHECKSUM_ALG_UNDEF if the user variable is not set
+*/
+
+static uint8 get_binlog_checksum_value_at_connect(THD * thd)
+{
+  uint8 ret;
+
+  DBUG_ENTER("get_binlog_checksum_value_at_connect");
+  user_var_entry *entry= get_binlog_checksum_uservar(thd);
+  if (!entry)
+  {
+    ret= BINLOG_CHECKSUM_ALG_UNDEF;
+  }
+  else
+  {
+    DBUG_ASSERT(entry->type == STRING_RESULT);
+    String str;
+    uint dummy_errors;
+    str.copy(entry->value, entry->length, &my_charset_bin, &my_charset_bin,
+             &dummy_errors);
+    ret= (uint8) find_type ((char*) str.ptr(), &binlog_checksum_typelib, 1) - 1;
+    DBUG_ASSERT(ret <= BINLOG_CHECKSUM_ALG_CRC32); // while it's just on CRC32 alg
+  }
+  DBUG_RETURN(ret);
+}
+
 /*
   Adjust the position pointer in the binary log file for all running slaves
 
@@ -354,6 +461,9 @@ Increase max_allowed_packet on master";
   case LOG_READ_TRUNC:
     *errmsg = "binlog truncated in the middle of event";
     break;
+  case LOG_READ_CHECKSUM_FAILURE:
+    *errmsg = "event read from binlog did not pass crc check";
+    break;
   default:
     *errmsg = "unknown error reading log event on the master";
     break;
@@ -452,10 +562,11 @@ void mysql_binlog_send(THD* thd, char* log_ident, my_off_t pos,
   mysql_cond_t *log_cond;
 
   bool binlog_can_be_corrupted= FALSE;
+  uint8 current_checksum_alg= BINLOG_CHECKSUM_ALG_UNDEF;
+  int old_max_allowed_packet= thd->variables.max_allowed_packet;
 #ifndef DBUG_OFF
   int left_events = max_binlog_dump_events;
 #endif
-  int old_max_allowed_packet= thd->variables.max_allowed_packet;
   DBUG_ENTER("mysql_binlog_send");
   DBUG_PRINT("enter",("log_ident: '%s'  pos: %ld", log_ident, (long) pos));
 
@@ -571,7 +682,8 @@ impossible position";
     given that we want minimum modification of 4.0, we send the normal
     and fake Rotates.
   */
-  if (fake_rotate_event(net, packet, log_file_name, pos, &errmsg))
+  if (fake_rotate_event(net, packet, log_file_name, pos, &errmsg,
+                        get_binlog_checksum_value_at_connect(thd)))
   {
     /*
        This error code is not perfect, as fake_rotate_event() does not
@@ -607,8 +719,8 @@ impossible position";
        Try to find a Format_description_log_event at the beginning of
        the binlog
      */
-     if (!(error = Log_event::read_log_event(&log, packet, log_lock)))
-     {
+    if (!(error = Log_event::read_log_event(&log, packet, log_lock, 0)))
+    { 
        /*
          The packet has offsets equal to the normal offsets in a
          binlog event + ev_offset (the first ev_offset characters are
@@ -619,6 +731,23 @@ impossible position";
                    (*packet)[EVENT_TYPE_OFFSET+ev_offset]));
        if ((*packet)[EVENT_TYPE_OFFSET+ev_offset] == FORMAT_DESCRIPTION_EVENT)
        {
+         current_checksum_alg= get_checksum_alg(packet->ptr() + ev_offset,
+                                                packet->length() - ev_offset);
+         DBUG_ASSERT(current_checksum_alg == BINLOG_CHECKSUM_ALG_OFF ||
+                     current_checksum_alg == BINLOG_CHECKSUM_ALG_UNDEF ||
+                     current_checksum_alg == BINLOG_CHECKSUM_ALG_CRC32);
+         if (!is_slave_checksum_aware(thd) &&
+             current_checksum_alg != BINLOG_CHECKSUM_ALG_OFF &&
+             current_checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF)
+         {
+           my_errno= ER_MASTER_FATAL_ERROR_READING_BINLOG;
+           errmsg= "Slave can not handle replication events with the checksum "
+             "that master is configured to log";
+           sql_print_warning("Master is configured to log replication events "
+                             "with checksum, but will not send such events to "
+                             "slaves that cannot process them");
+           goto err;
+         }
          binlog_can_be_corrupted= test((*packet)[FLAGS_OFFSET+ev_offset] &
                                        LOG_EVENT_BINLOG_IN_USE_F);
          (*packet)[FLAGS_OFFSET+ev_offset] &= ~LOG_EVENT_BINLOG_IN_USE_F;
@@ -634,6 +763,12 @@ impossible position";
           */
          int4store((char*) packet->ptr()+LOG_EVENT_MINIMAL_HEADER_LEN+
                    ST_CREATED_OFFSET+ev_offset, (ulong) 0);
+
+	 /* fix the checksum due to latest changes in header */
+	 if (current_checksum_alg != BINLOG_CHECKSUM_ALG_OFF &&
+             current_checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF)
+           fix_checksum(packet, ev_offset);
+
          /* send it */
          if (my_net_write(net, (uchar*) packet->ptr(), packet->length()))
          {
@@ -678,7 +813,8 @@ impossible position";
       goto err;
 
     my_off_t prev_pos= pos;
-    while (!(error = Log_event::read_log_event(&log, packet, log_lock)))
+    while (!(error = Log_event::read_log_event(&log, packet, log_lock,
+                                               current_checksum_alg)))
     {
       prev_pos= my_b_tell(&log);
 #ifndef DBUG_OFF
@@ -696,7 +832,8 @@ impossible position";
       if (coord)
         coord->pos= uint4korr(packet->ptr() + ev_offset + LOG_POS_OFFSET);
 
-      event_type= (Log_event_type)((*packet)[LOG_EVENT_OFFSET+ev_offset]);
+      event_type=
+        (Log_event_type)((uchar)(*packet)[LOG_EVENT_OFFSET+ev_offset]);
       DBUG_EXECUTE_IF("dump_thread_wait_before_send_xid",
                       {
                         if (event_type == XID_EVENT)
@@ -706,12 +843,35 @@ impossible position";
                             "now "
                             "wait_for signal.continue";
                           DBUG_ASSERT(opt_debug_sync_timeout > 0);
-                          DBUG_ASSERT(!debug_sync_set_action(current_thd,
+                          DBUG_ASSERT(!debug_sync_set_action(thd,
                                                              STRING_WITH_LEN(act)));
+                          const char act2[]=
+                            "now "
+                            "signal signal.continued";
+                          DBUG_ASSERT(!debug_sync_set_action(current_thd,
+                                                             STRING_WITH_LEN(act2)));
                         }
                       });
       if (event_type == FORMAT_DESCRIPTION_EVENT)
       {
+        current_checksum_alg= get_checksum_alg(packet->ptr() + ev_offset,
+                                               packet->length() - ev_offset);
+        DBUG_ASSERT(current_checksum_alg == BINLOG_CHECKSUM_ALG_OFF ||
+                    current_checksum_alg == BINLOG_CHECKSUM_ALG_UNDEF ||
+                    current_checksum_alg == BINLOG_CHECKSUM_ALG_CRC32);
+        if (!is_slave_checksum_aware(thd) &&
+            current_checksum_alg != BINLOG_CHECKSUM_ALG_OFF &&
+            current_checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF)
+        {
+          my_errno= ER_MASTER_FATAL_ERROR_READING_BINLOG;
+          errmsg= "Slave can not handle replication events with the checksum "
+            "that master is configured to log";
+          sql_print_warning("Master is configured to log replication events "
+                            "with checksum, but will not send such events to "
+                            "slaves that cannot process them");
+          goto err;
+        }
+
         binlog_can_be_corrupted= test((*packet)[FLAGS_OFFSET+ev_offset] &
                                       LOG_EVENT_BINLOG_IN_USE_F);
         (*packet)[FLAGS_OFFSET+ev_offset] &= ~LOG_EVENT_BINLOG_IN_USE_F;
@@ -719,46 +879,50 @@ impossible position";
       else if (event_type == STOP_EVENT)
         binlog_can_be_corrupted= FALSE;
 
-      pos = my_b_tell(&log);
-      if (RUN_HOOK(binlog_transmit, before_send_event,
-                   (thd, flags, packet, log_file_name, pos)))
+      if (event_type != ANNOTATE_ROWS_EVENT ||
+          (flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT))
       {
-        my_errno= ER_UNKNOWN_ERROR;
-        errmsg= "run 'before_send_event' hook failed";
-        goto err;
-      }
+        pos = my_b_tell(&log);
+        if (RUN_HOOK(binlog_transmit, before_send_event,
+                     (thd, flags, packet, log_file_name, pos)))
+        {
+          my_errno= ER_UNKNOWN_ERROR;
+          errmsg= "run 'before_send_event' hook failed";
+          goto err;
+        }
 
-      if (my_net_write(net, (uchar*) packet->ptr(), packet->length()))
-      {
-	errmsg = "Failed on my_net_write()";
-	my_errno= ER_UNKNOWN_ERROR;
-	goto err;
-      }
+        if (my_net_write(net, (uchar*) packet->ptr(), packet->length()))
+        {
+          errmsg = "Failed on my_net_write()";
+          my_errno= ER_UNKNOWN_ERROR;
+          goto err;
+        }
 
-      DBUG_EXECUTE_IF("dump_thread_wait_before_send_xid",
-                      {
-                        if (event_type == XID_EVENT)
+        DBUG_EXECUTE_IF("dump_thread_wait_before_send_xid",
                         {
-                          net_flush(net);
-                        }
-                      });
-
-      DBUG_PRINT("info", ("log event code %d", event_type));
-      if (event_type == LOAD_EVENT)
-      {
-	if (send_file(thd))
-	{
-	  errmsg = "failed in send_file()";
-	  my_errno= ER_UNKNOWN_ERROR;
-	  goto err;
-	}
-      }
+                          if (event_type == XID_EVENT)
+                          {
+                            net_flush(net);
+                          }
+                        });
+
+        DBUG_PRINT("info", ("log event code %d", event_type));
+        if (event_type == LOAD_EVENT)
+        {
+          if (send_file(thd))
+          {
+            errmsg = "failed in send_file()";
+            my_errno= ER_UNKNOWN_ERROR;
+            goto err;
+          }
+        }
 
-      if (RUN_HOOK(binlog_transmit, after_send_event, (thd, flags, packet)))
-      {
-        errmsg= "Failed to run hook 'after_send_event'";
-        my_errno= ER_UNKNOWN_ERROR;
-        goto err;
+        if (RUN_HOOK(binlog_transmit, after_send_event, (thd, flags, packet)))
+        {
+          errmsg= "Failed to run hook 'after_send_event'";
+          my_errno= ER_UNKNOWN_ERROR;
+          goto err;
+        }
       }
 
       /* reset transmit packet for next loop */
@@ -771,7 +935,8 @@ impossible position";
       of a crash ?). treat any corruption as EOF
     */
     if (binlog_can_be_corrupted &&
-        error != LOG_READ_MEM && error != LOG_READ_EOF)
+        (error != LOG_READ_MEM && error != LOG_READ_CHECKSUM_FAILURE &&
+         error != LOG_READ_EOF))
     {
       my_b_seek(&log, prev_pos);
       error=LOG_READ_EOF;
@@ -835,14 +1000,16 @@ impossible position";
 	*/
 
         mysql_mutex_lock(log_lock);
-        switch (error= Log_event::read_log_event(&log, packet, (mysql_mutex_t*) 0)) {
+        switch (error= Log_event::read_log_event(&log, packet, (mysql_mutex_t*) 0,
+                                                 current_checksum_alg)) {
 	case 0:
 	  /* we read successfully, so we'll need to send it to the slave */
           mysql_mutex_unlock(log_lock);
 	  read_packet = 1;
           if (coord)
             coord->pos= uint4korr(packet->ptr() + ev_offset + LOG_POS_OFFSET);
-          event_type= (Log_event_type)((*packet)[LOG_EVENT_OFFSET+ev_offset]);
+          event_type=
+            (Log_event_type)((uchar)(*packet)[LOG_EVENT_OFFSET+ev_offset]);
 	  break;
 
 	case LOG_READ_EOF:
@@ -913,7 +1080,9 @@ impossible position";
           goto err;
 	}
 
-	if (read_packet)
+	if (read_packet &&
+            (event_type != ANNOTATE_ROWS_EVENT ||
+             (flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)))
         {
           thd_proc_info(thd, "Sending binlog event to slave");
           pos = my_b_tell(&log);
@@ -995,7 +1164,7 @@ impossible position";
       */
       if ((file=open_binlog(&log, log_file_name, &errmsg)) < 0 ||
 	  fake_rotate_event(net, packet, log_file_name, BIN_LOG_HEADER_SIZE,
-                            &errmsg))
+                            &errmsg, current_checksum_alg))
       {
 	my_errno= ER_MASTER_FATAL_ERROR_READING_BINLOG;
 	goto err;
@@ -1798,7 +1967,8 @@ bool mysql_show_binlog_events(THD* thd)
       This code will fail on a mixed relay log (one which has Format_desc then
       Rotate then Format_desc).
     */
-    ev= Log_event::read_log_event(&log, (mysql_mutex_t*)0, description_event);
+    ev= Log_event::read_log_event(&log, (mysql_mutex_t*)0, description_event,
+                                   opt_master_verify_checksum);
     if (ev)
     {
       if (ev->get_type_code() == FORMAT_DESCRIPTION_EVENT)
@@ -1820,8 +1990,12 @@ bool mysql_show_binlog_events(THD* thd)
 
     for (event_count = 0;
          (ev = Log_event::read_log_event(&log, (mysql_mutex_t*) 0,
-                                         description_event)); )
+                                         description_event,
+                                         opt_master_verify_checksum)); )
     {
+      if (ev->get_type_code() == FORMAT_DESCRIPTION_EVENT)
+        description_event->checksum_alg= ev->checksum_alg;
+
       if (event_count >= limit_start &&
 	  ev->net_send(protocol, linfo.log_file_name, pos))
       {
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
index f4ca5826c65..6f85acca3e7 100644
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -1,5 +1,5 @@
-/* Copyright (c) 2000, 2011 Oracle and/or its affiliates. All rights reserved.
-   Copyright (c) 2009-2011 Monty Program Ab
+/* Copyright (c) 2000, 2010 Oracle and/or its affiliates.
+   2009-2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -50,6 +50,7 @@
 #include "sql_union.h"           // mysql_union
 #include "opt_subselect.h"
 #include "log_slow.h"
+#include "sql_derived.h"
 
 #include "debug_sync.h"          // DEBUG_SYNC
 #include <m_ctype.h>
@@ -60,31 +61,33 @@
 const char *join_type_str[]={ "UNKNOWN","system","const","eq_ref","ref",
 			      "MAYBE_REF","ALL","range","index","fulltext",
 			      "ref_or_null","unique_subquery","index_subquery",
-                              "index_merge"
-};
+                              "index_merge", "hash_ALL", "hash_range",
+                              "hash_index", "hash_index_merge" };
+
+const char *copy_to_tmp_table= "Copying to tmp table";
 
 struct st_sargable_param;
 
 static void optimize_keyuse(JOIN *join, DYNAMIC_ARRAY *keyuse_array);
-static bool make_join_statistics(JOIN *join, TABLE_LIST *leaves, COND *conds,
-				 DYNAMIC_ARRAY *keyuse);
+static bool make_join_statistics(JOIN *join, List<TABLE_LIST> &leaves, 
+                                 COND *conds, DYNAMIC_ARRAY *keyuse);
 static bool update_ref_and_keys(THD *thd, DYNAMIC_ARRAY *keyuse,
                                 JOIN_TAB *join_tab,
                                 uint tables, COND *conds,
-                                COND_EQUAL *cond_equal,
                                 table_map table_map, SELECT_LEX *select_lex,
                                 st_sargable_param **sargables);
+static bool sort_and_filter_keyuse(THD *thd, DYNAMIC_ARRAY *keyuse,
+                                   bool skip_unprefixed_keyparts);
 static int sort_keyuse(KEYUSE *a,KEYUSE *b);
 static bool create_ref_for_key(JOIN *join, JOIN_TAB *j, KEYUSE *org_keyuse,
 			       table_map used_tables);
-
 void best_access_path(JOIN *join, JOIN_TAB *s, 
                              table_map remaining_tables, uint idx, 
                              bool disable_jbuf, double record_count,
                              POSITION *pos, POSITION *loose_scan_pos);
 static void optimize_straight_join(JOIN *join, table_map join_tables);
 static bool greedy_search(JOIN *join, table_map remaining_tables,
-                             uint depth, uint prune_level);
+                          uint depth, uint prune_level);
 static bool best_extension_by_limited_search(JOIN *join,
                                              table_map remaining_tables,
                                              uint idx, double record_count,
@@ -103,23 +106,24 @@ C_MODE_END
 static bool find_best(JOIN *join,table_map rest_tables,uint index,
 		      double record_count,double read_time);
 static uint cache_record_length(JOIN *join,uint index);
-static double prev_record_reads(JOIN *join, uint idx, table_map found_ref);
 static bool get_best_combination(JOIN *join);
 static store_key *get_store_key(THD *thd,
 				KEYUSE *keyuse, table_map used_tables,
 				KEY_PART_INFO *key_part, uchar *key_buff,
 				uint maybe_null);
-static void make_outerjoin_info(JOIN *join);
+static bool make_outerjoin_info(JOIN *join);
 static Item*
 make_cond_after_sjm(Item *root_cond, Item *cond, table_map tables, table_map sjm_tables);
 static bool make_join_select(JOIN *join,SQL_SELECT *select,COND *item);
+static void revise_cache_usage(JOIN_TAB *join_tab);
 static bool make_join_readinfo(JOIN *join, ulonglong options, uint no_jbuf_after);
 static bool only_eq_ref_tables(JOIN *join, ORDER *order, table_map tables);
 static void update_depend_map(JOIN *join);
-static void update_depend_map(JOIN *join, ORDER *order);
+static void update_depend_map_for_order(JOIN *join, ORDER *order);
 static ORDER *remove_const(JOIN *join,ORDER *first_order,COND *cond,
 			   bool change_list, bool *simple_order);
-static int return_zero_rows(JOIN *join, select_result *res,TABLE_LIST *tables,
+static int return_zero_rows(JOIN *join, select_result *res, 
+                            List<TABLE_LIST> &tables,
                             List<Item> &fields, bool send_row,
                             ulonglong select_options, const char *info,
                             Item *having);
@@ -184,13 +188,18 @@ static int join_ft_read_first(JOIN_TAB *tab);
 static int join_ft_read_next(READ_RECORD *info);
 int join_read_always_key_or_null(JOIN_TAB *tab);
 int join_read_next_same_or_null(READ_RECORD *info);
-static COND *make_cond_for_table(Item *cond,table_map table,
-				 table_map used_table,
-                                 bool exclude_expensive_cond);
-static COND *make_cond_for_table_from_pred(Item *root_cond, Item *cond,
+static COND *make_cond_for_table(THD *thd, Item *cond,table_map table,
+                                 table_map used_table,
+                                 uint join_tab_idx_arg,
+                                 bool exclude_expensive_cond,
+                                 bool retain_ref_cond);
+static COND *make_cond_for_table_from_pred(THD *thd, Item *root_cond,
+                                           Item *cond,
                                            table_map tables,
                                            table_map used_table,
-                                           bool exclude_expensive_cond);
+                                           uint join_tab_idx_arg,
+                                           bool exclude_expensive_cond,
+                                           bool retain_ref_cond);
 
 static Item* part_of_refkey(TABLE *form,Field *field);
 uint find_shortest_key(TABLE *table, const key_map *usable_keys);
@@ -227,7 +236,7 @@ static ORDER *create_distinct_group(THD *thd, Item **ref_pointer_array,
                                     List<Item> &all_fields,
 				    bool *all_order_by_fields_used);
 static bool test_if_subpart(ORDER *a,ORDER *b);
-static TABLE *get_sort_by_table(ORDER *a,ORDER *b,TABLE_LIST *tables);
+static TABLE *get_sort_by_table(ORDER *a,ORDER *b,List<TABLE_LIST> &tables);
 static void calc_group_buffer(JOIN *join,ORDER *group);
 static bool make_group_fields(JOIN *main_join, JOIN *curr_join);
 static bool alloc_group_fields(JOIN *join,ORDER *group);
@@ -252,14 +261,13 @@ static bool update_sum_func(Item_sum **func);
 static void select_describe(JOIN *join, bool need_tmp_table,bool need_order,
 			    bool distinct, const char *message=NullS);
 static void add_group_and_distinct_keys(JOIN *join, JOIN_TAB *join_tab);
-void get_partial_join_cost(JOIN *join, uint idx, double *read_time_arg,
-                           double *record_count_arg);
 static uint make_join_orderinfo(JOIN *join);
-static int
-join_read_record_no_init(JOIN_TAB *tab);
+static bool generate_derived_keys(DYNAMIC_ARRAY *keyuse_array);
 
 Item_equal *find_item_equal(COND_EQUAL *cond_equal, Field *field,
                             bool *inherited_fl);
+JOIN_TAB *first_depth_first_tab(JOIN* join);
+JOIN_TAB *next_depth_first_tab(JOIN* join, JOIN_TAB* tab);
 
 /**
   This handles SELECT with and without UNION.
@@ -447,7 +455,7 @@ fix_inner_refs(THD *thd, List<Item> &all_fields, SELECT_LEX *select,
 */
 inline int setup_without_group(THD *thd, Item **ref_pointer_array,
 			       TABLE_LIST *tables,
-			       TABLE_LIST *leaves,
+			       List<TABLE_LIST> &leaves,
 			       List<Item> &fields,
 			       List<Item> &all_fields,
 			       COND **conds,
@@ -525,29 +533,70 @@ JOIN::prepare(Item ***rref_pointer_array,
   join_list= &select_lex->top_join_list;
   union_part= unit_arg->is_union();
 
+  if (select_lex->handle_derived(thd->lex, DT_PREPARE))
+    DBUG_RETURN(1);
+
   thd->lex->current_select->is_item_list_lookup= 1;
   /*
     If we have already executed SELECT, then it have not sense to prevent
     its table from update (see unique_table())
+    Affects only materialized derived tables.
   */
-  if (thd->derived_tables_processing)
-    select_lex->exclude_from_table_unique_test= TRUE;
-
   /* Check that all tables, fields, conds and order are ok */
-
   if (!(select_options & OPTION_SETUP_TABLES_DONE) &&
       setup_tables_and_check_access(thd, &select_lex->context, join_list,
-                                    tables_list, &select_lex->leaf_tables,
-                                    FALSE, SELECT_ACL, SELECT_ACL))
+                                    tables_list, select_lex->leaf_tables,
+                                    FALSE, SELECT_ACL, SELECT_ACL, FALSE))
       DBUG_RETURN(-1);
+  
+  /*
+    TRUE if the SELECT list mixes elements with and without grouping,
+    and there is no GROUP BY clause. Mixing non-aggregated fields with
+    aggregate functions in the SELECT list is a MySQL extension that
+    is allowed only if the ONLY_FULL_GROUP_BY sql mode is not set.
+  */
+  bool mixed_implicit_grouping= false;
+  if ((~thd->variables.sql_mode & MODE_ONLY_FULL_GROUP_BY) &&
+      select_lex->with_sum_func && !group_list)
+  {
+    List_iterator_fast <Item> select_it(fields_list);
+    Item *select_el; /* Element of the SELECT clause, can be an expression. */
+    bool found_field_elem= false;
+    bool found_sum_func_elem= false;
+
+    while ((select_el= select_it++))
+    {
+      if (select_el->with_sum_func)
+        found_sum_func_elem= true;
+      if (select_el->with_field)
+        found_field_elem= true;
+      if (found_sum_func_elem && found_field_elem)
+      {
+        mixed_implicit_grouping= true;
+        break;
+      }
+    }
+  }
+
+  table_count= select_lex->leaf_tables.elements;
  
-  TABLE_LIST *table_ptr;
-  for (table_ptr= select_lex->leaf_tables;
-       table_ptr;
-       table_ptr= table_ptr->next_leaf)
-    tables++;
+  TABLE_LIST *tbl;
+  List_iterator_fast<TABLE_LIST> li(select_lex->leaf_tables);
+  while ((tbl= li++))
+  {
+    //table_count++; /* Count the number of tables in the join. */
+    /*
+      If the query uses implicit grouping where the select list contains both
+      aggregate functions and non-aggregate fields, any non-aggregated field
+      may produce a NULL value. Set all fields of each table as nullable before
+      semantic analysis to take into account this change of nullability.
+    */
+    if (mixed_implicit_grouping)
+      tbl->table->maybe_null= 1;
+  }
 
-  if (setup_wild(thd, tables_list, fields_list, &all_fields, wild_num) ||
+  if ((wild_num && setup_wild(thd, tables_list, fields_list, &all_fields,
+                              wild_num)) ||
       select_lex->setup_ref_array(thd, og_num) ||
       setup_fields(thd, (*rref_pointer_array), fields_list, MARK_COLUMNS_READ,
 		   &all_fields, 1) ||
@@ -565,6 +614,13 @@ JOIN::prepare(Item ***rref_pointer_array,
     thd->where="having clause";
     thd->lex->allow_sum_func|= 1 << select_lex_arg->nest_level;
     select_lex->having_fix_field= 1;
+    /*
+      Wrap a lone field in HAVING: it may become the outer field of a subquery
+      that needs a persistent pointer to it, while the optimizer may change HAVING
+    */
+    if (having->type() == Item::REF_ITEM &&
+        ((Item_ref *)having)->ref_type() == Item_ref::REF)
+      wrap_ident(thd, &having);
     bool having_fix_rc= (!having->fixed &&
 			 (having->fix_fields(thd, &having) ||
 			  having->check_cols(1)));
@@ -654,10 +710,6 @@ JOIN::prepare(Item ***rref_pointer_array,
     }
   }
 
-  if (setup_ftfuncs(select_lex)) /* should be after having->fix_fields */
-    DBUG_RETURN(-1);
-  
-
   /*
     Check if there are references to un-aggregated columns when computing 
     aggregate functions with implicit grouping (there is no GROUP BY).
@@ -716,11 +768,36 @@ JOIN::prepare(Item ***rref_pointer_array,
   if (!procedure && result && result->prepare(fields_list, unit_arg))
     goto err;					/* purecov: inspected */
 
+  unit= unit_arg;
+  if (prepare_stage2())
+    goto err;
+
+  DBUG_RETURN(0); // All OK
+
+err:
+  delete procedure;                /* purecov: inspected */
+  procedure= 0;
+  DBUG_RETURN(-1);                /* purecov: inspected */
+}
+
+
+/**
+  Second phase of prepare, where we collect some statistics.
+
+  @details
+  This part is separate so that the statistics can be recalculated after
+  transforming a subquery during the optimization phase.
+*/
+
+bool JOIN::prepare_stage2()
+{
+  bool res= TRUE;
+  DBUG_ENTER("JOIN::prepare_stage2");
+
   /* Init join struct */
   count_field_types(select_lex, &tmp_table_param, all_fields, 0);
   ref_pointer_array_size= all_fields.elements*sizeof(Item*);
   this->group= group_list != 0;
-  unit= unit_arg;
 
   if (tmp_table_param.sum_func_count && !group_list)
     implicit_grouping= TRUE;
@@ -737,15 +814,64 @@ JOIN::prepare(Item ***rref_pointer_array,
   if (alloc_func_list())
     goto err;
 
-  DBUG_RETURN(0); // All OK
-
+  res= FALSE;
 err:
-  delete procedure;				/* purecov: inspected */
-  procedure= 0;
-  DBUG_RETURN(-1);				/* purecov: inspected */
+  DBUG_RETURN(res);				/* purecov: inspected */
 }
 
 
+void
+inject_jtbm_conds(JOIN *join, List<TABLE_LIST> *join_list, Item **join_where)
+{
+  TABLE_LIST *table;
+  NESTED_JOIN *nested_join;
+  List_iterator<TABLE_LIST> li(*join_list);
+  DBUG_ENTER("inject_jtbm_conds");
+
+  
+  while ((table= li++))
+  {
+    Item_in_subselect *item;
+    
+    if ((item= table->jtbm_subselect))
+    {
+      Item_in_subselect *subq_pred= item;
+      double rows;
+      double read_time;
+
+      subq_pred->in_strategy &= ~SUBS_IN_TO_EXISTS;
+      subq_pred->optimize(&rows, &read_time);
+
+      subq_pred->jtbm_read_time= read_time;
+      subq_pred->jtbm_record_count=rows;
+      subq_pred->is_jtbm_merged= TRUE;
+
+      subselect_hash_sj_engine *hash_sj_engine=
+        ((subselect_hash_sj_engine*)item->engine);
+      
+      
+      //repeat of convert_subq_to_jtbm:
+      table->table= hash_sj_engine->tmp_table;
+      table->table->pos_in_table_list= table;
+
+      setup_table_map(table->table, table, table->jtbm_table_no);
+
+      Item *sj_conds= hash_sj_engine->semi_join_conds;
+
+      (*join_where)= and_items(*join_where, sj_conds);
+      if (!(*join_where)->fixed)
+        (*join_where)->fix_fields(join->thd, join_where);
+      //parent_join->select_lex->where= parent_join->conds;
+    }
+
+    if ((nested_join= table->nested_join))
+    {
+      inject_jtbm_conds(join, &nested_join->join_list, join_where);
+    }
+  }
+  DBUG_VOID_RETURN;
+}
+
 /**
   global select optimisation.
 
@@ -761,11 +887,11 @@ err:
 int
 JOIN::optimize()
 {
-  bool need_distinct= TRUE;
   ulonglong select_opts_for_readinfo;
   uint no_jbuf_after;
-
   DBUG_ENTER("JOIN::optimize");
+
+  do_send_rows = (unit->select_limit_cnt) ? 1 : 0;
   // to prevent double initialization on EXPLAIN
   if (optimized)
     DBUG_RETURN(0);
@@ -774,10 +900,45 @@ JOIN::optimize()
 
   thd_proc_info(thd, "optimizing");
 
-  /* dump_TABLE_LIST_graph(select_lex, select_lex->leaf_tables); */
-  if (convert_join_subqueries_to_semijoins(this))
+  set_allowed_join_cache_types();
+  need_distinct= TRUE;
+
+  /* Run optimize phase for all derived tables/views used in this SELECT. */
+  if (select_lex->handle_derived(thd->lex, DT_OPTIMIZE))
+    DBUG_RETURN(1);
+
+  if (select_lex->first_cond_optimization)
+  {
+    //Do it only for the first execution
+    /* Merge all mergeable derived tables/views in this SELECT. */
+    if (select_lex->handle_derived(thd->lex, DT_MERGE))
+      DBUG_RETURN(TRUE);  
+    table_count= select_lex->leaf_tables.elements;
+    select_lex->update_used_tables();
+  }
+
+  if (transform_max_min_subquery())
     DBUG_RETURN(1); /* purecov: inspected */
-  /* dump_TABLE_LIST_graph(select_lex, select_lex->leaf_tables); */
+
+  if (select_lex->first_cond_optimization)
+  {
+    /* dump_TABLE_LIST_graph(select_lex, select_lex->leaf_tables); */
+    if (convert_join_subqueries_to_semijoins(this))
+      DBUG_RETURN(1); /* purecov: inspected */
+    /* dump_TABLE_LIST_graph(select_lex, select_lex->leaf_tables); */
+    select_lex->update_used_tables();
+
+    /* Save this info for the next executions */
+    if (select_lex->save_leaf_tables(thd))
+      DBUG_RETURN(1);
+  }
+  
+  eval_select_list_used_tables();
+  
+  table_count= select_lex->leaf_tables.elements;
+
+  if (setup_ftfuncs(select_lex)) /* should be after having->fix_fields */
+    DBUG_RETURN(-1);
 
   row_limit= ((select_distinct || order || group_list) ? HA_POS_ERROR :
 	      unit->select_limit_cnt);
@@ -785,7 +946,6 @@ JOIN::optimize()
   select_limit= unit->select_limit_cnt;
   if (having || (select_options & OPTION_FOUND_ROWS))
     select_limit= HA_POS_ERROR;
-  do_send_rows = (unit->select_limit_cnt) ? 1 : 0;
   // Ignore errors of execution if option IGNORE present
   if (thd->lex->ignore)
     thd->lex->current_select->no_error= 1;
@@ -811,7 +971,8 @@ JOIN::optimize()
     }
   }
 #endif
-  SELECT_LEX *sel= thd->lex->current_select;
+
+  SELECT_LEX *sel= select_lex;
   if (sel->first_cond_optimization)
   {
     /*
@@ -833,11 +994,16 @@ JOIN::optimize()
 
     sel->prep_where= conds ? conds->copy_andor_structure(thd) : 0;
 
+    sel->where= conds;
+
     if (arena)
       thd->restore_active_arena(arena, &backup);
   }
+  
+  inject_jtbm_conds(this, join_list, &conds);
 
-  conds= optimize_cond(this, conds, join_list, &cond_value, &cond_equal);   
+  conds= optimize_cond(this, conds, join_list, &cond_value, &cond_equal);
+     
   if (thd->is_error())
   {
     error= 1;
@@ -854,10 +1020,17 @@ JOIN::optimize()
       DBUG_RETURN(1);
     }
     if (select_lex->where)
+    {
       select_lex->cond_value= cond_value;
+      if (sel->where != conds && cond_value == Item::COND_OK)
+        thd->change_item_tree(&sel->where, conds);
+    }  
     if (select_lex->having)
+    {
       select_lex->having_value= having_value;
-
+      if (sel->having != having && having_value == Item::COND_OK)
+        thd->change_item_tree(&sel->having, having);    
+    }
     if (cond_value == Item::COND_FALSE || having_value == Item::COND_FALSE || 
         (!unit->select_limit_cnt && !(select_options & OPTION_FOUND_ROWS)))
     {						/* Impossible cond */
@@ -865,7 +1038,7 @@ JOIN::optimize()
                             "Impossible HAVING" : "Impossible WHERE"));
       zero_result_cause=  having_value == Item::COND_FALSE ?
                            "Impossible HAVING" : "Impossible WHERE";
-      tables= 0;
+      table_count= top_join_tab_count= 0;
       error= 0;
       goto setup_subq_exit;
     }
@@ -874,7 +1047,8 @@ JOIN::optimize()
 #ifdef WITH_PARTITION_STORAGE_ENGINE
   {
     TABLE_LIST *tbl;
-    for (tbl= select_lex->leaf_tables; tbl; tbl= tbl->next_leaf)
+    List_iterator_fast<TABLE_LIST> li(select_lex->leaf_tables);
+    while ((tbl= li++))
     {
       /* 
         If tbl->embedding!=NULL that means that this table is in the inner
@@ -911,11 +1085,12 @@ JOIN::optimize()
     */
     if ((res=opt_sum_query(thd, select_lex->leaf_tables, all_fields, conds)))
     {
+      DBUG_ASSERT(res >= 0);
       if (res == HA_ERR_KEY_NOT_FOUND)
       {
         DBUG_PRINT("info",("No matching min/max row"));
 	zero_result_cause= "No matching min/max row";
-        tables= 0;
+        table_count= top_join_tab_count= 0;
 	error=0;
         goto setup_subq_exit;
       }
@@ -925,18 +1100,11 @@ JOIN::optimize()
         DBUG_PRINT("error",("Error from opt_sum_query"));
         DBUG_RETURN(1);
       }
-      if (res < 0)
-      {
-        DBUG_PRINT("info",("No matching min/max row"));
-        zero_result_cause= "No matching min/max row";
-        tables= 0;
-        error=0;
-        goto setup_subq_exit;
-      }
+
       DBUG_PRINT("info",("Select tables optimized away"));
       zero_result_cause= "Select tables optimized away";
       tables_list= 0;				// All tables resolved
-      const_tables= tables;
+      const_tables= top_join_tab_count= table_count;
       /*
         Extract all table-independent conditions and replace the WHERE
         clause with them. All other conditions were computed by opt_sum_query
@@ -950,24 +1118,21 @@ JOIN::optimize()
       if (conds && !(thd->lex->describe & DESCRIBE_EXTENDED))
       {
         COND *table_independent_conds=
-          make_cond_for_table(conds, PSEUDO_TABLE_BITS, 0, FALSE);
+          make_cond_for_table(thd, conds, PSEUDO_TABLE_BITS, 0, MAX_TABLES,
+                              FALSE, FALSE);
         DBUG_EXECUTE("where",
                      print_where(table_independent_conds,
                                  "where after opt_sum_query()",
                                  QT_ORDINARY););
         conds= table_independent_conds;
       }
-      goto setup_subq_exit;
     }
   }
   if (!tables_list)
   {
     DBUG_PRINT("info",("No tables"));
     error= 0;
-    /* Create all structures needed for materialized subquery execution. */
-    if (setup_subquery_materialization())
-      DBUG_RETURN(1);
-    DBUG_RETURN(0);
+    goto setup_subq_exit;
   }
   error= -1;					// Error is sent to client
   sort_by_table= get_sort_by_table(order, group_list, select_lex->leaf_tables);
@@ -981,6 +1146,9 @@ JOIN::optimize()
     DBUG_RETURN(1);
   }
 
+  if (optimizer_flag(thd, OPTIMIZER_SWITCH_DERIVED_WITH_KEYS))
+    drop_unused_derived_keys();
+
   if (rollup.state != ROLLUP::STATE_NONE)
   {
     if (rollup_process_const_fields())
@@ -992,7 +1160,7 @@ JOIN::optimize()
   else
   {
     /* Remove distinct if only const tables */
-    select_distinct= select_distinct && (const_tables != tables);
+    select_distinct= select_distinct && (const_tables != table_count);
   }
 
   thd_proc_info(thd, "preparing");
@@ -1022,13 +1190,13 @@ JOIN::optimize()
   }
   if (const_tables && !thd->locked_tables_mode &&
       !(select_options & SELECT_NO_UNLOCK))
-    mysql_unlock_some_tables(thd, all_tables, const_tables);
+    mysql_unlock_some_tables(thd, table, const_tables);
   if (!conds && outer_join)
   {
     /* Handle the case where we have an OUTER JOIN without a WHERE */
     conds=new Item_int((longlong) 1,1);	// Always true
   }
-  select= make_select(*all_tables, const_table_map,
+  select= make_select(*table, const_table_map,
                       const_table_map, conds, 1, &error);
   if (error)
   {						/* purecov: inspected */
@@ -1038,7 +1206,10 @@ JOIN::optimize()
   }
   
   reset_nj_counters(this, join_list);
-  make_outerjoin_info(this);
+  if (make_outerjoin_info(this))
+  {
+    DBUG_RETURN(1);
+  }
 
   /*
     Among the equal fields belonging to the same multiple equality
@@ -1057,10 +1228,11 @@ JOIN::optimize()
   }
 
   /*
-    Permorm the the optimization on fields evaluation mentioned above
+    Perform the optimization on fields evaluation mentioned above
     for all on expressions.
-  */ 
-  for (JOIN_TAB *tab= join_tab + const_tables; tab < join_tab + tables ; tab++)
+  */
+  for (JOIN_TAB *tab= first_linear_tab(this, WITHOUT_CONST_TABLES); tab;
+       tab= next_linear_tab(this, tab, WITH_BUSH_ROOTS))
   {
     if (*tab->on_expr_ref)
     {
@@ -1071,6 +1243,39 @@ JOIN::optimize()
     }
   }
 
+  /*
+    Perform the optimization on fields evaluation mentioned above
+    for all used ref items.
+  */
+  for (JOIN_TAB *tab= first_linear_tab(this, WITHOUT_CONST_TABLES); tab;
+       tab= next_linear_tab(this, tab, WITH_BUSH_ROOTS))
+  {
+    uint key_copy_index=0;
+    for (uint i=0; i < tab->ref.key_parts; i++)
+    {
+      Item **ref_item_ptr= tab->ref.items+i;
+      Item *ref_item= *ref_item_ptr;
+      if (!ref_item->used_tables() && !(select_options & SELECT_DESCRIBE))
+        continue;
+      COND_EQUAL *equals= tab->first_inner ? tab->first_inner->cond_equal : 
+                                             cond_equal;
+      ref_item= substitute_for_best_equal_field(ref_item, equals, map2table);
+      ref_item->update_used_tables();
+      if (*ref_item_ptr != ref_item)
+      {
+        *ref_item_ptr= ref_item;
+        Item *item= ref_item->real_item();
+        store_key *key_copy= tab->ref.key_copy[key_copy_index];
+        if (key_copy->type() == store_key::FIELD_STORE_KEY)
+        {
+          store_key_field *field_copy= ((store_key_field *)key_copy);
+          field_copy->change_source_field((Item_field *) item);
+        }
+      }
+      key_copy_index++;
+    }
+  }
+
   if (conds && const_table_map != found_const_table_map &&
       (select_options & SELECT_DESCRIBE))
   {
@@ -1084,6 +1289,7 @@ JOIN::optimize()
   {
     zero_result_cause=
       "Impossible WHERE noticed after reading const tables";
+    select_lex->mark_const_derived(zero_result_cause);
     goto setup_subq_exit;
   }
 
@@ -1119,7 +1325,7 @@ JOIN::optimize()
 
      The FROM clause must contain a single non-constant table.
   */
-  if (tables - const_tables == 1 && (group_list || select_distinct) &&
+  if (table_count - const_tables == 1 && (group_list || select_distinct) &&
       !tmp_table_param.sum_func_count &&
       (!join_tab[const_tables].select ||
        !join_tab[const_tables].select->quick ||
@@ -1170,7 +1376,7 @@ JOIN::optimize()
     if (! hidden_group_fields && rollup.state == ROLLUP::STATE_NONE)
       select_distinct=0;
   }
-  else if (select_distinct && tables - const_tables == 1 &&
+  else if (select_distinct && table_count - const_tables == 1 &&
            rollup.state == ROLLUP::STATE_NONE)
   {
     /*
@@ -1303,7 +1509,7 @@ JOIN::optimize()
     When the WITH ROLLUP modifier is present, we cannot skip temporary table
     creation for the DISTINCT clause just because there are only const tables.
   */
-  need_tmp= ((const_tables != tables &&
+  need_tmp= ((const_tables != table_count &&
 	     ((select_distinct || !simple_order || !simple_group) ||
 	      (group_list && order) ||
 	      test(select_options & OPTION_BUFFER_RESULT))) ||
@@ -1317,7 +1523,7 @@ JOIN::optimize()
     Yet the current implementation of FORCE INDEX hints does not
     allow us to do it in a clean manner.
   */
-  no_jbuf_after= 1 ? tables : make_join_orderinfo(this);
+  no_jbuf_after= 1 ? table_count : make_join_orderinfo(this);
 
   // Don't use join buffering when we use MATCH
   select_opts_for_readinfo=
@@ -1331,8 +1537,7 @@ JOIN::optimize()
   if (!(select_options & SELECT_DESCRIBE))
     init_ftfuncs(thd, select_lex, test(order));
 
-  /* Create all structures needed for materialized subquery execution. */
-  if (setup_subquery_materialization())
+  if (optimize_unflattened_subqueries())
     DBUG_RETURN(1);
   
   int res;
@@ -1349,13 +1554,16 @@ JOIN::optimize()
   */
   if (need_tmp || select_distinct || group_list || order)
   {
-    for (uint i = const_tables; i < tables; i++)
-      join_tab[i].table->prepare_for_position();
+    for (uint i= 0; i < table_count; i++)
+    {
+      if (!(table[i]->map & const_table_map))
+        table[i]->prepare_for_position();
+    }
   }
 
   DBUG_EXECUTE("info",TEST_join(this););
 
-  if (const_tables != tables)
+  if (const_tables != table_count)
   {
     /*
       Because filesort always does a full table scan or a quick range scan
@@ -1418,7 +1626,7 @@ JOIN::optimize()
   if (select_options & SELECT_DESCRIBE)
   {
     error= 0;
-    DBUG_RETURN(0);
+    goto derived_exit;
   }
   having= 0;
 
@@ -1443,6 +1651,40 @@ JOIN::optimize()
     }
   }
 
+  error= 0;
+  DBUG_RETURN(0);
+
+setup_subq_exit:
+  /* Choose an execution strategy for this JOIN. */
+  if (!tables_list || !table_count)
+    choose_tableless_subquery_plan();
+  /*
+    Even with zero matching rows, subqueries in the HAVING clause may
+    need to be evaluated if there are aggregate functions in the query.
+  */
+  if (optimize_unflattened_subqueries())
+    DBUG_RETURN(1);
+  error= 0;
+
+derived_exit:
+  select_lex->mark_const_derived(zero_result_cause);
+  DBUG_RETURN(0);
+}
+
+
+/**
+  Create and initialize objects needed for the execution of a query plan.
+  Evaluate constant expressions not evaluated during optimization.
+*/
+
+int JOIN::init_execution()
+{
+  DBUG_ENTER("JOIN::init_execution");
+
+  DBUG_ASSERT(optimized);
+  DBUG_ASSERT(!(select_options & SELECT_DESCRIBE));
+  initialized= true;
+
   /* Create a tmp table if distinct or if the sort is too complicated */
   if (need_tmp)
   {
@@ -1533,8 +1775,8 @@ JOIN::optimize()
 
     if (exec_tmp_table1->distinct)
     {
-      table_map used_tables= thd->used_tables;
-      JOIN_TAB *last_join_tab= join_tab+tables-1;
+      table_map used_tables= select_list_used_tables;
+      JOIN_TAB *last_join_tab= join_tab + top_join_tab_count - 1;
       do
       {
 	if (used_tables & last_join_tab->table->map)
@@ -1558,19 +1800,6 @@ JOIN::optimize()
       DBUG_RETURN(-1);                         /* purecov: inspected */
   }
 
-  error= 0;
-  DBUG_RETURN(0);
-
-setup_subq_exit:
-  /*
-    Even with zero matching rows, subqueries in the HAVING clause may
-    need to be evaluated if there are aggregate functions in the
-    query. If we have planned to materialize the subquery, we need to
-    set it up properly before prematurely leaving optimize().
-  */
-  if (setup_subquery_materialization())
-    DBUG_RETURN(1);
-  error= 0;
   DBUG_RETURN(0);
 }
 
@@ -1605,9 +1834,8 @@ bool JOIN::setup_subquery_caches()
     if (conds)
       conds= conds->transform(&Item::expr_cache_insert_transformer,
                               (uchar*) thd);
-    for (JOIN_TAB *tab= join_tab + const_tables;
-         tab < join_tab + tables ;
-         tab++)
+    for (JOIN_TAB *tab= first_linear_tab(this, WITHOUT_CONST_TABLES); 
+         tab; tab= next_linear_tab(this, tab, WITH_BUSH_ROOTS))
     {
       if (tab->select_cond)
         tab->select_cond=
@@ -1675,6 +1903,62 @@ void JOIN::restore_tmp()
 }
 
 
+/*
+  Shrink join buffers used for preceding tables to reduce the occupied space
+
+  SYNOPSIS
+    shrink_join_buffers()
+      jt           table up to which the buffers are to be shrunk
+      curr_space   the size of the space used by the buffers for tables 1..jt
+      needed_space the size of the space that has to be used by these buffers
+
+  DESCRIPTION
+    The function makes an attempt to shrink all join buffers used for the
+    tables starting from the first up to jt to reduce the total size of the
+    space occupied by the buffers used for tables 1,...,jt  from curr_space
+    to needed_space.
+    The function assumes that the buffer for the table jt has not been
+    allocated yet.
+
+  RETURN
+    FALSE     if all buffers have been successfully shrunk
+    TRUE      otherwise
+*/
+  
+bool JOIN::shrink_join_buffers(JOIN_TAB *jt, 
+                               ulonglong curr_space,
+                               ulonglong needed_space)
+{
+  JOIN_CACHE *cache;
+  for (JOIN_TAB *tab= join_tab+const_tables; tab < jt; tab++)
+  {
+    cache= tab->cache;
+    if (cache)
+    { 
+      size_t buff_size;
+      if (needed_space < cache->get_min_join_buffer_size())
+        return TRUE;
+      if (cache->shrink_join_buffer_in_ratio(curr_space, needed_space))
+      { 
+        revise_cache_usage(tab);
+        return TRUE;
+      }
+      buff_size= cache->get_join_buffer_size();
+      curr_space-= buff_size;
+      needed_space-= buff_size;
+    }
+  }
+
+  cache= jt->cache;
+  DBUG_ASSERT(cache);
+  if (needed_space < cache->get_min_join_buffer_size())
+    return TRUE;
+  cache->set_join_buffer_size((size_t)needed_space);
+  
+  return FALSE;
+}
+
+
 int
 JOIN::reinit()
 {
@@ -1705,12 +1989,17 @@ JOIN::reinit()
     set_items_ref_array(items0);
 
   if (join_tab_save)
-    memcpy(join_tab, join_tab_save, sizeof(JOIN_TAB) * tables);
+    memcpy(join_tab, join_tab_save, sizeof(JOIN_TAB) * table_count);
 
   /* need to reset ref access state (see join_read_key) */
   if (join_tab)
-    for (uint i= 0; i < tables; i++)
-      join_tab[i].ref.key_err= TRUE;
+  {
+    for (JOIN_TAB *tab= first_linear_tab(this, WITH_CONST_TABLES); tab; 
+         tab= next_linear_tab(this, tab, WITH_BUSH_ROOTS))
+    {
+      tab->ref.key_err= TRUE;
+    }
+  }
 
   if (tmp_join)
     restore_tmp();
@@ -1728,11 +2017,10 @@ JOIN::reinit()
     /* Reset effect of possible no_rows_in_result() */
     List_iterator_fast<Item> it(fields_list);
     Item *item;
-
     no_rows_in_result_called= 0;
     while ((item= it++))
       item->restore_to_before_no_rows_in_result();
-  }  
+  }
 
   if (!(select_options & SELECT_DESCRIBE))
     init_ftfuncs(thd, select_lex, test(order));
@@ -1768,7 +2056,7 @@ JOIN::save_join_tab()
   if (!join_tab_save && select_lex->master_unit()->uncacheable)
   {
     if (!(join_tab_save= (JOIN_TAB*)thd->memdup((uchar*) join_tab,
-						sizeof(JOIN_TAB) * tables)))
+						sizeof(JOIN_TAB) * table_count)))
       return 1;
   }
   return 0;
@@ -1808,7 +2096,7 @@ JOIN::exec()
   }
   (void) result->prepare2(); // Currently, this cannot fail.
 
-  if (!tables_list && (tables || !select_lex->with_sum_func))
+  if (!tables_list && (table_count || !select_lex->with_sum_func))
   {                                           // Only test of functions
     if (select_options & SELECT_DESCRIBE)
       select_describe(this, FALSE, FALSE, FALSE,
@@ -1836,7 +2124,7 @@ JOIN::exec()
       {
 	if (do_send_rows &&
             (procedure ? (procedure->send_row(procedure_fields_list) ||
-             procedure->end_of_records()) : result->send_data(fields_list)))
+             procedure->end_of_records()) : result->send_data(fields_list)> 0))
 	  error= 1;
 	else
 	{
@@ -1861,9 +2149,19 @@ JOIN::exec()
     FOUND_ROWS() may be called. Never reset the examined row count here.
     It must be accumulated from all join iterations of all join parts.
   */
-  if (tables)
+  if (table_count)
     thd->limit_found_rows= 0;
 
+  /*
+    Evaluate expensive constant conditions that were not evaluated during
+    optimization. Do not evaluate them for EXPLAIN statements as these
+    conditions may be arbitrarily costly, and because the optimize phase
+    might not have produced a complete executable plan for EXPLAINs.
+  */
+  if (exec_const_cond && !(select_options & SELECT_DESCRIBE) &&
+      !exec_const_cond->val_int())
+    zero_result_cause= "Impossible WHERE noticed after reading const tables";
+
   if (zero_result_cause)
   {
     (void) return_zero_rows(this, result, select_lex->leaf_tables,
@@ -1871,10 +2169,31 @@ JOIN::exec()
 			    send_row_on_empty_set(),
 			    select_options,
 			    zero_result_cause,
-			    having);
+			    having ? having : tmp_having);
     DBUG_VOID_RETURN;
   }
 
+  /*
+    Evaluate all constant expressions with subqueries in the ORDER/GROUP clauses
+    to make sure that all subqueries return a single row. The evaluation itself
+    will trigger an error if that is not the case.
+  */
+  if (exec_const_order_group_cond.elements &&
+      !(select_options & SELECT_DESCRIBE))
+  {
+    List_iterator_fast<Item> const_item_it(exec_const_order_group_cond);
+    Item *cur_const_item;
+    while ((cur_const_item= const_item_it++))
+    {
+      cur_const_item->val_str(&cur_const_item->str_value);
+      if (thd->is_error())
+      {
+        error= thd->is_error();
+        DBUG_VOID_RETURN;
+      }
+    }
+  }
+
   if ((this->select_lex->options & OPTION_SCHEMA_TABLE) &&
       get_schema_tables_result(this, PROCESSED_BY_JOIN_EXEC))
     DBUG_VOID_RETURN;
@@ -1898,7 +2217,7 @@ JOIN::exec()
     }
     if (order && 
         (order != group_list || !(select_options & SELECT_BIG_RESULT)) &&
-	(const_tables == tables ||
+	(const_tables == table_count ||
  	 ((simple_order || skip_sort_order) &&
 	  test_if_skip_sort_order(&join_tab[const_tables], order,
 				  select_limit, 0, 
@@ -1909,9 +2228,17 @@ JOIN::exec()
     select_describe(this, need_tmp,
 		    order != 0 && !skip_sort_order,
 		    select_distinct,
-                    !tables ? "No tables used" : NullS);
+                    !table_count ? "No tables used" : NullS);
     DBUG_VOID_RETURN;
   }
+  else
+  {
+    /* it's a const select, materialize it. */
+    select_lex->mark_const_derived(zero_result_cause);
+  }
+
+  if (!initialized && init_execution())
+    DBUG_VOID_RETURN;
 
   JOIN *curr_join= this;
   List<Item> *curr_all_fields= &all_fields;
@@ -1940,10 +2267,10 @@ JOIN::exec()
     curr_tmp_table= exec_tmp_table1;
 
     /* Copy data to the temporary table */
-    thd_proc_info(thd, "Copying to tmp table");
+    thd_proc_info(thd, copy_to_tmp_table);
     DBUG_PRINT("info", ("%s", thd->proc_info));
     if (!curr_join->sort_and_group &&
-        curr_join->const_tables != curr_join->tables)
+        curr_join->const_tables != curr_join->table_count)
     {
       JOIN_TAB *first_tab= curr_join->join_tab + curr_join->const_tables;
       first_tab->sorted= test(first_tab->loosescan_match_tab);
@@ -2119,7 +2446,7 @@ JOIN::exec()
         DBUG_VOID_RETURN;
       curr_join->group_list= 0;
       if (!curr_join->sort_and_group &&
-          curr_join->const_tables != curr_join->tables)
+          curr_join->const_tables != curr_join->table_count)
       {
         JOIN_TAB *first_tab= curr_join->join_tab + curr_join->const_tables;
         first_tab->sorted= test(first_tab->loosescan_match_tab);
@@ -2132,7 +2459,7 @@ JOIN::exec()
 	DBUG_VOID_RETURN;
       }
       end_read_record(&curr_join->join_tab->read_record);
-      curr_join->const_tables= curr_join->tables; // Mark free for cleanup()
+      curr_join->const_tables= curr_join->table_count; // Mark free for cleanup()
       curr_join->join_tab[0].table= 0;           // Table is freed
       
       // No sum funcs anymore
@@ -2252,9 +2579,10 @@ JOIN::exec()
       table_map used_tables= (curr_join->const_table_map |
 			      curr_table->table->map);
 
-      Item* sort_table_cond= make_cond_for_table(curr_join->tmp_having,
+      Item* sort_table_cond= make_cond_for_table(thd, curr_join->tmp_having,
 						 used_tables,
-						 (table_map)0, FALSE);
+						 (table_map)0, MAX_TABLES,
+						 FALSE, FALSE);
       if (sort_table_cond)
       {
 	if (!curr_table->select)
@@ -2268,16 +2596,31 @@ JOIN::exec()
 		new Item_cond_and(curr_table->select->cond,
 				  sort_table_cond)))
 	    DBUG_VOID_RETURN;
-	  curr_table->select->cond->fix_fields(thd, 0);
 	}
+        if (curr_table->pre_idx_push_select_cond)
+	{
+          if (sort_table_cond->type() == Item::COND_ITEM)
+            sort_table_cond= sort_table_cond->copy_andor_structure(thd);           
+          if (!(curr_table->pre_idx_push_select_cond= 
+                new Item_cond_and(curr_table->pre_idx_push_select_cond,
+                                  sort_table_cond)))
+            DBUG_VOID_RETURN;            
+        }
+        if (curr_table->select->cond && !curr_table->select->cond->fixed)
+	  curr_table->select->cond->fix_fields(thd, 0);
+        if (curr_table->pre_idx_push_select_cond &&
+            !curr_table->pre_idx_push_select_cond->fixed)
+          curr_table->pre_idx_push_select_cond->fix_fields(thd, 0);
+          
         curr_table->set_select_cond(curr_table->select->cond, __LINE__);
 	curr_table->select_cond->top_level_item();
 	DBUG_EXECUTE("where",print_where(curr_table->select->cond,
 					 "select and having",
                                          QT_ORDINARY););
-	curr_join->tmp_having= make_cond_for_table(curr_join->tmp_having,
+	curr_join->tmp_having= make_cond_for_table(thd, curr_join->tmp_having,
 						   ~ (table_map) 0,
-						   ~used_tables, FALSE);
+						   ~used_tables, MAX_TABLES,
+						   FALSE, FALSE);
 	DBUG_EXECUTE("where",print_where(curr_join->tmp_having,
                                          "having after sort",
                                          QT_ORDINARY););
@@ -2293,7 +2636,7 @@ JOIN::exec()
 	  WHERE clause for any tables after the sorted one.
 	*/
 	JOIN_TAB *curr_table= &curr_join->join_tab[curr_join->const_tables+1];
-	JOIN_TAB *end_table= &curr_join->join_tab[curr_join->tables];
+	JOIN_TAB *end_table= &curr_join->join_tab[curr_join->top_join_tab_count];
 	for (; curr_table < end_table ; curr_table++)
 	{
 	  /*
@@ -2331,7 +2674,7 @@ JOIN::exec()
                             curr_join->group_list ? TRUE : FALSE))
 	DBUG_VOID_RETURN;
       sortorder= curr_join->sortorder;
-      if (curr_join->const_tables != curr_join->tables &&
+      if (curr_join->const_tables != curr_join->table_count &&
           !curr_join->join_tab[curr_join->const_tables].table->sort.io_cache)
       {
         /*
@@ -2398,9 +2741,11 @@ JOIN::destroy()
   {
     if (join_tab != tmp_join->join_tab)
     {
-      JOIN_TAB *tab, *end;
-      for (tab= join_tab, end= tab+tables ; tab != end ; tab++)
+      for (JOIN_TAB *tab= first_linear_tab(this, WITH_CONST_TABLES); tab; 
+           tab= next_linear_tab(this, tab, WITH_BUSH_ROOTS))
+      {
 	tab->cleanup();
+      }
     }
     tmp_join->tmp_join= 0;
     /*
@@ -2433,6 +2778,7 @@ JOIN::destroy()
 
 void JOIN::cleanup_item_list(List<Item> &items) const
 {
+  DBUG_ENTER("JOIN::cleanup_item_list");
   if (!items.is_empty())
   {
     List_iterator_fast<Item> it(items);
@@ -2440,6 +2786,7 @@ void JOIN::cleanup_item_list(List<Item> &items) const
     while ((item= it++))
       item->cleanup();
   }
+  DBUG_VOID_RETURN;
 }
 
 
@@ -2593,51 +2940,6 @@ err:
 }
 
 
-/**
-  Setup for execution all subqueries of a query, for which the optimizer
-  chose hash semi-join.
-
-  @details Iterate over all subqueries of the query, and if they are under an
-  IN predicate, and the optimizer chose to compute it via hash semi-join:
-  - try to initialize all data structures needed for the materialized execution
-    of the IN predicate,
-  - if this fails, then perform the IN=>EXISTS transformation which was
-    previously blocked during JOIN::prepare.
-
-  This method is part of the "code generation" query processing phase.
-
-  This phase must be called after substitute_for_best_equal_field() because
-  that function may replace items with other items from a multiple equality,
-  and we need to reference the correct items in the index access method of the
-  IN predicate.
-
-  @return Operation status
-  @retval FALSE     success.
-  @retval TRUE      error occurred.
-*/
-
-bool JOIN::setup_subquery_materialization()
-{
-  for (SELECT_LEX_UNIT *un= select_lex->first_inner_unit(); un;
-       un= un->next_unit())
-  {
-    for (SELECT_LEX *sl= un->first_select(); sl; sl= sl->next_select())
-    {
-      Item_subselect *subquery_predicate= sl->master_unit()->item;
-      if (subquery_predicate &&
-          subquery_predicate->substype() == Item_subselect::IN_SUBS)
-      {
-        Item_in_subselect *in_subs= (Item_in_subselect*) subquery_predicate;
-        if (in_subs->exec_method == Item_in_subselect::MATERIALIZATION &&
-            in_subs->setup_engine())
-          return TRUE;
-      }
-    }
-  }
-  return FALSE;
-}
-
-
 /*****************************************************************************
   Create JOIN_TABS, make a guess about the table types,
   Approximate how many records will be used in each table
@@ -2694,12 +2996,11 @@ typedef struct st_sargable_param
 */
 
 static bool
-make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
-		     DYNAMIC_ARRAY *keyuse_array)
+make_join_statistics(JOIN *join, List<TABLE_LIST> &tables_list,
+                     COND *conds, DYNAMIC_ARRAY *keyuse_array)
 {
-  int error;
+  int error= 0;
   TABLE *table;
-  TABLE_LIST *tables= tables_arg;
   uint i,table_count,const_count,key;
   table_map found_const_table_map, all_table_map, found_ref, refs;
   key_map const_ref, eq_part;
@@ -2710,10 +3011,14 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
   table_map no_rows_const_tables= 0;
   SARGABLE_PARAM *sargables= 0;
   JOIN_TAB *stat_vector[MAX_TABLES+1];
+  List_iterator<TABLE_LIST> ti(tables_list);
+  TABLE_LIST *tables;
   DBUG_ENTER("make_join_statistics");
 
-  table_count=join->tables;
-  stat=(JOIN_TAB*) join->thd->calloc(sizeof(JOIN_TAB)*table_count);
+  LINT_INIT(table); /* inited in all loops */
+  table_count=join->table_count;
+
+  stat=(JOIN_TAB*) join->thd->calloc(sizeof(JOIN_TAB)*(table_count));
   stat_ref=(JOIN_TAB**) join->thd->alloc(sizeof(JOIN_TAB*)*MAX_TABLES);
   table_vector=(TABLE**) join->thd->alloc(sizeof(TABLE*)*(table_count*2));
   if (!stat || !stat_ref || !table_vector)
@@ -2725,9 +3030,7 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
   found_const_table_map= all_table_map=0;
   const_count=0;
 
-  for (s= stat, i= 0;
-       tables;
-       s++, tables= tables->next_leaf, i++)
+  for (s= stat, i= 0; (tables= ti++); s++, i++)
   {
     TABLE_LIST *embedding= tables->embedding;
     stat_vector[i]=s;
@@ -2737,22 +3040,22 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
     s->needed_reg.init();
     table_vector[i]=s->table=table=tables->table;
     table->pos_in_table_list= tables;
-    error= table->file->info(HA_STATUS_VARIABLE | HA_STATUS_NO_LOCK);
+    error= tables->fetch_number_of_rows();
     if (error)
     {
       table->file->print_error(error, MYF(0));
       goto error;
     }
     table->quick_keys.clear_all();
+    table->intersect_keys.clear_all();
     table->reginfo.join_tab=s;
     table->reginfo.not_exists_optimize=0;
     bzero((char*) table->const_key_parts, sizeof(key_part_map)*table->s->keys);
     all_table_map|= table->map;
+    s->preread_init_done= FALSE;
     s->join=join;
-    s->info=0;					// For describe
 
     s->dependent= tables->dep_tables;
-    s->key_dependent= 0;
     if (tables->schema_table)
       table->file->stats.records= 2;
     table->quick_condition_rows= table->file->stats.records;
@@ -2762,9 +3065,11 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
     {
       /* s is the only inner table of an outer join */
 #ifdef WITH_PARTITION_STORAGE_ENGINE
-      if ((!table->file->stats.records || table->no_partitions_used) && !embedding)
+      if (!table->is_filled_at_execution() &&
+           (!table->file->stats.records || table->no_partitions_used) && !embedding)
 #else
-      if (!table->file->stats.records && !embedding)
+      if (!table->is_filled_at_execution() &&
+          !table->file->stats.records && !embedding)
 #endif
       {						// Empty table
         s->dependent= 0;                        // Ignore LEFT JOIN depend.
@@ -2778,12 +3083,23 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
         s->embedding_map|= embedding->nested_join->nj_map;
       continue;
     }
-    if (embedding && !(embedding->sj_on_expr && ! embedding->embedding))
+    if (embedding)
     {
       /* s belongs to a nested join, maybe to several embedded joins */
       s->embedding_map= 0;
+      bool inside_an_outer_join= FALSE;
       do
       {
+        /* 
+          If this is a semi-join nest, skip it, and proceed upwards. Maybe
+          we're in some outer join nest
+        */
+        if (embedding->sj_on_expr)
+        {
+          embedding= embedding->embedding;
+          continue;
+        }
+        inside_an_outer_join= TRUE;
         NESTED_JOIN *nested_join= embedding->nested_join;
         s->embedding_map|=nested_join->nj_map;
         s->dependent|= embedding->dep_tables;
@@ -2791,14 +3107,16 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
         outer_join|= nested_join->used_tables;
       }
       while (embedding);
-      continue;
+      if (inside_an_outer_join)
+        continue;
     }
 #ifdef WITH_PARTITION_STORAGE_ENGINE
     const bool no_partitions_used= table->no_partitions_used;
 #else
     const bool no_partitions_used= FALSE;
 #endif
-    if ((table->s->system || table->file->stats.records <= 1 ||
+    if (!table->is_filled_at_execution() && 
+        (table->s->system || table->file->stats.records <= 1 ||
          no_partitions_used) &&
 	!s->dependent &&
 	(table->file->ha_table_flags() & HA_STATS_RECORDS_IS_EXACT) &&
@@ -2808,6 +3126,7 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
       no_rows_const_tables |= table->map;
     }
   }
+
   stat_vector[i]=0;
   join->outer_join=outer_join;
 
@@ -2852,7 +3171,7 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
     {
       if (s->dependent & s->table->map)
       {
-        join->tables=0;			// Don't use join->table
+        join->table_count=0;			// Don't use join->table
         my_message(ER_WRONG_OUTER_JOIN, ER(ER_WRONG_OUTER_JOIN), MYF(0));
         goto error;
       }
@@ -2861,10 +3180,24 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
   }
 
   if (conds || outer_join)
-    if (update_ref_and_keys(join->thd, keyuse_array, stat, join->tables,
-                            conds, join->cond_equal,
-                            ~outer_join, join->select_lex, &sargables))
+  {
+    if (update_ref_and_keys(join->thd, keyuse_array, stat, join->table_count,
+                            conds, ~outer_join, join->select_lex, &sargables))
+      goto error;
+    /*
+      Keyparts without prefixes may be useful if this JOIN is a subquery, and
+      if the subquery may be executed via the IN-EXISTS strategy.
+    */
+    bool skip_unprefixed_keyparts=
+      !(join->is_in_subquery() &&
+        ((Item_in_subselect*)join->unit->item)->in_strategy & SUBS_IN_TO_EXISTS);
+
+    if (keyuse_array->elements &&
+        sort_and_filter_keyuse(join->thd, keyuse_array,
+                               skip_unprefixed_keyparts))
       goto error;
+    DBUG_EXECUTE("opt", print_keyuse_array(keyuse_array););
+  }
 
   join->const_table_map= no_rows_const_tables;
   join->const_tables= const_count;
@@ -2914,6 +3247,9 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
     {
       table=s->table;
 
+      if (table->is_filled_at_execution())
+        continue;
+
       /* 
         If equi-join condition by a key is null rejecting and after a
         substitution of a const table the key value happens to be null
@@ -2934,7 +3270,8 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
 	*/              
         while (keyuse->table == table)
         {
-          if (!(keyuse->val->used_tables() & ~join->const_table_map) &&
+          if (!keyuse->is_for_hash_join() && 
+              !(keyuse->val->used_tables() & ~join->const_table_map) &&
               keyuse->val->is_null() && keyuse->null_rejecting)
           {
             s->type= JT_CONST;
@@ -2955,7 +3292,9 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
 	  continue;
 	if (table->file->stats.records <= 1L &&
 	    (table->file->ha_table_flags() & HA_STATS_RECORDS_IS_EXACT) &&
-            !table->pos_in_table_list->embedding)
+            !table->pos_in_table_list->embedding &&
+	      !((outer_join & table->map) && 
+		(*s->on_expr_ref)->is_expensive()))
 	{					// system table
 	  int tmp= 0;
 	  s->type=JT_SYSTEM;
@@ -2977,9 +3316,14 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
 	s->type= JT_REF;
 	while (keyuse->table == table)
 	{
+          if (keyuse->is_for_hash_join())
+	  {
+            keyuse++;
+            continue;
+          }
 	  start_keyuse=keyuse;
 	  key=keyuse->key;
-	  s->keys.set_bit(key);               // QQ: remove this ?
+	  s->keys.set_bit(key);               // TODO: remove this ?
 
 	  refs=0;
           const_ref.clear_all();
@@ -3010,7 +3354,9 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
 	  {
             if (table->key_info[key].flags & HA_NOSAME)
             {
-	      if (const_ref == eq_part)
+	      if (const_ref == eq_part &&
+                  !((outer_join & table->map) &&
+                    (*s->on_expr_ref)->is_expensive()))
 	      {					// Found everything for ref.
 	        int tmp;
 	        ref_changed = 1;
@@ -3065,15 +3411,27 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
 
   for (s=stat ; s < stat_end ; s++)
   {
+    s->startup_cost= 0;
     if (s->type == JT_SYSTEM || s->type == JT_CONST)
     {
       /* Only one matching row */
-      s->found_records=s->records=s->read_time=1; s->worst_seeks=1.0;
+      s->found_records= s->records= 1;
+      s->read_time=1.0; 
+      s->worst_seeks=1.0;
       continue;
     }
     /* Approximate found rows and time to read them */
-    s->found_records=s->records=s->table->file->stats.records;
-    s->read_time=(ha_rows) s->table->file->scan_time();
+    if (s->table->is_filled_at_execution())
+    {
+      get_delayed_table_estimates(s->table, &s->records, &s->read_time,
+                                  &s->startup_cost);
+      s->found_records= s->records;
+      table->quick_condition_rows=s->records;
+    }
+    else
+    {
+       s->scan_time();
+    }
 
     /*
       Set a max range of how many seeks we can expect when using keys
@@ -3096,10 +3454,11 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
       Don't do range analysis if we're on the inner side of an outer join (2).
       Do range analysis if we're on the inner side of a semi-join (3).
     */
-    if (!s->const_keys.is_clear_all() &&                        // (1)
-        (!s->table->pos_in_table_list->embedding ||             // (2)
-         (s->table->pos_in_table_list->embedding &&             // (3)
-          s->table->pos_in_table_list->embedding->sj_on_expr))) // (3)
+    if (!s->const_keys.is_clear_all() &&                          // (1)
+        (!s->table->pos_in_table_list->embedding ||               // (2)
+         (s->table->pos_in_table_list->embedding &&               // (3)
+          s->table->pos_in_table_list->embedding->sj_on_expr)) && // (3)
+        !s->table->is_filled_at_execution())
     {
       ha_rows records;
       SQL_SELECT *select;
@@ -3137,7 +3496,7 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
       if (records != HA_POS_ERROR)
       {
 	s->found_records=records;
-	s->read_time= (ha_rows) (s->quick ? s->quick->read_time : 0.0);
+	s->read_time= s->quick ? s->quick->read_time : 0.0;
       }
       delete select;
     }
@@ -3148,28 +3507,51 @@ make_join_statistics(JOIN *join, TABLE_LIST *tables_arg, COND *conds,
 
   join->join_tab=stat;
   join->map2table=stat_ref;
-  join->all_tables= table_vector;
+  join->table= table_vector;
   join->const_tables=const_count;
   join->found_const_table_map=found_const_table_map;
 
-  if (join->const_tables != join->tables)
+  if (join->const_tables != join->table_count)
     optimize_keyuse(join, keyuse_array);
    
   if (optimize_semijoin_nests(join, all_table_map))
     DBUG_RETURN(TRUE); /* purecov: inspected */
 
-  /* Find an optimal join order of the non-constant tables. */
-  if (join->const_tables != join->tables)
-  {
-    if (choose_plan(join, all_table_map & ~join->const_table_map))
-      goto error;
-  }
-  else
   {
-    memcpy((uchar*) join->best_positions,(uchar*) join->positions,
-	   sizeof(POSITION)*join->const_tables);
-    join->best_read=1.0;
+    ha_rows records= 1;
+    SELECT_LEX_UNIT *unit= join->select_lex->master_unit();
+
+    /* Find an optimal join order of the non-constant tables. */
+    if (join->const_tables != join->table_count)
+    {
+      if (choose_plan(join, all_table_map & ~join->const_table_map))
+        goto error;
+    }
+    else
+    {
+      memcpy((uchar*) join->best_positions,(uchar*) join->positions,
+	     sizeof(POSITION)*join->const_tables);
+      join->record_count= 1.0;
+      join->best_read=1.0;
+    }
+  
+    if (!(join->select_options & SELECT_DESCRIBE) &&
+        unit->derived && unit->derived->is_materialized_derived())
+    {
+      /*
+        Calculate estimated number of rows for materialized derived
+        table/view.
+      */
+      for (i= 0; i < join->table_count ; i++)
+        records*= join->best_positions[i].records_read ?
+                  (ha_rows)join->best_positions[i].records_read : 1;
+      join->select_lex->increase_derived_records(records);
+    }
   }
+
+  if (join->choose_subquery_plan(all_table_map & ~join->const_table_map))
+    goto error;
+
   /* Generate an execution plan from the found optimal join order. */
   DBUG_RETURN(join->thd->killed || get_best_combination(join));
 
@@ -3180,8 +3562,12 @@ error:
     may not be assigned yet by this function (which is building join_tab).
     Dangling TABLE::reginfo.join_tab may cause part_of_refkey to choke. 
   */
-  for (tables= tables_arg; tables; tables= tables->next_leaf)
-    tables->table->reginfo.join_tab= NULL;
+  {    
+    TABLE_LIST *table;
+    List_iterator<TABLE_LIST> ti(tables_list);
+    while ((table= ti++))
+      table->table->reginfo.join_tab= NULL;
+  }
   DBUG_RETURN (1);
 }
 
@@ -3428,6 +3814,7 @@ static uint get_semi_join_select_list_index(Field *field)
     @param field           Field used in comparision
     @param eq_func         True if we used =, <=> or IS NULL
     @param value           Value used for comparison with field
+    @param num_values      Number of values[] that we are comparing against
     @param usable_tables   Tables which can be used for key optimization
     @param sargables       IN/OUT Array of found sargable candidates
 
@@ -3440,21 +3827,30 @@ static uint get_semi_join_select_list_index(Field *field)
 */
 
 static void
-add_key_field(KEY_FIELD **key_fields,uint and_level, Item_func *cond,
+add_key_field(JOIN *join,
+              KEY_FIELD **key_fields,uint and_level, Item_func *cond,
               Field *field, bool eq_func, Item **value, uint num_values,
               table_map usable_tables, SARGABLE_PARAM **sargables)
 {
-  uint exists_optimize= 0;
-  if (!(field->flags & PART_KEY_FLAG))
+  uint optimize= 0;  
+  if (eq_func &&
+      ((join->is_allowed_hash_join_access() &&
+        field->hash_join_is_possible()) ||
+       (field->table->pos_in_table_list->is_materialized_derived() &&
+        !field->table->created)))
+  {
+    optimize= KEY_OPTIMIZE_EQ;
+  }   
+  else if (!(field->flags & PART_KEY_FLAG))
   {
     // Don't remove column IS NULL on a LEFT JOIN table
     if (!eq_func || (*value)->type() != Item::NULL_ITEM ||
         !field->table->maybe_null || field->null_ptr)
       return;					// Not a key. Skip it
-    exists_optimize= KEY_OPTIMIZE_EXISTS;
+    optimize= KEY_OPTIMIZE_EXISTS;
     DBUG_ASSERT(num_values == 1);
   }
-  else
+  if (optimize != KEY_OPTIMIZE_EXISTS)
   {
     table_map used_tables=0;
     bool optimizable=0;
@@ -3471,12 +3867,12 @@ add_key_field(KEY_FIELD **key_fields,uint and_level, Item_func *cond,
       if (!eq_func || (*value)->type() != Item::NULL_ITEM ||
           !field->table->maybe_null || field->null_ptr)
 	return;					// Can't use left join optimize
-      exists_optimize= KEY_OPTIMIZE_EXISTS;
+      optimize= KEY_OPTIMIZE_EXISTS;
     }
     else
     {
       JOIN_TAB *stat=field->table->reginfo.join_tab;
-      key_map possible_keys=field->key_start;
+      key_map possible_keys=field->get_possible_keys();
       possible_keys.intersect(field->table->keys_in_use_for_query);
       stat[0].keys.merge(possible_keys);             // Add possible keys
 
@@ -3491,7 +3887,8 @@ add_key_field(KEY_FIELD **key_fields,uint and_level, Item_func *cond,
          Field BETWEEN ...
          Field IN ...
       */
-      stat[0].key_dependent|=used_tables;
+      if (field->flags & PART_KEY_FLAG)
+        stat[0].key_dependent|=used_tables;
 
       bool is_const=1;
       for (uint i=0; i<num_values; i++)
@@ -3514,30 +3911,21 @@ add_key_field(KEY_FIELD **key_fields,uint and_level, Item_func *cond,
         (*sargables)->arg_value= value;
         (*sargables)->num_values= num_values;
       }
+      if (!eq_func) // eq_func is NEVER true when num_values > 1
+        return;
+
       /*
-	We can't always use indexes when comparing a string index to a
-	number. cmp_type() is checked to allow compare of dates to numbers.
-        eq_func is NEVER true when num_values > 1
+	We can't use indexes when comparing a string index to a
+	number or two strings if the effective collation
+        of the operation differs from the field collation.
        */
-      if (!eq_func)
-        return;
-      if (field->result_type() == STRING_RESULT)
+
+      if (field->cmp_type() == STRING_RESULT)
       {
-        if ((*value)->result_type() != STRING_RESULT)
-        {
-          if (field->cmp_type() != (*value)->result_type())
-            return;
-        }
-        else
-        {
-          /*
-            We can't use indexes if the effective collation
-            of the operation differ from the field collation.
-          */
-          if (field->cmp_type() == STRING_RESULT &&
-              ((Field_str*)field)->charset() != cond->compare_collation())
+        if ((*value)->cmp_type() != STRING_RESULT)
             return;
-        }
+        if (((Field_str*)field)->charset() != cond->compare_collation())
+          return;
       }
     }
   }
@@ -3550,8 +3938,8 @@ add_key_field(KEY_FIELD **key_fields,uint and_level, Item_func *cond,
   (*key_fields)->field=		field;
   (*key_fields)->eq_func=	eq_func;
   (*key_fields)->val=		*value;
-  (*key_fields)->level=		and_level;
-  (*key_fields)->optimize=	exists_optimize;
+  (*key_fields)->level=         and_level;
+  (*key_fields)->optimize=      optimize;
   /*
     If the condition has form "tbl.keypart = othertbl.field" and 
     othertbl.field can be NULL, there will be no matches if othertbl.field 
@@ -3598,29 +3986,29 @@ add_key_field(KEY_FIELD **key_fields,uint and_level, Item_func *cond,
 */
 
 static void
-add_key_equal_fields(KEY_FIELD **key_fields, uint and_level,
-                     Item_func *cond, Item_field *field_item,
+add_key_equal_fields(JOIN *join, KEY_FIELD **key_fields, uint and_level,
+                     Item_func *cond, Item *field_item,
                      bool eq_func, Item **val,
                      uint num_values, table_map usable_tables,
                      SARGABLE_PARAM **sargables)
 {
-  Field *field= field_item->field;
-  add_key_field(key_fields, and_level, cond, field,
+  Field *field= ((Item_field *) (field_item->real_item()))->field;
+  add_key_field(join, key_fields, and_level, cond, field,
                 eq_func, val, num_values, usable_tables, sargables);
-  Item_equal *item_equal= field_item->item_equal;
+  Item_equal *item_equal= field_item->get_item_equal();
   if (item_equal)
   { 
     /*
       Add to the set of possible key values every substitution of
       the field for an equal field included into item_equal
     */
-    Item_equal_iterator it(*item_equal);
-    Item_field *item;
-    while ((item= it++))
+    Item_equal_fields_iterator it(*item_equal);
+    while (it++)
     {
-      if (!field->eq(item->field))
+      Field *equal_field= it.get_curr_field();
+      if (!field->eq(equal_field))
       {
-        add_key_field(key_fields, and_level, cond, item->field,
+        add_key_field(join, key_fields, and_level, cond, equal_field,
                       eq_func, val, num_values, usable_tables,
                       sargables);
       }
@@ -3646,7 +4034,7 @@ is_local_field (Item *field)
 {
   return field->real_item()->type() == Item::FIELD_ITEM
      && !(field->used_tables() & OUTER_REF_TABLE_BIT)
-     && !((Item_field *)field->real_item())->depended_from;
+    && !((Item_field *)field->real_item())->get_depended_from();
 }
 
 
@@ -3666,7 +4054,6 @@ is_local_field (Item *field)
   The primary reason for having and_level attribute is the OR operation which 
   uses and_level to mark KEY_FIELDs that should get into the result of the OR
   operation
-
 */
 
 static void
@@ -3775,10 +4162,10 @@ add_key_fields(JOIN *join, KEY_FIELD **key_fields, uint *and_level,
         condition is of the form::
         '<field> BETWEEN value[1] AND value[2]'
       */
-      if (is_local_field (values[0]))
+      if (is_local_field(values[0]))
       {
         field_item= (Item_field *) (values[0]->real_item());
-        add_key_equal_fields(key_fields, *and_level, cond_func,
+        add_key_equal_fields(join, key_fields, *and_level, cond_func,
                              field_item, equal_func, &values[1],
                              num_values, usable_tables, sargables);
       }
@@ -3789,10 +4176,10 @@ add_key_fields(JOIN *join, KEY_FIELD **key_fields, uint *and_level,
       */
       for (uint i= 1; i <= num_values; i++)
       {
-        if (is_local_field (values[i]))
+        if (is_local_field(values[i]))
         {
           field_item= (Item_field *) (values[i]->real_item());
-          add_key_equal_fields(key_fields, *and_level, cond_func,
+          add_key_equal_fields(join, key_fields, *and_level, cond_func,
                                field_item, equal_func, values,
                                1, usable_tables, sargables);
         }
@@ -3809,7 +4196,7 @@ add_key_fields(JOIN *join, KEY_FIELD **key_fields, uint *and_level,
         values--;
       DBUG_ASSERT(cond_func->functype() != Item_func::IN_FUNC ||
                   cond_func->argument_count() != 2);
-      add_key_equal_fields(key_fields, *and_level, cond_func,
+      add_key_equal_fields(join, key_fields, *and_level, cond_func,
                            (Item_field*) (cond_func->key_item()->real_item()),
                            0, values, 
                            cond_func->argument_count()-1,
@@ -3824,8 +4211,9 @@ add_key_fields(JOIN *join, KEY_FIELD **key_fields, uint *and_level,
 
     if (is_local_field (cond_func->arguments()[0]))
     {
-      add_key_equal_fields(key_fields, *and_level, cond_func,
-	                (Item_field*) (cond_func->arguments()[0])->real_item(),
+      add_key_equal_fields(join, key_fields, *and_level, cond_func,
+                           (Item_field*) (cond_func->arguments()[0])->
+                           real_item(),
 		           equal_func,
                            cond_func->arguments()+1, 1, usable_tables,
                            sargables);
@@ -3833,8 +4221,9 @@ add_key_fields(JOIN *join, KEY_FIELD **key_fields, uint *and_level,
     if (is_local_field (cond_func->arguments()[1]) &&
 	cond_func->functype() != Item_func::LIKE_FUNC)
     {
-      add_key_equal_fields(key_fields, *and_level, cond_func, 
-                       (Item_field*) (cond_func->arguments()[1])->real_item(),
+      add_key_equal_fields(join, key_fields, *and_level, cond_func, 
+                           (Item_field*) (cond_func->arguments()[1])->
+                           real_item(),
 		           equal_func,
                            cond_func->arguments(),1,usable_tables,
                            sargables);
@@ -3849,17 +4238,17 @@ add_key_fields(JOIN *join, KEY_FIELD **key_fields, uint *and_level,
       Item *tmp=new Item_null;
       if (unlikely(!tmp))                       // Should never be true
 	return;
-      add_key_equal_fields(key_fields, *and_level, cond_func,
-		    (Item_field*) (cond_func->arguments()[0])->real_item(),
-		    cond_func->functype() == Item_func::ISNULL_FUNC,
+      add_key_equal_fields(join, key_fields, *and_level, cond_func,
+                           (Item_field*) (cond_func->arguments()[0])->
+                           real_item(),
+                           cond_func->functype() == Item_func::ISNULL_FUNC,
 			   &tmp, 1, usable_tables, sargables);
     }
     break;
   case Item_func::OPTIMIZE_EQUAL:
     Item_equal *item_equal= (Item_equal *) cond;
     Item *const_item= item_equal->get_const();
-    Item_equal_iterator it(*item_equal);
-    Item_field *item;
+    Item_equal_fields_iterator it(*item_equal);
     if (const_item)
     {
       /*
@@ -3867,9 +4256,10 @@ add_key_fields(JOIN *join, KEY_FIELD **key_fields, uint *and_level,
         field1=const_item as a condition allowing an index access of the table
         with field1 by the keys value of field1.
       */   
-      while ((item= it++))
+      while (it++)
       {
-        add_key_field(key_fields, *and_level, cond_func, item->field,
+        Field *equal_field= it.get_curr_field();
+        add_key_field(join, key_fields, *and_level, cond_func, equal_field,
                       TRUE, &const_item, 1, usable_tables, sargables);
       }
     }
@@ -3881,17 +4271,18 @@ add_key_fields(JOIN *join, KEY_FIELD **key_fields, uint *and_level,
         field1=field2 as a condition allowing an index access of the table
         with field1 by the keys value of field2.
       */   
-      Item_equal_iterator fi(*item_equal);
-      while ((item= fi++))
+      Item_equal_fields_iterator fi(*item_equal);
+      while (fi++)
       {
-        Field *field= item->field;
+        Field *field= fi.get_curr_field();
+        Item *item;
         while ((item= it++))
         {
-          if (!field->eq(item->field))
+          Field *equal_field= it.get_curr_field();
+          if (!field->eq(equal_field))
           {
-            Item *tmp_item= item;
-            add_key_field(key_fields, *and_level, cond_func, field,
-                          TRUE, &tmp_item, 1, usable_tables,
+            add_key_field(join, key_fields, *and_level, cond_func, field,
+                          TRUE, &item, 1, usable_tables,
                           sargables);
           }
         }
@@ -3911,6 +4302,55 @@ max_part_bit(key_part_map bits)
   return found;
 }
 
+
+/**
+  Add a new keyuse to the specified array of KEYUSE objects
+
+  @param[in,out]  keyuse_array  array of keyuses to be extended 
+  @param[in]      key_field     info on the key use occurrence
+  @param[in]      key           key number for the keyuse to be added
+  @param[in]      part          key part for the keyuse to be added
+
+  @note
+  The function builds a new KEYUSE object for a key use utilizing the info
+  on the left and right parts of the given key use  extracted from the 
+  structure key_field, the key number and key part for this key use. 
+  The built object is added to the dynamic array keyuse_array.
+
+  @retval         0             the built object is successfully added
+  @retval         1             otherwise
+*/
+
+static bool
+add_keyuse(DYNAMIC_ARRAY *keyuse_array, KEY_FIELD *key_field,
+          uint key, uint part)
+{
+  KEYUSE keyuse;
+  Field *field= key_field->field;
+
+  keyuse.table= field->table;
+  keyuse.val= key_field->val;
+  keyuse.key= key;
+  if (!is_hash_join_key_no(key))
+  {
+    keyuse.keypart=part;
+    keyuse.keypart_map= (key_part_map) 1 << part;
+  }
+  else
+  {
+    keyuse.keypart= field->field_index;
+    keyuse.keypart_map= (key_part_map) 0;
+  }
+  keyuse.used_tables= key_field->val->used_tables();
+  keyuse.optimize= key_field->optimize & KEY_OPTIMIZE_REF_OR_NULL;
+  keyuse.ref_table_rows= 0;
+  keyuse.null_rejecting= key_field->null_rejecting;
+  keyuse.cond_guard= key_field->cond_guard;
+  keyuse.sj_pred_no= key_field->sj_pred_no;
+  return (insert_dynamic(keyuse_array,(uchar*) &keyuse));
+}
+
+
 /*
   Add all keys with uses 'field' for some keypart
   If field->and_level != and_level then only mark key_part as const_part
@@ -3921,11 +4361,10 @@ max_part_bit(key_part_map bits)
 */
 
 static bool
-add_key_part(DYNAMIC_ARRAY *keyuse_array,KEY_FIELD *key_field)
+add_key_part(DYNAMIC_ARRAY *keyuse_array, KEY_FIELD *key_field)
 {
   Field *field=key_field->field;
   TABLE *form= field->table;
-  KEYUSE keyuse;
 
   if (key_field->eq_func && !(key_field->optimize & KEY_OPTIMIZE_EXISTS))
   {
@@ -3941,21 +4380,24 @@ add_key_part(DYNAMIC_ARRAY *keyuse_array,KEY_FIELD *key_field)
       {
 	if (field->eq(form->key_info[key].key_part[part].field))
 	{
-	  keyuse.table= field->table;
-	  keyuse.val =  key_field->val;
-	  keyuse.key =  key;
-	  keyuse.keypart=part;
-	  keyuse.keypart_map= (key_part_map) 1 << part;
-	  keyuse.used_tables=key_field->val->used_tables();
-	  keyuse.optimize= key_field->optimize & KEY_OPTIMIZE_REF_OR_NULL;
-          keyuse.null_rejecting= key_field->null_rejecting;
-          keyuse.cond_guard= key_field->cond_guard;
-          keyuse.sj_pred_no= key_field->sj_pred_no;
-	  if (insert_dynamic(keyuse_array,(uchar*) &keyuse))
+          if (add_keyuse(keyuse_array, key_field, key, part))
             return TRUE;
 	}
       }
     }
+    if (field->hash_join_is_possible() &&
+        (key_field->optimize & KEY_OPTIMIZE_EQ) &&
+        key_field->val->used_tables())
+    {
+      /* 
+        If a key use is extracted from an equi-join predicate then it is
+        added not only as a key use for every index whose component can
+        be evaluated utilizing this key use, but also as a key use for
+        hash join. Such key uses are marked with a special key number. 
+      */    
+      if (add_keyuse(keyuse_array, key_field, get_hash_join_key_no(), 0))
+        return TRUE;
+    }
   }
   return FALSE;
 }
@@ -4036,6 +4478,9 @@ sort_keyuse(KEYUSE *a,KEYUSE *b)
     return (int) (a->table->tablenr - b->table->tablenr);
   if (a->key != b->key)
     return (int) (a->key - b->key);
+  if (a->key == MAX_KEY && b->key == MAX_KEY && 
+      a->used_tables != b->used_tables)
+    return (int) ((ulong) a->used_tables - (ulong) b->used_tables);
   if (a->keypart != b->keypart)
     return (int) (a->keypart - b->keypart);
   // Place const values before other ones
@@ -4142,11 +4587,10 @@ static void add_key_fields_for_nj(JOIN *join, TABLE_LIST *nested_join_table,
 
 static bool
 update_ref_and_keys(THD *thd, DYNAMIC_ARRAY *keyuse,JOIN_TAB *join_tab,
-                    uint tables, COND *cond, COND_EQUAL *cond_equal,
-                    table_map normal_tables, SELECT_LEX *select_lex,
-                    SARGABLE_PARAM **sargables)
+                    uint tables, COND *cond, table_map normal_tables,
+                    SELECT_LEX *select_lex, SARGABLE_PARAM **sargables)
 {
-  uint	and_level,i,found_eq_constant;
+  uint	and_level,i;
   KEY_FIELD *key_fields, *end, *field;
   uint sz;
   uint m= max(select_lex->max_equal_elems,1);
@@ -4186,19 +4630,21 @@ update_ref_and_keys(THD *thd, DYNAMIC_ARRAY *keyuse,JOIN_TAB *join_tab,
 
   if (my_init_dynamic_array(keyuse,sizeof(KEYUSE),20,64))
     return TRUE;
+
   if (cond)
   {
+    KEY_FIELD *saved_field= field;
     add_key_fields(join_tab->join, &end, &and_level, cond, normal_tables,
                    sargables);
     for (; field != end ; field++)
     {
-      if (add_key_part(keyuse,field))
-        return TRUE;
+
       /* Mark that we can optimize LEFT JOIN */
       if (field->val->type() == Item::NULL_ITEM &&
 	  !field->field->real_maybe_null())
 	field->field->table->reginfo.not_exists_optimize=1;
     }
+    field= saved_field;
   }
   for (i=0 ; i < tables ; i++)
   {
@@ -4242,71 +4688,85 @@ update_ref_and_keys(THD *thd, DYNAMIC_ARRAY *keyuse,JOIN_TAB *join_tab,
       return TRUE;
   }
 
-  /*
-    Sort the array of possible keys and remove the following key parts:
-    - ref if there is a keypart which is a ref and a const.
-      (e.g. if there is a key(a,b) and the clause is a=3 and b=7 and b=t2.d,
-      then we skip the key part corresponding to b=t2.d)
-    - keyparts without previous keyparts
-      (e.g. if there is a key(a,b,c) but only b < 5 (or a=2 and c < 3) is
-      used in the query, we drop the partial key parts from consideration).
-    Special treatment for ft-keys.
-  */
-  if (keyuse->elements)
-  {
-    KEYUSE key_end,*prev,*save_pos,*use;
+  return FALSE;
+}
 
-    my_qsort(keyuse->buffer,keyuse->elements,sizeof(KEYUSE),
-	  (qsort_cmp) sort_keyuse);
 
-    bzero((char*) &key_end,sizeof(key_end));    /* Add for easy testing */
-    if (insert_dynamic(keyuse,(uchar*) &key_end))
-      return TRUE;
+/**
+  Sort the array of possible keys and remove the following key parts:
+  - ref if there is a keypart which is a ref and a const.
+    (e.g. if there is a key(a,b) and the clause is a=3 and b=7 and b=t2.d,
+    then we skip the key part corresponding to b=t2.d)
+  - keyparts without previous keyparts, only if skip_unprefixed_keyparts
+    (e.g. if there is a key(a,b,c) but only b < 5 (or a=2 and c < 3) is
+    used in the query, we drop the partial key parts from consideration).
+  Special treatment for ft-keys. Key uses for hash join are always kept.
+*/
+
+static bool sort_and_filter_keyuse(THD *thd, DYNAMIC_ARRAY *keyuse, 
+                                   bool skip_unprefixed_keyparts)
+{
+  KEYUSE key_end, *prev, *save_pos, *use;
+  uint found_eq_constant, i;
+
+  DBUG_ASSERT(keyuse->elements);
+
+  my_qsort(keyuse->buffer, keyuse->elements, sizeof(KEYUSE),
+           (qsort_cmp) sort_keyuse);
+
+  bzero((char*) &key_end, sizeof(key_end));    /* Add for easy testing */
+  if (insert_dynamic(keyuse, (uchar*) &key_end))
+    return TRUE;
 
-    use=save_pos=dynamic_element(keyuse,0,KEYUSE*);
-    prev= &key_end;
-    found_eq_constant=0;
-    for (i=0 ; i < keyuse->elements-1 ; i++,use++)
+  if (optimizer_flag(thd, OPTIMIZER_SWITCH_DERIVED_WITH_KEYS))
+    generate_derived_keys(keyuse);
+
+  use= save_pos= dynamic_element(keyuse,0,KEYUSE*);
+  prev= &key_end;
+  found_eq_constant= 0;
+  for (i=0 ; i < keyuse->elements-1 ; i++,use++)
+  {
+    if (!use->is_for_hash_join())
     {
       if (!use->used_tables && use->optimize != KEY_OPTIMIZE_REF_OR_NULL)
-	use->table->const_key_parts[use->key]|= use->keypart_map;
+        use->table->const_key_parts[use->key]|= use->keypart_map;
       if (use->keypart != FT_KEYPART)
       {
-	if (use->key == prev->key && use->table == prev->table)
-	{
-	  if (prev->keypart+1 < use->keypart ||
-	      (prev->keypart == use->keypart && found_eq_constant))
-	    continue;				/* remove */
-	}
-	else if (use->keypart != 0)		// First found must be 0
-	  continue;
+        if (use->key == prev->key && use->table == prev->table)
+        {
+          if ((prev->keypart+1 < use->keypart && skip_unprefixed_keyparts) ||
+              (prev->keypart == use->keypart && found_eq_constant))
+            continue;				/* remove */
+        }
+        else if (use->keypart != 0 && skip_unprefixed_keyparts)
+          continue; /* remove - first found must be 0 */
       }
 
-#if defined(__GNUC__) && !MY_GNUC_PREREQ(4,4)
-      /*
-        Old gcc used a memcpy(), which is undefined if save_pos==use:
-        http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19410
-        http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39480
-      */
-      if (save_pos != use)
-#endif
-        *save_pos= *use;
-      prev=use;
+      prev= use;
       found_eq_constant= !use->used_tables;
-      /* Save ptr to first use */
-      if (!use->table->reginfo.join_tab->keyuse)
-	use->table->reginfo.join_tab->keyuse=save_pos;
       use->table->reginfo.join_tab->checked_keys.set_bit(use->key);
-      save_pos++;
     }
-    i=(uint) (save_pos-(KEYUSE*) keyuse->buffer);
-    (void) set_dynamic(keyuse,(uchar*) &key_end,i);
-    keyuse->elements=i;
+    /*
+      Old gcc used a memcpy(), which is undefined if save_pos==use:
+      http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19410
+      http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39480
+      This also disables a valgrind warning, so better to have the test.
+    */
+    if (save_pos != use)
+      *save_pos= *use;
+    /* Save ptr to first use */
+    if (!use->table->reginfo.join_tab->keyuse)
+      use->table->reginfo.join_tab->keyuse= save_pos;
+    save_pos++;
   }
-  DBUG_EXECUTE("opt", print_keyuse_array(keyuse););
+  i= (uint) (save_pos-(KEYUSE*) keyuse->buffer);
+  (void) set_dynamic(keyuse,(uchar*) &key_end,i);
+  keyuse->elements= i;
+
   return FALSE;
 }
 
+
 /**
   Update some values in keyuse for faster choose_plan() loop.
 */
@@ -4331,12 +4791,15 @@ static void optimize_keyuse(JOIN *join, DYNAMIC_ARRAY *keyuse_array)
 	(map= (keyuse->used_tables & ~join->const_table_map &
 	       ~OUTER_REF_TABLE_BIT)))
     {
-      uint tablenr;
-      for (tablenr=0 ; ! (map & 1) ; map>>=1, tablenr++) ;
-      if (map == 1)			// Only one table
+      uint n_tables= my_count_bits(map);
+      if (n_tables == 1)			// Only one table
       {
-	TABLE *tmp_table=join->all_tables[tablenr];
-	keyuse->ref_table_rows= max(tmp_table->file->stats.records, 100);
+        Table_map_iterator it(map);
+        int tablenr= it.next_bit();
+        DBUG_ASSERT(tablenr != Table_map_iterator::BITMAP_END);
+	TABLE *tmp_table=join->table[tablenr];
+        if (tmp_table) // already created
+          keyuse->ref_table_rows= max(tmp_table->file->stats.records, 100);
       }
     }
     /*
@@ -4372,7 +4835,7 @@ is_indexed_agg_distinct(JOIN *join, List<Item_field> *out_args)
   Item_sum **sum_item_ptr;
   bool result= false;
 
-  if (join->tables != 1 ||                    /* reference more than 1 table */
+  if (join->table_count != 1 ||                    /* reference more than 1 table */
       join->select_distinct ||                /* or a DISTINCT */
       join->select_lex->olap == ROLLUP_TYPE)  /* Check (B3) for ROLLUP */
     return false;
@@ -4521,6 +4984,35 @@ void set_position(JOIN *join,uint idx,JOIN_TAB *table,KEYUSE *key)
 }
 
 
+/* Estimate the number of rows of joined table 's' that are match candidates */
+
+inline
+ha_rows matching_candidates_in_table(JOIN_TAB *s, bool with_found_constraint)
+{
+  ha_rows records= s->found_records;   /* starting point of the estimate */
+  /*
+    If there is a filtering condition on the table (i.e. ref analyzer found
+    at least one "table.keyXpartY= exprZ", where exprZ refers only to tables
+    preceding this table in the join order we're now considering), then 
+    assume that 25% of the rows will be filtered out by this condition.
+
+    This heuristic is supposed to force tables used in exprZ to be before
+    this table in join order.
+  */
+  if (with_found_constraint)
+    records-= records/4;
+
+  /*
+    If quick_condition_rows differs from found_records, use it as the more
+    accurate estimate; it overrides (is not combined with) the 25% cut above.
+  */
+  if (s->table->quick_condition_rows != s->found_records)
+    records= s->table->quick_condition_rows;
+
+  return records;
+}
+
+
 /**
   Find the best access path for an extension of a partial execution
   plan and add this path to the plan.
@@ -4570,16 +5062,21 @@ best_access_path(JOIN      *join,
   double tmp;
   ha_rows rec;
   bool best_uses_jbuf= FALSE;
+  MY_BITMAP *eq_join_set= &s->table->eq_join_set;
+  KEYUSE *hj_start_key= 0;
 
   Loose_scan_opt loose_scan_opt;
   DBUG_ENTER("best_access_path");
   
+  bitmap_clear_all(eq_join_set);
+
   loose_scan_opt.init(join, s, remaining_tables);
   
   if (s->keyuse)
   {                                            /* Use key if possible */
+    KEYUSE *keyuse;
+    KEYUSE *start_key=0;
     TABLE *table= s->table;
-    KEYUSE *keyuse,*start_key=0;
     double best_records= DBL_MAX;
     uint max_key_part=0;
 
@@ -4587,15 +5084,33 @@ best_access_path(JOIN      *join,
     rec= s->records/MATCHING_ROWS_IN_OTHER_TABLE;  // Assumed records/key
     for (keyuse=s->keyuse ; keyuse->table == table ;)
     {
+      KEY *keyinfo;
       key_part_map found_part= 0;
       table_map found_ref= 0;
       uint key= keyuse->key;
-      KEY *keyinfo= table->key_info+key;
       bool ft_key=  (keyuse->keypart == FT_KEYPART);
       /* Bitmap of keyparts where the ref access is over 'keypart=const': */
       key_part_map const_part= 0;
       /* The or-null keypart in ref-or-null access: */
       key_part_map ref_or_null_part= 0;
+      if (is_hash_join_key_no(key))
+      {
+        /* 
+          Hash join as any join employing join buffer can be used to join
+          only those tables that are joined after the first non const table
+	*/  
+        if (!(remaining_tables & keyuse->used_tables) &&
+            idx > join->const_tables)
+        {
+          if (!hj_start_key)
+            hj_start_key= keyuse;
+          bitmap_set_bit(eq_join_set, keyuse->keypart);
+        }
+        keyuse++;
+        continue;
+      }
+
+      keyinfo= table->key_info+key;
 
       /* Calculate how many key segments of the current key we can use */
       start_key= keyuse;
@@ -4624,8 +5139,8 @@ best_access_path(JOIN      *join,
             if (!(keyuse->used_tables & ~join->const_table_map))
               const_part|= keyuse->keypart_map;
 
-            double tmp2= prev_record_reads(join, idx, (found_ref |
-                                                      keyuse->used_tables));
+            double tmp2= prev_record_reads(join->positions, idx,
+                                           (found_ref | keyuse->used_tables));
             if (tmp2 < best_prev_record_reads)
             {
               best_part_found_ref= keyuse->used_tables & ~join->const_table_map;
@@ -4665,7 +5180,7 @@ best_access_path(JOIN      *join,
           Really, there should be records=0.0 (yes!)
           but 1.0 would be probably safer
         */
-        tmp= prev_record_reads(join, idx, found_ref);
+        tmp= prev_record_reads(join->positions, idx, found_ref);
         records= 1.0;
       }
       else
@@ -4680,7 +5195,7 @@ best_access_path(JOIN      *join,
           max_key_part= (uint) ~0;
           if ((keyinfo->flags & (HA_NOSAME | HA_NULL_PART_KEY)) == HA_NOSAME)
           {
-            tmp = prev_record_reads(join, idx, found_ref);
+            tmp = prev_record_reads(join->positions, idx, found_ref);
             records=1.0;
           }
           else
@@ -4747,9 +5262,10 @@ best_access_path(JOIN      *join,
             tmp= records;
             set_if_smaller(tmp, (double) thd->variables.max_seeks_for_key);
             if (table->covering_keys.is_set(key))
-              tmp= table->file->keyread_time(key, 1, tmp);
+              tmp= table->file->keyread_time(key, 1, (ha_rows) tmp);
             else
-              tmp= table->file->read_time(key, 1, min(tmp,s->worst_seeks)-1);
+              tmp= table->file->read_time(key, 1,
+                                          (ha_rows) min(tmp,s->worst_seeks)-1);
             tmp*= record_count;
           }
         }
@@ -4910,16 +5426,18 @@ best_access_path(JOIN      *join,
             /* Limit the number of matched rows */
             set_if_smaller(tmp, (double) thd->variables.max_seeks_for_key);
             if (table->covering_keys.is_set(key))
-              tmp= table->file->keyread_time(key, 1, tmp);
+              tmp= table->file->keyread_time(key, 1, (ha_rows) tmp);
             else
-              tmp= table->file->read_time(key, 1, min(tmp,s->worst_seeks)-1);
+              tmp= table->file->read_time(key, 1,
+                                          (ha_rows) min(tmp,s->worst_seeks)-1);
             tmp*= record_count;
           }
           else
             tmp= best_time;                    // Do nothing
         }
-        loose_scan_opt.check_ref_access_part2(key, start_key, records, tmp);
 
+        tmp += s->startup_cost;
+        loose_scan_opt.check_ref_access_part2(key, start_key, records, tmp);
       } /* not ft_key */
       if (tmp + 0.0001 < best_time - records/(double) TIME_FOR_COMPARE)
       {
@@ -4934,6 +5452,41 @@ best_access_path(JOIN      *join,
     records= best_records;
   }
 
+  /* 
+    If there is no key to access the table, but there is an equi-join
+    predicate connecting the table with the previous tables then we
+    consider the possibility of using hash join.
+    We need also to check that:
+    (1) s is inner table of semi-join -> join cache is allowed for semijoins
+    (2) s is inner table of outer join -> join cache is allowed for outer joins
+  */  
+  if (idx > join->const_tables && best_key == 0 && 
+     !bitmap_is_clear_all(eq_join_set) &&  !disable_jbuf &&
+      (!s->emb_sj_nest ||                     
+       join->allowed_semijoin_with_cache) &&    // (1)
+      (!(s->table->map & join->outer_join) ||
+       join->allowed_outer_join_with_cache))    // (2)
+  {
+    double join_sel= 0.1;
+    /* Estimate the cost of  the hash join access to the table */
+    ha_rows rnd_records= matching_candidates_in_table(s, found_constraint);
+
+    tmp= s->quick ? s->quick->read_time : s->scan_time();
+    tmp+= (s->records - rnd_records)/(double) TIME_FOR_COMPARE;
+
+    /* We read the table as many times as join buffer becomes full. */
+    tmp*= (1.0 + floor((double) cache_record_length(join,idx) *
+                          record_count /
+                          (double) thd->variables.join_buff_size));
+    best_time= tmp + 
+               (record_count*join_sel) / TIME_FOR_COMPARE * rnd_records;
+    best= tmp;
+    records= rows2double(rnd_records);
+    best_key= hj_start_key;
+    best_ref_depends_map= 0;
+    best_uses_jbuf= TRUE;
+   }
+
   /*
     Don't test table scan if it can't be better.
     Prefer key lookup if we would use the same key for scanning.
@@ -4961,33 +5514,21 @@ best_access_path(JOIN      *join,
         Since we have a 'ref' access path, and FORCE INDEX instructs us to
         choose it over ALL/index, there is no need to consider a full table
         scan.
+    (5) Non-flattenable semi-joins: don't consider doing a scan of temporary
+        table if we had an option to make lookups into it. In real-world cases,
+        lookups are cheaper than full scans, but when the table is small, they
+        can be [considered to be] more expensive, which causes lookups not to 
+        be used for cases with small datasets, which is annoying.
   */
   if ((records >= s->found_records || best > s->read_time) &&            // (1)
       !(s->quick && best_key && s->quick->index == best_key->key &&      // (2)
         best_max_key_part >= s->table->quick_key_parts[best_key->key]) &&// (2)
       !((s->table->file->ha_table_flags() & HA_TABLE_SCAN_ON_INDEX) &&   // (3)
         ! s->table->covering_keys.is_clear_all() && best_key && !s->quick) &&// (3)
-      !(s->table->force_index && best_key && !s->quick))                 // (4)
+      !(s->table->force_index && best_key && !s->quick) &&               // (4)
+      !(best_key && s->table->pos_in_table_list->jtbm_subselect))        // (5)
   {                                             // Check full join
-    ha_rows rnd_records= s->found_records;
-    /*
-      If there is a filtering condition on the table (i.e. ref analyzer found
-      at least one "table.keyXpartY= exprZ", where exprZ refers only to tables
-      preceding this table in the join order we're now considering), then 
-      assume that 25% of the rows will be filtered out by this condition.
-
-      This heuristic is supposed to force tables used in exprZ to be before
-      this table in join order.
-    */
-    if (found_constraint)
-      rnd_records-= rnd_records/4;
-
-    /*
-      If applicable, get a more accurate estimate. Don't use the two
-      heuristics at once.
-    */
-    if (s->table->quick_condition_rows != s->found_records)
-      rnd_records= s->table->quick_condition_rows;
+    ha_rows rnd_records= matching_candidates_in_table(s, found_constraint);
 
     /*
       Range optimizer never proposes a RANGE if it isn't better
@@ -5015,7 +5556,7 @@ best_access_path(JOIN      *join,
     else
     {
       /* Estimate cost of reading table. */
-      tmp= s->table->file->scan_time();
+      tmp= s->scan_time();
       if ((s->table->map & join->outer_join) || disable_jbuf)     // Can't use join cache
       {
         /*
@@ -5044,6 +5585,7 @@ best_access_path(JOIN      *join,
       }
     }
 
+    tmp += s->startup_cost;
     /*
       We estimate the cost of evaluating WHERE clause for found records
       as record_count * rnd_records / TIME_FOR_COMPARE. This cost plus
@@ -5051,7 +5593,8 @@ best_access_path(JOIN      *join,
     */
     if (best == DBL_MAX ||
         (tmp  + record_count/(double) TIME_FOR_COMPARE*rnd_records <
-         best + record_count/(double) TIME_FOR_COMPARE*records))
+         (best_key->is_for_hash_join() ? best_time :
+          best + record_count/(double) TIME_FOR_COMPARE*records)))
     {
       /*
         If the table has a range (s->quick is set) make_join_select()
@@ -5066,7 +5609,7 @@ best_access_path(JOIN      *join,
                                                join->outer_join)));
     }
   }
-  
+
   /* Update the cost information for the current partial plan */
   pos->records_read= records;
   pos->read_time=    best;
@@ -5144,7 +5687,7 @@ choose_plan(JOIN *join, table_map join_tables)
     jtab_sort_func= straight_join ? join_tab_cmp_straight : join_tab_cmp;
   }
   my_qsort2(join->best_ref + join->const_tables,
-            join->tables - join->const_tables, sizeof(JOIN_TAB*),
+            join->table_count - join->const_tables, sizeof(JOIN_TAB*),
             jtab_sort_func, (void*)join->emb_sjm_nest);
   join->cur_sj_inner_tables= 0;
 
@@ -5320,7 +5863,7 @@ join_tab_cmp_embedded_first(const void *emb,  const void* ptr1, const void* ptr2
 static uint
 determine_search_depth(JOIN *join)
 {
-  uint table_count=  join->tables - join->const_tables;
+  uint table_count=  join->table_count - join->const_tables;
   uint search_depth;
   /* TODO: this value should be determined dynamically, based on statistics: */
   uint max_tables_for_exhaustive_opt= 7;
@@ -5366,6 +5909,7 @@ optimize_straight_join(JOIN *join, table_map join_tables)
 {
   JOIN_TAB *s;
   uint idx= join->const_tables;
+  bool disable_jbuf= join->thd->variables.join_cache_level == 0;
   double    record_count= 1.0;
   double    read_time=    0.0;
   POSITION  loose_scan_pos;
@@ -5373,7 +5917,7 @@ optimize_straight_join(JOIN *join, table_map join_tables)
   for (JOIN_TAB **pos= join->best_ref + idx ; (s= *pos) ; pos++)
   {
     /* Find the best access method from 's' to the current partial plan */
-    best_access_path(join, s, join_tables, idx, FALSE, record_count,
+    best_access_path(join, s, join_tables, idx, disable_jbuf, record_count,
                      join->positions + idx, &loose_scan_pos);
 
     /* compute the cost of the new plan extended with 's' */
@@ -5392,6 +5936,7 @@ optimize_straight_join(JOIN *join, table_map join_tables)
     read_time+= record_count;  // We have to make a temp table
   memcpy((uchar*) join->best_positions, (uchar*) join->positions,
          sizeof(POSITION)*idx);
+  join->record_count= record_count;
   join->best_read= read_time;
 }
 
@@ -5416,7 +5961,7 @@ optimize_straight_join(JOIN *join, table_map join_tables)
 
     All other cases are in-between these two extremes. Thus the parameter
     'search_depth' controlls the exhaustiveness of the search. The higher the
-    value, the longer the optimizaton time and possibly the better the
+    value, the longer the optimization time and possibly the better the
     resulting plan. The lower the value, the fewer alternative plans are
     estimated, but the more likely to get a bad QEP.
 
@@ -5573,40 +6118,102 @@ greedy_search(JOIN      *join,
 }
 
 
-/*
-  Calculate a cost of given partial join order
+/**
+  Get cost of execution and fanout produced by selected tables in the join
+  prefix (where prefix is defined as prefix in depth-first traversal)
  
-  SYNOPSIS
-    get_partial_join_cost()
-      join               IN    Join to use. join->positions holds the
-                               partial join order
-      idx                IN    # tables in the partial join order
-      read_time_arg      OUT   Store read time here 
-      record_count_arg   OUT   Store record count here
+  @param end_tab_idx               The number of last tab to be taken into
+                                   account (in depth-first traversal prefix)
+  @param filter_map                Bitmap of tables whose cost/fanout are to 
+                                   be taken into account.
+  @param read_time_arg     [out]   store read time here 
+  @param record_count_arg  [out]   store record count here
 
-  DESCRIPTION
-
-    This is needed for semi-join materialization code. The idea is that 
-    we detect sj-materialization after we've put all sj-inner tables into
-    the join prefix
-
-      prefix-tables semi-join-inner-tables  tN
-                                             ^--we're here
+  @note
 
-    and we'll need to get the cost of prefix-tables prefix again.
+  @returns
+    read_time_arg and record_count_arg contain the computed cost and fanout
 */
 
-void get_partial_join_cost(JOIN *join, uint n_tables, double *read_time_arg,
-                           double *record_count_arg)
+void JOIN::get_partial_cost_and_fanout(uint end_tab_idx,
+                                       table_map filter_map,
+                                       double *read_time_arg, 
+                                       double *record_count_arg)
 {
   double record_count= 1;
   double read_time= 0.0;
-  for (uint i= join->const_tables; i < n_tables + join->const_tables ; i++)
+  double sj_inner_fanout= 1.0;
+  JOIN_TAB *end_tab= NULL;
+  JOIN_TAB *tab;
+  uint i;
+  uint last_sj_table= MAX_TABLES;
+
+  /* 
+    Handle a special case where the join is degenerate, and produces no
+    records
+  */
+  if (table_count == 0)
+  {
+    *read_time_arg= 0.0;
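+    /*
+      We report 1: it is the pessimistic estimate (there might be grouping)
+      and safer, as we're less likely to hit edge cases in calculations.
+      NOTE(review): there is no 'return' after this block, so the loops
+      below still run and the outputs may be assigned again - confirm intent.
+    */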
+    *record_count_arg=1.0;
+  }
+
+  for (tab= first_depth_first_tab(this), i= const_tables;
+       tab;
+       tab= next_depth_first_tab(this, tab), i++)
+  {
+    end_tab= tab;
+    if (i == end_tab_idx)
+      break;
+  }
+
+  for (tab= first_depth_first_tab(this), i= const_tables;
+       (i <= end_tab_idx && tab);
+       tab= next_depth_first_tab(this, tab), i++)
   {
-    if (join->best_positions[i].records_read)
+    /* 
+      We've entered the SJM nest that contains the end_tab. The caller is
+      actually 
+      - interested in fanout inside the nest (because that's how many times 
+        we'll invoke the attached WHERE conditions)
+      - not interested in cost
+    */
+    if (end_tab->bush_root_tab && end_tab->bush_root_tab == tab)
+    {
+      /* Ok, end_tab is inside SJM nest and we're entering that nest now */
+      record_count= 1.0;
+      read_time= 0.0;
+    }
+    
+    /* 
+      Ignore fanout (but not cost) from sj-inner tables, as long as 
+      the range that processes them finishes before the end_tab
+    */
+    if (tab->sj_strategy != SJ_OPT_NONE)
+    {
+      sj_inner_fanout= 1.0;
+      last_sj_table= i + tab->n_sj_tables;
+    }
+
+    if (tab->records_read && (tab->table->map & filter_map))
+    {
+      record_count *= tab->records_read;
+      read_time += tab->read_time;
+      if (tab->emb_sj_nest)
+        sj_inner_fanout *= tab->records_read;
+    }
+
+    if (i == last_sj_table)
     {
-      record_count *= join->best_positions[i].records_read;
-      read_time += join->best_positions[i].read_time;
+      record_count /= sj_inner_fanout;
+      sj_inner_fanout= 1.0;
+      last_sj_table= MAX_TABLES;
     }
   }
   *read_time_arg= read_time;// + record_count / TIME_FOR_COMPARE;
@@ -5614,6 +6221,34 @@ void get_partial_join_cost(JOIN *join, uint n_tables, double *read_time_arg,
 }
 
 
+/*
+  Get prefix cost and fanout. This function is different from
+  get_partial_cost_and_fanout:
+   - it operates on a JOIN that hasn't yet finished its optimization phase (in
+     particular, fix_semijoin_strategies_for_picked_join_order() and
+     get_best_combination() haven't been called)
+   - it assumes that the join prefix doesn't have any semi-join plans
+
+  These assumptions are met by the caller of the function.
+*/
+
+void JOIN::get_prefix_cost_and_fanout(uint n_tables, 
+                                      double *read_time_arg,
+                                      double *record_count_arg)
+{
+  double record_count= 1;
+  double read_time= 0.0;
+  for (uint i= const_tables; i < n_tables + const_tables ; i++) /* n_tables positions after the const tables */
+  {
+    if (best_positions[i].records_read)   /* positions with 0 records_read are skipped */
+    {
+      record_count *= best_positions[i].records_read;   /* fanout: product */
+      read_time += best_positions[i].read_time;         /* cost: sum */
+    }
+  }
+  *read_time_arg= read_time;// + record_count / TIME_FOR_COMPARE;
+  *record_count_arg= record_count;
+}
 
 
 /**
@@ -5758,10 +6393,15 @@ best_extension_by_limited_search(JOIN      *join,
   JOIN_TAB *s;
   double best_record_count= DBL_MAX;
   double best_read_time=    DBL_MAX;
+  bool disable_jbuf= join->thd->variables.join_cache_level == 0;
 
   DBUG_EXECUTE("opt", print_plan(join, idx, record_count, read_time, read_time,
                                 "part_plan"););
 
+  /* 
+    If we are searching for the execution plan of a materialized semi-join nest
+    then allowed_tables contains bits only for the tables from this nest.
+  */
   table_map allowed_tables= ~(table_map)0;
   if (join->emb_sjm_nest)
     allowed_tables= join->emb_sjm_nest->sj_inner_tables & ~join->const_table_map;
@@ -5779,8 +6419,8 @@ best_extension_by_limited_search(JOIN      *join,
 
       /* Find the best access method from 's' to the current partial plan */
       POSITION loose_scan_pos;
-      best_access_path(join, s, remaining_tables, idx, FALSE, record_count, 
-                       join->positions + idx, &loose_scan_pos);
+      best_access_path(join, s, remaining_tables, idx, disable_jbuf,
+                       record_count, join->positions + idx, &loose_scan_pos);
 
       /* Compute the cost of extending the plan with 's' */
 
@@ -5820,7 +6460,7 @@ best_extension_by_limited_search(JOIN      *join,
           if (best_record_count >= current_record_count &&
               best_read_time >= current_read_time &&
               /* TODO: What is the reasoning behind this condition? */
-              (!(s->key_dependent & remaining_tables) ||
+              (!(s->key_dependent & allowed_tables & remaining_tables) ||
                join->positions[idx].records_read < 2.0))
           {
             best_record_count= current_record_count;
@@ -5868,6 +6508,7 @@ best_extension_by_limited_search(JOIN      *join,
         {
           memcpy((uchar*) join->best_positions, (uchar*) join->positions,
                  sizeof(POSITION) * (idx + 1));
+          join->record_count= current_record_count;
           join->best_read= current_read_time - 0.001;
         }
         DBUG_EXECUTE("opt", print_plan(join, idx+1,
@@ -5924,6 +6565,7 @@ find_best(JOIN *join,table_map rest_tables,uint idx,double record_count,
 
   JOIN_TAB *s;
   double best_record_count=DBL_MAX,best_read_time=DBL_MAX;
+  bool disable_jbuf= join->thd->variables.join_cache_level == 0;
   for (JOIN_TAB **pos=join->best_ref+idx ; (s=*pos) ; pos++)
   {
     table_map real_table_bit=s->table->map;
@@ -5932,7 +6574,7 @@ find_best(JOIN *join,table_map rest_tables,uint idx,double record_count,
     {
       double records, best;
       POSITION loose_scan_pos;
-      best_access_path(join, s, rest_tables, idx, FALSE, record_count, 
+      best_access_path(join, s, rest_tables, idx, disable_jbuf, record_count, 
                        join->positions + idx, &loose_scan_pos);
       records= join->positions[idx].records_read;
       best= join->positions[idx].read_time;
@@ -5976,15 +6618,16 @@ find_best(JOIN *join,table_map rest_tables,uint idx,double record_count,
   Find how much space the prevous read not const tables takes in cache.
 */
 
-void calc_used_field_length(THD *thd, JOIN_TAB *join_tab)
+void JOIN_TAB::calc_used_field_length(bool max_fl)
 {
-  uint null_fields,blobs,fields,rec_length;
+  uint null_fields,blobs,fields;
+  ulong rec_length;
   Field **f_ptr,*field;
   uint uneven_bit_fields;
-  MY_BITMAP *read_set= join_tab->table->read_set;
+  MY_BITMAP *read_set= table->read_set;
 
   uneven_bit_fields= null_fields= blobs= fields= rec_length=0;
-  for (f_ptr=join_tab->table->field ; (field= *f_ptr) ; f_ptr++)
+  for (f_ptr=table->field ; (field= *f_ptr) ; f_ptr++)
   {
     if (bitmap_is_set(read_set, field->field_index))
     {
@@ -6001,24 +6644,106 @@ void calc_used_field_length(THD *thd, JOIN_TAB *join_tab)
     }
   }
   if (null_fields || uneven_bit_fields)
-    rec_length+=(join_tab->table->s->null_fields+7)/8;
-  if (join_tab->table->maybe_null)
+    rec_length+=(table->s->null_fields+7)/8;
+  if (table->maybe_null)
     rec_length+=sizeof(my_bool);
-  if (blobs)
+  if (max_fl)
   {
-    uint blob_length=(uint) (join_tab->table->file->stats.mean_rec_length-
-			     (join_tab->table->s->reclength-rec_length));
-    rec_length+=(uint) max(4,blob_length);
-  }  
+    // TODO: to improve this estimate for max expected length 
+    if (blobs)
+    {
+      ulong blob_length= table->file->stats.mean_rec_length;
+      if (ULONG_MAX - rec_length > blob_length)
+        rec_length+=  blob_length;
+      else
+        rec_length= ULONG_MAX;
+    }
+    max_used_fieldlength= rec_length;
+  } 
+  else if (table->file->stats.mean_rec_length)           
+    set_if_smaller(rec_length, table->file->stats.mean_rec_length);
+      
   /*
-    psergey-todo: why we don't count here rowid that we might need to store
-    when using DuplicateElimination?
+    TODO: why we don't count here rowid that we might need to store when 
+    using DuplicateElimination?
   */
-  join_tab->used_fields=fields;
-  join_tab->used_fieldlength=rec_length;
-  join_tab->used_blobs=blobs;
-  join_tab->used_null_fields= null_fields;
-  join_tab->used_uneven_bit_fields= uneven_bit_fields;
+  used_fields=fields;
+  used_fieldlength=rec_length;
+  used_blobs=blobs;
+  used_null_fields= null_fields;
+  used_uneven_bit_fields= uneven_bit_fields;
+}
+
+
+/**
+  @brief
+  Extract pushdown conditions for a table scan
+
+  @details
+  This function extracts pushdown conditions usable when this table is scanned.
+  The conditions are taken from the ON expression of the first inner table if
+  this table is an inner table of an outer join, and from join->conds otherwise.
+  They are attached to a copy of 'select' stored in the field cache_select.
+  @note
+  Currently the extracted conditions are used only by BNL and BNLH join
+  algorithms.
+
+  @retval  0   on success (also when there is no condition to extract)
+  @retval  1   if the copy of 'select' could not be allocated
+*/
+
+int JOIN_TAB::make_scan_filter()
+{
+  COND *tmp;
+  DBUG_ENTER("make_scan_filter");
+
+  Item *cond= is_inner_table_of_outer_join() ?
+                *get_first_inner_table()->on_expr_ref : join->conds;
+  
+  if (cond &&
+      (tmp= make_cond_for_table(join->thd, cond,
+                               join->const_table_map | table->map,
+			       table->map, MAX_TABLES, FALSE, TRUE)))
+  {
+     DBUG_EXECUTE("where",print_where(tmp,"cache", QT_ORDINARY););
+     if (!(cache_select=
+          (SQL_SELECT*) join->thd->memdup((uchar*) select, sizeof(SQL_SELECT))))
+	DBUG_RETURN(1);
+     cache_select->cond= tmp;
+     cache_select->read_tables=join->const_table_map;
+  }
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief
+  Check whether hash join algorithm can be used to join this table   
+
+  @details
+  This function finds out whether the ref items that have been chosen
+  by the planner to access this table can be used for hash join algorithms.
+  The answer depends on a certain property of the fields of the
+  joined tables on which the hash join key is built.
+  
+  @note
+  At present the function is supposed to be called only after the function
+  get_best_combination has been called.
+
+  @retval TRUE    it's possible to use hash join to join this table
+  @retval FALSE   otherwise
+*/
+
+bool JOIN_TAB::hash_join_is_possible()
+{
+  if (type != JT_REF && type != JT_EQ_REF)
+    return FALSE;
+  if (!is_ref_for_hash_join())
+  {
+    KEY *keyinfo= table->key_info + ref.key;
+    return keyinfo->key_part[0].field->hash_join_is_possible();
+  }
+  return TRUE;
 }
 
 
@@ -6027,16 +6752,13 @@ cache_record_length(JOIN *join,uint idx)
 {
   uint length=0;
   JOIN_TAB **pos,**end;
-  THD *thd=join->thd;
 
   for (pos=join->best_ref+join->const_tables,end=join->best_ref+idx ;
        pos != end ;
        pos++)
   {
     JOIN_TAB *join_tab= *pos;
-    if (!join_tab->used_fieldlength)		/* Not calced yet */
-      calc_used_field_length(thd, join_tab);
-    length+=join_tab->used_fieldlength;
+    length+= join_tab->get_used_fieldlength();
   }
   return length;
 }
@@ -6093,12 +6815,12 @@ cache_record_length(JOIN *join,uint idx)
     Expected number of row combinations
 */
 
-static double
-prev_record_reads(JOIN *join, uint idx, table_map found_ref)
+double
+prev_record_reads(POSITION *positions, uint idx, table_map found_ref)
 {
   double found=1.0;
-  POSITION *pos_end= join->positions - 1;
-  for (POSITION *pos= join->positions + idx - 1; pos != pos_end; pos--)
+  POSITION *pos_end= positions - 1;
+  for (POSITION *pos= positions + idx - 1; pos != pos_end; pos--)
   {
     if (pos->table->table->map & found_ref)
     {
@@ -6128,6 +6850,210 @@ prev_record_reads(JOIN *join, uint idx, table_map found_ref)
 
 
 /*
+  Enumerate join tabs in breadth-first fashion, including const tables.
+*/
+
+JOIN_TAB *first_breadth_first_tab(JOIN *join)
+{
+  return join->join_tab; /* There's always one (i.e. first) table */
+}
+
+
+JOIN_TAB *next_breadth_first_tab(JOIN *join, JOIN_TAB *tab)
+{
+  if (!tab->bush_root_tab)
+  {
+    /* We're at top level. Get the next top-level tab */
+    tab++;
+    if (tab < join->join_tab + join->top_join_tab_count)
+      return tab;
+
+    /* No more top-level tabs. Switch to enumerating SJM nest children */
+    tab= join->join_tab;
+  }
+  else
+  {
+    /* We're inside of an SJM nest */
+    if (!tab->last_leaf_in_bush)
+    {
+      /* There's one more table in the nest, return it. */
+      return ++tab;
+    }
+    else
+    {
+      /* 
+        There are no more tables in this nest. Get out of it and then we'll
+        proceed to the next nest.
+      */
+      tab= tab->bush_root_tab + 1;
+    }
+  }
+   
+  /* 
+    Ok, "tab" points to a top-level table, and we need to find the next SJM
+    nest and enter it.
+  */
+  for (; tab < join->join_tab + join->top_join_tab_count; tab++)
+  {
+    if (tab->bush_children)
+      return tab->bush_children->start;
+  }
+  return NULL;
+}
+
+
+JOIN_TAB *first_top_level_tab(JOIN *join, enum enum_with_const_tables with_const)
+{
+  JOIN_TAB *tab= join->join_tab;
+  if (with_const == WITHOUT_CONST_TABLES)   /* caller asked to skip const tabs */
+  {
+    if (join->const_tables == join->table_count)
+      return NULL;                            /* all tables are const tables */
+    tab += join->const_tables;
+  }
+  return tab;
+}
+
+
+JOIN_TAB *next_top_level_tab(JOIN *join, JOIN_TAB *tab)
+{
+  tab= next_breadth_first_tab(join, tab);
+  if (tab && tab->bush_root_tab)
+    tab= NULL;
+  return tab;
+}
+
+
+JOIN_TAB *first_linear_tab(JOIN *join, enum enum_with_const_tables const_tbls)
+{
+  JOIN_TAB *first= join->join_tab;
+  if (const_tbls == WITHOUT_CONST_TABLES)
+    first+= join->const_tables;
+  if (first < join->join_tab + join->top_join_tab_count)
+    return first;
+  return NULL; /* All tables were const tables */
+}
+
+
+/*
+  A helper function to loop over all join's join_tab in sequential fashion
+
+  DESCRIPTION
+    Depending on include_bush_roots parameter, JOIN_TABs that represent
+    SJM-scan/lookups are either returned or omitted.
+
+    SJM-Bush children are returned right after (or in place of) their container
+    join tab (TODO: does anybody depend on this? A: make_join_readinfo() seems
+    to)
+
+    For example, if we have this structure:
+      
+       ot1--ot2--sjm1----------------ot3-...
+                  |
+                  +--it1--it2--it3
+
+    calls to next_linear_tab( include_bush_roots=TRUE) will return:
+      
+      ot1 ot2 sjm1 it1 it2 it3 ot3 ...
+   
+   while calls to next_linear_tab( include_bush_roots=FALSE) will return:
+
+      ot1 ot2 it1 it2 it3 ot3 ...
+
+   (note that sjm1 won't be returned).
+*/
+
+JOIN_TAB *next_linear_tab(JOIN* join, JOIN_TAB* tab, 
+                          enum enum_with_bush_roots include_bush_roots)
+{
+  if (include_bush_roots == WITH_BUSH_ROOTS && tab->bush_children)
+  {
+    /* This JOIN_TAB is a SJM nest; Start from first table in nest */
+    return tab->bush_children->start;
+  }
+
+  DBUG_ASSERT(!tab->last_leaf_in_bush || tab->bush_root_tab);
+
+  if (tab->bush_root_tab)       /* Are we inside an SJM nest */
+  {
+    /* Inside SJM nest */
+    if (!tab->last_leaf_in_bush)
+      return tab+1;              /* Return next in nest */
+    /* Continue from the sjm on the top level */
+    tab= tab->bush_root_tab;
+  }
+
+  /* If no more JOIN_TAB's on the top level */
+  if (++tab == join->join_tab + join->top_join_tab_count)
+    return NULL;
+
+  if (include_bush_roots == WITHOUT_BUSH_ROOTS && tab->bush_children)
+  {
+    /* This JOIN_TAB is a SJM nest; Start from first table in nest */
+    tab= tab->bush_children->start;
+  }
+  return tab;
+}
+
+
+/*
+  Start to iterate over all join tables in bush-children-first order, excluding 
+  the const tables (see next_depth_first_tab() comment for details)
+*/
+
+JOIN_TAB *first_depth_first_tab(JOIN* join)
+{
+  JOIN_TAB* tab;
+  /* This means we're starting the enumeration */
+  if (join->const_tables == join->top_join_tab_count)
+    return NULL;
+
+  tab= join->join_tab + join->const_tables;
+
+  return (tab->bush_children) ? tab->bush_children->start : tab;
+}
+
+
+/*
+  A helper function to iterate over all join tables in bush-children-first order
+
+  DESCRIPTION
+   
+  For example, for this join plan
+
+    ot1--ot2--sjm1------------ot3-...
+               |
+               |
+              it1--it2--it3 
+  
+  call to first_depth_first_tab() will return ot1, and subsequent calls to
+  next_depth_first_tab() will return:
+
+     ot2 it1 it2 it3 sjm1 ot3 ...
+*/
+
+JOIN_TAB *next_depth_first_tab(JOIN* join, JOIN_TAB* tab)
+{
+  /* If we're inside SJM nest and have reached its end, get out */
+  if (tab->last_leaf_in_bush)
+    return tab->bush_root_tab;
+  
+  /* Move to next tab in the array we're traversing */
+  tab++;
+  
+  if (tab == join->join_tab +join->top_join_tab_count)
+    return NULL; /* Outside SJM nest and reached EOF */
+
+  if (tab->bush_children)
+    return tab->bush_children->start;
+
+  return tab;
+}
+
+
+static Item * const null_ptr= NULL;
+
+/*
   Set up join struct according to the picked join order in
   
   SYNOPSIS
@@ -6142,6 +7068,11 @@ prev_record_reads(JOIN *join, uint idx, table_map found_ref)
     - create join->join_tab array and put there the JOIN_TABs in the join order
     - create data structures describing ref access methods.
 
+  NOTE
+    In this function we switch from pre-join-optimization JOIN_TABs to
+    post-join-optimization JOIN_TABs. This is achieved by copying the entire
+    JOIN_TAB objects.
+ 
   RETURN 
     FALSE  OK
     TRUE   Out of memory
@@ -6150,7 +7081,7 @@ prev_record_reads(JOIN *join, uint idx, table_map found_ref)
 static bool
 get_best_combination(JOIN *join)
 {
-  uint i,tablenr;
+  uint tablenr;
   table_map used_tables;
   JOIN_TAB *join_tab,*j;
   KEYUSE *keyuse;
@@ -6158,7 +7089,7 @@ get_best_combination(JOIN *join)
   THD *thd=join->thd;
   DBUG_ENTER("get_best_combination");
 
-  table_count=join->tables;
+  table_count=join->table_count;
   if (!(join->join_tab=join_tab=
 	(JOIN_TAB*) thd->alloc(sizeof(JOIN_TAB)*table_count)))
     DBUG_RETURN(TRUE);
@@ -6169,26 +7100,87 @@ get_best_combination(JOIN *join)
 
   fix_semijoin_strategies_for_picked_join_order(join);
   
+  JOIN_TAB_RANGE *root_range;
+  if (!(root_range= new JOIN_TAB_RANGE))
+    DBUG_RETURN(TRUE);
+  root_range->start= join->join_tab;
+  /* root_range->end will be set later */
+  join->join_tab_ranges.empty();
+
+  if (join->join_tab_ranges.push_back(root_range))
+    DBUG_RETURN(TRUE);
+
+  JOIN_TAB *sjm_nest_end= NULL;
+  JOIN_TAB *sjm_nest_root= NULL;
+
   for (j=join_tab, tablenr=0 ; tablenr < table_count ; tablenr++,j++)
   {
     TABLE *form;
+    POSITION *cur_pos= &join->best_positions[tablenr];
+    if (cur_pos->sj_strategy == SJ_OPT_MATERIALIZE || 
+        cur_pos->sj_strategy == SJ_OPT_MATERIALIZE_SCAN)
+    {
+      /*
+        Ok, we've entered an SJ-Materialization semi-join (note that this can't
+        be done recursively, semi-joins are not allowed to be nested).
+        1. Put into main join order a JOIN_TAB that represents a lookup or scan
+           in the temptable.
+      */
+      bzero(j, sizeof(JOIN_TAB));
+      j->join= join;
+      j->table= NULL; //temporary way to tell SJM tables from others.
+      j->ref.key = -1;
+      j->on_expr_ref= (Item**) &null_ptr;
+      j->keys= key_map(1); /* The unique index is always in 'possible keys' in EXPLAIN */
+
+      /*
+        2. Proceed with processing SJM nest's join tabs, putting them into the
+           sub-order
+      */
+      SJ_MATERIALIZATION_INFO *sjm= cur_pos->table->emb_sj_nest->sj_mat_info;
+      j->records= j->records_read= (ha_rows)(sjm->is_sj_scan? sjm->rows : 1);
+      JOIN_TAB *jt;
+      JOIN_TAB_RANGE *jt_range;
+      if (!(jt= (JOIN_TAB*)join->thd->alloc(sizeof(JOIN_TAB)*sjm->tables)) ||
+          !(jt_range= new JOIN_TAB_RANGE) ||
+          join->join_tab_ranges.push_back(jt_range)) /* append can fail too */
+        DBUG_RETURN(TRUE);
+      jt_range->start= jt;
+      jt_range->end= jt + sjm->tables;
+      j->bush_children= jt_range;
+      sjm_nest_end= jt + sjm->tables;
+      sjm_nest_root= j;
+
+      j= jt;
+    }
+    
     *j= *join->best_positions[tablenr].table;
-    form=join->all_tables[tablenr]=j->table;
+
+#if 0
+/* SJ-Materialization is represented with join tab ranges */
+    if (j->sj_strategy == SJ_OPT_MATERIALIZE || 
+        j->sj_strategy == SJ_OPT_MATERIALIZE)
+      j->sj_strategy= SJ_OPT_NONE;  
+#endif
+
+    j->bush_root_tab= sjm_nest_root;
+
+    form=join->table[tablenr]=j->table;
     used_tables|= form->map;
     form->reginfo.join_tab=j;
     if (!*j->on_expr_ref)
       form->reginfo.not_exists_optimize=0;	// Only with LEFT JOIN
     DBUG_PRINT("info",("type: %d", j->type));
     if (j->type == JT_CONST)
-      continue;					// Handled in make_join_stat..
+      goto loop_end;					// Handled in make_join_stat..
 
     j->loosescan_match_tab= NULL;  //non-nulls will be set later
     j->ref.key = -1;
     j->ref.key_parts=0;
 
     if (j->type == JT_SYSTEM)
-      continue;
-    if (j->keys.is_clear_all() || !(keyuse= join->best_positions[tablenr].key) || 
+      goto loop_end;
+    if ( !(keyuse= join->best_positions[tablenr].key) || 
         (join->best_positions[tablenr].sj_strategy == SJ_OPT_LOOSE_SCAN))
     {
       j->type=JT_ALL;
@@ -6198,30 +7190,160 @@ get_best_combination(JOIN *join)
     }
     else if (create_ref_for_key(join, j, keyuse, used_tables))
       DBUG_RETURN(TRUE);                        // Something went wrong
+  loop_end:
+    /* 
+      Save records_read in JOIN_TAB so that select_describe()/etc don't have
+      to access join->best_positions[]. 
+    */
+    j->records_read= (ha_rows)join->best_positions[tablenr].records_read;
+    join->map2table[j->table->tablenr]= j;
+
+    /* If we've reached the end of sjm nest, switch back to main sequence */
+    if (j + 1 == sjm_nest_end)
+    {
+      j->last_leaf_in_bush= TRUE;
+      j= sjm_nest_root;
+      sjm_nest_root= NULL;
+      sjm_nest_end= NULL;
+    }
   }
+  root_range->end= j;
 
-  for (i=0 ; i < table_count ; i++)
-    join->map2table[join->join_tab[i].table->tablenr]=join->join_tab+i;
+  join->top_join_tab_count= join->join_tab_ranges.head()->end - 
+                            join->join_tab_ranges.head()->start;
   update_depend_map(join);
   DBUG_RETURN(0);
 }
 
+/**
+  Create a descriptor of hash join key to access a given join table  
 
-static bool create_ref_for_key(JOIN *join, JOIN_TAB *j, KEYUSE *org_keyuse,
-			       table_map used_tables)
+  @param   join         join which the join table belongs to
+  @param   join_tab     the join table to access
+  @param   org_keyuse   beginning of the key uses to join this table
+  @param   used_tables  bitmap of the previous tables
+
+  @details
+  This function first finds key uses that can be utilized by the hash join
+  algorithm to join join_tab to the previous tables marked in the bitmap 
+  used_tables.  The tested key uses are taken from the array of all key uses
+  for 'join' starting from the position org_keyuse. After all interesting key
+  uses have been found the function builds a descriptor of the corresponding
+  key that is used by the hash join algorithm would it be chosen to join
+  the table join_tab.
+
+  @retval  FALSE  the descriptor for a hash join key is successfully created
+  @retval  TRUE   otherwise
+*/
+
+static bool create_hj_key_for_table(JOIN *join, JOIN_TAB *join_tab,
+                                    KEYUSE *org_keyuse, table_map used_tables)
 {
-  KEYUSE *keyuse=org_keyuse;
-  bool ftkey=(keyuse->keypart == FT_KEYPART);
+  KEY *keyinfo;
+  KEY_PART_INFO *key_part_info;
+  KEYUSE *keyuse= org_keyuse;
+  uint key_parts= 0;
   THD  *thd= join->thd;
-  uint keyparts,length,key;
+  TABLE *table= join_tab->table;
+  bool first_keyuse= TRUE;
+  DBUG_ENTER("create_hj_key_for_table");
+  /* Pass 1: count the key parts; must select the same keyuses as pass 2 */
+  do
+  {
+    if (!(~used_tables & keyuse->used_tables) &&
+        (first_keyuse || keyuse->keypart != (keyuse-1)->keypart))
+    {
+      key_parts++;
+      first_keyuse= FALSE;      /* cleared only once a usable keyuse is found */
+    }
+    keyuse++;
+  } while (keyuse->table == table && keyuse->is_for_hash_join());
+  if (!key_parts)
+    DBUG_RETURN(TRUE);
+  /* This memory is allocated only once for the joined table join_tab */
+  if (!(keyinfo= (KEY *) thd->alloc(sizeof(KEY))) ||
+      !(key_part_info = (KEY_PART_INFO *) thd->alloc(sizeof(KEY_PART_INFO)*
+                                                     key_parts)))
+    DBUG_RETURN(TRUE);
+  keyinfo->usable_key_parts= keyinfo->key_parts = key_parts;
+  keyinfo->key_part= key_part_info;
+  keyinfo->key_length=0;
+  keyinfo->algorithm= HA_KEY_ALG_UNDEF;
+  keyinfo->flags= HA_GENERATED_KEY;
+  keyinfo->name= (char *) "$hj";
+  keyinfo->rec_per_key= (ulong*) thd->calloc(sizeof(ulong)*key_parts);
+  if (!keyinfo->rec_per_key)
+    DBUG_RETURN(TRUE);
+
+  /* Pass 2: fill in exactly one KEY_PART_INFO per key part counted in pass 1 */
+  first_keyuse= TRUE;
+  keyuse= org_keyuse;
+  do
+  {
+    if (!(~used_tables & keyuse->used_tables) &&
+        (first_keyuse || keyuse->keypart != (keyuse-1)->keypart))
+    {
+      Field *field= table->field[keyuse->keypart];
+      uint fieldnr= keyuse->keypart+1;
+      table->create_key_part_by_field(keyinfo, key_part_info, field, fieldnr);
+      first_keyuse= FALSE;
+      key_part_info++;
+    }
+    keyuse++;
+  } while (keyuse->table == table && keyuse->is_for_hash_join());
+  join_tab->hj_key= keyinfo;
+  DBUG_RETURN(FALSE);
+}
+
+/* 
+  Check if a set of tables specified by used_tables can be accessed when
+  we're doing scan on join_tab jtab.
+*/
+static bool are_tables_local(JOIN_TAB *jtab, table_map used_tables)
+{
+  if (jtab->bush_root_tab)
+  {
+    /*
+      jtab is inside execution join nest. We may not refer to outside tables,
+      except the const tables.
+    */
+    table_map local_tables= jtab->emb_sj_nest->nested_join->used_tables |
+                            jtab->join->const_table_map;
+    return !test(used_tables & ~local_tables);
+  }
+
+  /* 
+    If we got here then jtab is at top level. 
+     - all other tables at top level are accessible,
+     - tables in join nests are accessible too, because all their columns that 
+       are needed at top level will be unpacked when scanning the
+       materialization table.
+  */
+  return TRUE;
+}
+
+static bool create_ref_for_key(JOIN *join, JOIN_TAB *j,
+                               KEYUSE *org_keyuse, table_map used_tables)
+{
+  uint keyparts, length, key;
   TABLE *table;
   KEY *keyinfo;
+  KEYUSE *keyuse= org_keyuse;
+  bool ftkey= (keyuse->keypart == FT_KEYPART);
+  THD *thd= join->thd;
   DBUG_ENTER("create_ref_for_key");
 
   /*  Use best key from find_best */
-  table=j->table;
-  key=keyuse->key;
-  keyinfo=table->key_info+key;
+  table= j->table;
+  key= keyuse->key;
+  if (!is_hash_join_key_no(key))
+    keyinfo= table->key_info+key;
+  else
+  {
+    if (create_hj_key_for_table(join, j, org_keyuse, used_tables))
+      DBUG_RETURN(TRUE);
+    keyinfo= j->hj_key;
+  }
 
   if (ftkey)
   {
@@ -6244,27 +7366,31 @@ static bool create_ref_for_key(JOIN *join, JOIN_TAB *j, KEYUSE *org_keyuse,
     {
       if (!(~used_tables & keyuse->used_tables))
       {
-	if (keyparts == keyuse->keypart &&
-	    !(found_part_ref_or_null & keyuse->optimize))
-	{
-	  keyparts++;
-	  length+= keyinfo->key_part[keyuse->keypart].store_length;
-	  found_part_ref_or_null|= keyuse->optimize;
-	}
+        if  (are_tables_local(j, keyuse->val->used_tables()))
+        {
+          if ((is_hash_join_key_no(key) && 
+              (keyparts == 0 || keyuse->keypart != (keyuse-1)->keypart)) ||
+              (!is_hash_join_key_no(key) && keyparts == keyuse->keypart &&
+               !(found_part_ref_or_null & keyuse->optimize)))
+          {
+             length+= keyinfo->key_part[keyparts].store_length;
+             keyparts++;
+             found_part_ref_or_null|= keyuse->optimize & ~KEY_OPTIMIZE_EQ;
+          }
+        }
       }
       keyuse++;
     } while (keyuse->table == table && keyuse->key == key);
   } /* not ftkey */
 
   /* set up fieldref */
-  keyinfo=table->key_info+key;
-  j->ref.key_parts=keyparts;
-  j->ref.key_length=length;
-  j->ref.key=(int) key;
+  j->ref.key_parts= keyparts;
+  j->ref.key_length= length;
+  j->ref.key= (int) key;
   if (!(j->ref.key_buff= (uchar*) thd->calloc(ALIGN_SIZE(length)*2)) ||
       !(j->ref.key_copy= (store_key**) thd->alloc((sizeof(store_key*) *
-						   (keyparts+1)))) ||
-      !(j->ref.items=    (Item**) thd->alloc(sizeof(Item*)*keyparts)) ||
+						          (keyparts+1)))) ||
+      !(j->ref.items=(Item**) thd->alloc(sizeof(Item*)*keyparts)) ||
       !(j->ref.cond_guards= (bool**) thd->alloc(sizeof(uint*)*keyparts)))
   {
     DBUG_RETURN(TRUE);
@@ -6274,10 +7400,12 @@ static bool create_ref_for_key(JOIN *join, JOIN_TAB *j, KEYUSE *org_keyuse,
   j->ref.has_record= FALSE;
   j->ref.null_rejecting= 0;
   j->ref.disable_cache= FALSE;
+  j->ref.null_ref_part= NO_REF_PART;
   keyuse=org_keyuse;
 
   store_key **ref_key= j->ref.key_copy;
   uchar *key_buff=j->ref.key_buff, *null_ref_key= 0;
+  uint null_ref_part= NO_REF_PART;
   bool keyuse_uses_no_tables= TRUE;
   if (ftkey)
   {
@@ -6294,9 +7422,11 @@ static bool create_ref_for_key(JOIN *join, JOIN_TAB *j, KEYUSE *org_keyuse,
     uint i;
     for (i=0 ; i < keyparts ; keyuse++,i++)
     {
-      while (keyuse->keypart != i ||
-	     ((~used_tables) & keyuse->used_tables))
-	keyuse++;				/* Skip other parts */
+      while (((~used_tables) & keyuse->used_tables) || 
+	     (keyuse->keypart != 
+              (is_hash_join_key_no(key) ?
+                 keyinfo->key_part[i].field->field_index : i))) 
+	 keyuse++;                              	/* Skip other parts */ 
 
       uint maybe_null= test(keyinfo->key_part[i].null_bit);
       j->ref.items[i]=keyuse->val;		// Save for cond removal
@@ -6304,13 +7434,14 @@ static bool create_ref_for_key(JOIN *join, JOIN_TAB *j, KEYUSE *org_keyuse,
       if (keyuse->null_rejecting) 
         j->ref.null_rejecting |= 1 << i;
       keyuse_uses_no_tables= keyuse_uses_no_tables && !keyuse->used_tables;
-      if (!keyuse->used_tables &&
-	  !(join->select_options & SELECT_DESCRIBE))
+      if (!keyuse->used_tables && !thd->lex->describe)
       {					// Compare against constant
-	store_key_item tmp(thd, keyinfo->key_part[i].field,
+	store_key_item tmp(thd, 
+                           keyinfo->key_part[i].field,
                            key_buff + maybe_null,
                            maybe_null ?  key_buff : 0,
-                           keyinfo->key_part[i].length, keyuse->val,
+                           keyinfo->key_part[i].length,
+                           keyuse->val,
                            FALSE);
 	if (thd->is_fatal_error)
 	  DBUG_RETURN(TRUE);
@@ -6327,8 +7458,11 @@ static bool create_ref_for_key(JOIN *join, JOIN_TAB *j, KEYUSE *org_keyuse,
 	instead of JT_REF_OR_NULL in case if field can't be null
       */
       if ((keyuse->optimize & KEY_OPTIMIZE_REF_OR_NULL) && maybe_null)
+      {
 	null_ref_key= key_buff;
-      key_buff+=keyinfo->key_part[i].store_length;
+        null_ref_part= i;
+      }
+      key_buff+= keyinfo->key_part[i].store_length;
     }
   } /* not ftkey */
   *ref_key=0;				// end_marker
@@ -6342,6 +7476,7 @@ static bool create_ref_for_key(JOIN *join, JOIN_TAB *j, KEYUSE *org_keyuse,
     /* Must read with repeat */
     j->type= null_ref_key ? JT_REF_OR_NULL : JT_REF;
     j->ref.null_ref_key= null_ref_key;
+    j->ref.null_ref_part= null_ref_part;
   }
   else if (keyuse_uses_no_tables)
   {
@@ -6376,9 +7511,10 @@ get_store_key(THD *thd, KEYUSE *keyuse, table_map used_tables,
   }
   else if (keyuse->val->type() == Item::FIELD_ITEM ||
            (keyuse->val->type() == Item::REF_ITEM &&
-            ((Item_ref*)keyuse->val)->ref_type() == Item_ref::OUTER_REF &&
-            (*(Item_ref**)((Item_ref*)keyuse->val)->ref)->ref_type() ==
-             Item_ref::DIRECT_REF && 
+	    ((((Item_ref*)keyuse->val)->ref_type() == Item_ref::OUTER_REF &&
+              (*(Item_ref**)((Item_ref*)keyuse->val)->ref)->ref_type() ==
+              Item_ref::DIRECT_REF) || 
+             ((Item_ref*)keyuse->val)->ref_type() == Item_ref::VIEW_REF) &&
             keyuse->val->real_item()->type() == Item::FIELD_ITEM))
     return new store_key_field(thd,
 			       key_part->field,
@@ -6386,7 +7522,7 @@ get_store_key(THD *thd, KEYUSE *keyuse, table_map used_tables,
 			       maybe_null ? key_buff : 0,
 			       key_part->length,
 			       ((Item_field*) keyuse->val->real_item())->field,
-			       keyuse->val->full_name());
+			       keyuse->val->real_item()->full_name());
   return new store_key_item(thd,
 			    key_part->field,
 			    key_buff + maybe_null,
@@ -6450,8 +7586,9 @@ JOIN::make_simple_join(JOIN *parent, TABLE *temp_table)
     DBUG_RETURN(TRUE);                        /* purecov: inspected */
 
   join_tab= parent->join_tab_reexec;
-  parent->table_reexec[0]= temp_table;
-  tables= 1;
+  table= &parent->table_reexec[0]; parent->table_reexec[0]= temp_table;
+  table_count= top_join_tab_count= 1;
+
   const_tables= 0;
   const_table_map= 0;
   eliminated_tables= 0;
@@ -6475,8 +7612,8 @@ JOIN::make_simple_join(JOIN *parent, TABLE *temp_table)
   join_tab->table=temp_table;
   join_tab->cache_select= 0;
   join_tab->select=0;
+  join_tab->select_cond= 0;                     // Avoid valgrind warning
   join_tab->set_select_cond(NULL, __LINE__);
-  join_tab->select_cond=0;
   join_tab->quick=0;
   join_tab->type= JT_ALL;			/* Map through all records */
   join_tab->keys.init();
@@ -6488,6 +7625,7 @@ JOIN::make_simple_join(JOIN *parent, TABLE *temp_table)
   join_tab->ref.key = -1;
   join_tab->not_used_in_distinct=0;
   join_tab->read_first_record= join_init_read_record;
+  join_tab->preread_init_done= FALSE;
   join_tab->join= this;
   join_tab->ref.key_parts= 0;
   join_tab->keep_current_rowid= FALSE;
@@ -6495,6 +7633,10 @@ JOIN::make_simple_join(JOIN *parent, TABLE *temp_table)
   join_tab->do_firstmatch= NULL;
   join_tab->loosescan_match_tab= NULL;
   join_tab->emb_sj_nest= NULL;
+  join_tab->pre_idx_push_select_cond= NULL;
+  join_tab->bush_root_tab= NULL;
+  join_tab->bush_children= NULL;
+  join_tab->last_leaf_in_bush= FALSE;
   bzero((char*) &join_tab->read_record,sizeof(join_tab->read_record));
   temp_table->status=0;
   temp_table->null_row=0;
@@ -6502,15 +7644,17 @@ JOIN::make_simple_join(JOIN *parent, TABLE *temp_table)
 }
 
 
-inline void add_cond_and_fix(Item **e1, Item *e2)
+inline void add_cond_and_fix(THD *thd, Item **e1, Item *e2)
 {
   if (*e1)
   {
+    if (!e2)
+      return;
     Item *res;
     if ((res= new Item_cond_and(*e1, e2)))
     {
       *e1= res;
-      res->quick_fix_field();
+      res->fix_fields(thd, 0);
       res->update_used_tables();
     }
   }
@@ -6573,12 +7717,13 @@ inline void add_cond_and_fix(Item **e1, Item *e2)
 static void add_not_null_conds(JOIN *join)
 {
   DBUG_ENTER("add_not_null_conds");
-  for (uint i=join->const_tables ; i < join->tables ; i++)
+  
+  for (JOIN_TAB *tab= first_linear_tab(join, WITHOUT_CONST_TABLES); 
+       tab; 
+       tab= next_linear_tab(join, tab, WITH_BUSH_ROOTS))
   {
-    JOIN_TAB *tab=join->join_tab+i;
-    if ((tab->type == JT_REF || tab->type == JT_EQ_REF || 
-         tab->type == JT_REF_OR_NULL) &&
-        !tab->table->maybe_null)
+    if (tab->type == JT_REF || tab->type == JT_EQ_REF || 
+        tab->type == JT_REF_OR_NULL)
     {
       for (uint keypart= 0; keypart < tab->ref.key_parts; keypart++)
       {
@@ -6595,7 +7740,7 @@ static void add_not_null_conds(JOIN *join)
             UPDATE t1 SET t1.f2=(SELECT MAX(t2.f4) FROM t2 WHERE t2.f3=t1.f1);
             not_null_item is the t1.f1, but it's referred_tab is 0.
           */
-          if (!referred_tab || referred_tab->join != join)
+          if (!referred_tab)
             continue;
           if (!(notnull= new Item_func_isnotnull(not_null_item)))
             DBUG_VOID_RETURN;
@@ -6608,11 +7753,21 @@ static void add_not_null_conds(JOIN *join)
           if (notnull->fix_fields(join->thd, &notnull))
             DBUG_VOID_RETURN;
           DBUG_EXECUTE("where",print_where(notnull,
-                                           referred_tab->table->alias,
+                                           referred_tab->table->alias.c_ptr(),
                                            QT_ORDINARY););
-          COND *new_cond= referred_tab->select_cond;
-          add_cond_and_fix(&new_cond, notnull);
-          referred_tab->set_select_cond(new_cond, __LINE__);
+          if (!tab->first_inner)
+	  {
+            COND *new_cond= referred_tab->join == join ? 
+                              referred_tab->select_cond :
+                              join->outer_ref_cond;
+            add_cond_and_fix(join->thd, &new_cond, notnull);
+            if (referred_tab->join == join)
+              referred_tab->set_select_cond(new_cond, __LINE__);
+            else 
+              join->outer_ref_cond= new_cond;
+          }
+          else
+            add_cond_and_fix(join->thd, tab->first_inner->on_expr_ref, notnull);
         }
       }
     }
@@ -6627,6 +7782,12 @@ static void add_not_null_conds(JOIN *join)
   nested outer join and so on until it reaches root_tab
   (root_tab can be 0).
 
+  In other words:
+  add_found_match_trig_cond(tab->first_inner_tab, y, 0) is the way one should 
+  wrap parts of WHERE.  The idea is that the part of WHERE should be only
+  evaluated after we've finished figuring out whether the outer joins have
+  found a match.  (TODO: verify that the above description is correct.)
+
   @param tab       the first inner table for most nested outer join
   @param cond      the predicate to be guarded (must be set)
   @param root_tab  the first inner table to stop
@@ -6654,6 +7815,12 @@ add_found_match_trig_cond(JOIN_TAB *tab, COND *cond, JOIN_TAB *root_tab)
 }
 
 
+bool TABLE_LIST::is_active_sjm()
+{ 
+  return sj_mat_info && sj_mat_info->is_used;
+}
+
+
 /**
   Fill in outer join related info for the execution plan structure.
 
@@ -6671,6 +7838,12 @@ add_found_match_trig_cond(JOIN_TAB *tab, COND *cond, JOIN_TAB *root_tab)
     corresponding first inner table through the field t0->on_expr_ref.
     Here ti are structures of the JOIN_TAB type.
 
+    In other words, for each join tab, set
+     - first_inner
+     - last_inner
+     - first_upper
+     - on_expr_ref, cond_equal
+
   EXAMPLE. For the query: 
   @code
         SELECT * FROM t1
@@ -6695,14 +7868,34 @@ add_found_match_trig_cond(JOIN_TAB *tab, COND *cond, JOIN_TAB *root_tab)
     This function can be called only after the execution plan
     has been chosen.
 */
-static void
+
+static bool
 make_outerjoin_info(JOIN *join)
 {
   DBUG_ENTER("make_outerjoin_info");
-  for (uint i=join->const_tables ; i < join->tables ; i++)
+  
+  /*
+    Create temp. tables for merged SJ-Materialization nests. We need to do
+    this now, because further code relies on tab->table and
+    tab->table->pos_in_table_list being set.
+  */
+  JOIN_TAB *tab;
+  for (tab= first_linear_tab(join, WITHOUT_CONST_TABLES); 
+       tab; 
+       tab= next_linear_tab(join, tab, WITH_BUSH_ROOTS))
   {
-    JOIN_TAB *tab=join->join_tab+i;
-    TABLE *table=tab->table;
+    if (tab->bush_children)
+    {
+      if (setup_sj_materialization_part1(tab))
+        DBUG_RETURN(TRUE);
+      tab->table->reginfo.join_tab= tab;
+    }
+  }
+
+  for (JOIN_TAB *tab= first_linear_tab(join, WITHOUT_CONST_TABLES); tab; 
+       tab= next_linear_tab(join, tab, WITH_BUSH_ROOTS))
+  {
+    TABLE *table= tab->table;
     TABLE_LIST *tbl= table->pos_in_table_list;
     TABLE_LIST *embedding= tbl->embedding;
 
@@ -6716,13 +7909,18 @@ make_outerjoin_info(JOIN *join)
       tab->last_inner= tab->first_inner= tab;
       tab->on_expr_ref= &tbl->on_expr;
       tab->cond_equal= tbl->cond_equal;
-      if (embedding)
+      if (embedding && !embedding->is_active_sjm())
         tab->first_upper= embedding->nested_join->first_nested;
     }    
     for ( ; embedding ; embedding= embedding->embedding)
     {
+      if (embedding->is_active_sjm())
+      {
+        /* We're trying to walk out of an SJ-Materialization nest. Don't do this.  */
+        break;
+      }
       /* Ignore sj-nests: */
-      if (!embedding->on_expr)
+      if (!(embedding->on_expr && embedding->outer_join))
         continue;
       NESTED_JOIN *nested_join= embedding->nested_join;
       if (!nested_join->counter)
@@ -6752,7 +7950,7 @@ make_outerjoin_info(JOIN *join)
       } 
     }
   }
-  DBUG_VOID_RETURN;
+  DBUG_RETURN(FALSE);
 }
 
 
@@ -6773,9 +7971,9 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
     */
     if (cond)                /* Because of QUICK_GROUP_MIN_MAX_SELECT */
     {                        /* there may be a select without a cond. */    
-      if (join->tables > 1)
+      if (join->table_count > 1)
         cond->update_used_tables();		// Tablenr may have changed
-      if (join->const_tables == join->tables &&
+      if (join->const_tables == join->table_count &&
 	  thd->lex->current_select->master_unit() ==
 	  &thd->lex->unit)		// not upper level SELECT
         join->const_table_map|=RAND_TABLE_BIT;
@@ -6788,39 +7986,33 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
            there inside the triggers.
       */
       {						// Check const tables
-        COND *const_cond=
-	  make_cond_for_table(cond,
+        join->exec_const_cond=
+	  make_cond_for_table(thd, cond,
                               join->const_table_map,
-                              (table_map) 0, TRUE);
-        DBUG_EXECUTE("where",print_where(const_cond,"constants", QT_ORDINARY););
-        for (JOIN_TAB *tab= join->join_tab+join->const_tables;
-             tab < join->join_tab+join->tables ; tab++)
+                              (table_map) 0, MAX_TABLES, FALSE, FALSE);
+        /* Add conditions added by add_not_null_conds(). */
+        for (uint i= 0 ; i < join->const_tables ; i++)
+          add_cond_and_fix(thd, &join->exec_const_cond,
+                           join->join_tab[i].select_cond);
+
+        DBUG_EXECUTE("where",print_where(join->exec_const_cond,"constants",
+					 QT_ORDINARY););
+        if (join->exec_const_cond && !join->exec_const_cond->is_expensive() &&
+            !join->exec_const_cond->val_int())
         {
-          if (*tab->on_expr_ref)
-          {
-            JOIN_TAB *cond_tab= tab->first_inner;
-            COND *tmp= make_cond_for_table(*tab->on_expr_ref,
-                                           join->const_table_map,
-                                         (  table_map) 0, FALSE);
-            if (!tmp)
-              continue;
-            tmp= new Item_func_trig_cond(tmp, &cond_tab->not_null_compl);
-            if (!tmp)
-              DBUG_RETURN(1);
-            tmp->quick_fix_field();
-            COND *new_cond= !cond_tab->select_cond ? tmp :
-              new Item_cond_and(cond_tab->select_cond, tmp);
-            cond_tab->set_select_cond(new_cond, __LINE__);
-            if (!cond_tab->select_cond)
-	      DBUG_RETURN(1);
-            cond_tab->select_cond->update_used_tables();
-            cond_tab->select_cond->quick_fix_field();
-          }       
+          DBUG_PRINT("info",("Found impossible WHERE condition"));
+          join->exec_const_cond= NULL;
+          DBUG_RETURN(1);	 // Impossible const condition
         }
-        if (const_cond && !const_cond->val_int())
-        {
-	  DBUG_PRINT("info",("Found impossible WHERE condition"));
-	  DBUG_RETURN(1);	 // Impossible const condition
+
+        COND *outer_ref_cond= make_cond_for_table(thd, cond, 
+                                                  OUTER_REF_TABLE_BIT,
+                                                  OUTER_REF_TABLE_BIT,
+                                                  MAX_TABLES, FALSE, FALSE);
+        if (outer_ref_cond)
+	{
+          add_cond_and_fix(thd, &outer_ref_cond, join->outer_ref_cond);
+          join->outer_ref_cond= outer_ref_cond;
         }
       }
     }
@@ -6833,15 +8025,22 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
 		 OUTER_REF_TABLE_BIT | RAND_TABLE_BIT);
     JOIN_TAB *tab;
     table_map current_map;
-    for (uint i=join->const_tables ; i < join->tables ; i++)
+    uint i= join->const_tables;
+    for (tab= first_depth_first_tab(join); tab;
+         tab= next_depth_first_tab(join, tab), i++)
     {
-      tab= join->join_tab+i;
+      bool is_hj;
       /*
         first_inner is the X in queries like:
         SELECT * FROM t1 LEFT OUTER JOIN (t2 JOIN t3) ON X
       */
-      JOIN_TAB *first_inner_tab= tab->first_inner; 
-      current_map= tab->table->map;
+      JOIN_TAB *first_inner_tab= tab->first_inner;
+
+      if (tab->table)
+        current_map= tab->table->map;
+      else
+        current_map= tab->bush_children->start->emb_sj_nest->sj_inner_tables;
+
       bool use_quick_range=0;
       COND *tmp;
 
@@ -6864,13 +8063,14 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
 	Following force including random expression in last table condition.
 	It solve problem with select like SELECT * FROM t1 WHERE rand() > 0.5
       */
-      if (i == join->tables-1)
+      if (tab == join->join_tab + join->top_join_tab_count - 1)
 	current_map|= OUTER_REF_TABLE_BIT | RAND_TABLE_BIT;
       used_tables|=current_map;
 
       if (tab->type == JT_REF && tab->quick &&
-	  (uint) tab->ref.key == tab->quick->index &&
-	  tab->ref.key_length < tab->quick->max_used_key_length)
+	  (((uint) tab->ref.key == tab->quick->index &&
+	    tab->ref.key_length < tab->quick->max_used_key_length) ||
+	    tab->table->intersect_keys.is_set(tab->ref.key)))
       {
 	/* Range uses longer key;  Use this instead of ref on key */
 	tab->type=JT_ALL;
@@ -6883,16 +8083,44 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
           We will use join cache here : prevent sorting of the first
           table only and sort at the end.
         */
-        if (i != join->const_tables && join->tables > join->const_tables + 1)
+        if (i != join->const_tables && join->table_count > join->const_tables + 1)
           join->full_join= 1;
       }
 
       tmp= NULL;
+
       if (cond)
-        tmp= make_cond_for_table(cond, used_tables, current_map, FALSE);
+      {
+        if (tab->bush_children)
+        {
+          // Reached the materialization tab
+          tmp= make_cond_after_sjm(cond, cond, save_used_tables, used_tables);
+          used_tables= save_used_tables | used_tables;
+          save_used_tables= 0;
+        }
+        else
+         {
+	  tmp= make_cond_for_table(thd, cond, used_tables, current_map, i,
+                                   FALSE, FALSE);
+         }
+        /* Add conditions added by add_not_null_conds(). */
+        if (tab->select_cond)
+          add_cond_and_fix(thd, &tmp, tab->select_cond);
+      }
+
+      is_hj= (tab->type == JT_REF || tab->type == JT_EQ_REF) &&
+             (join->allowed_join_cache_types & JOIN_CACHE_HASHED_BIT) &&
+	     ((join->max_allowed_join_cache_level+1)/2 == 2 ||
+              ((join->max_allowed_join_cache_level+1)/2 > 2 &&
+	       is_hash_join_key_no(tab->ref.key))) &&
+              (!tab->emb_sj_nest ||                     
+               join->allowed_semijoin_with_cache) && 
+              (!(tab->table->map & join->outer_join) ||
+               join->allowed_outer_join_with_cache);
+
       if (cond && !tmp && tab->quick)
       {						// Outer join
-        if (tab->type != JT_ALL)
+        if (tab->type != JT_ALL && !is_hj)
         {
           /*
             Don't use the quick method
@@ -6917,7 +8145,9 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
       if (tmp || !cond || tab->type == JT_REF || tab->type == JT_REF_OR_NULL ||
           tab->type == JT_EQ_REF || first_inner_tab)
       {
-        DBUG_EXECUTE("where",print_where(tmp,tab->table->alias, QT_ORDINARY););
+        DBUG_EXECUTE("where",print_where(tmp, 
+                                         tab->table? tab->table->alias.c_ptr() :"sjm-nest",
+                                         QT_ORDINARY););
 	SQL_SELECT *sel= tab->select= ((SQL_SELECT*)
                                        thd->memdup((uchar*) select,
                                                    sizeof(*select)));
@@ -6941,17 +8171,22 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
           tab->set_select_cond(tmp, __LINE__);
           /* Push condition to storage engine if this is enabled
              and the condition is not guarded */
-	  if (tab->table && (thd->variables.optimizer_switch &
-                             OPTIMIZER_SWITCH_ENGINE_CONDITION_PUSHDOWN) &&
-              !first_inner_tab)
+          if (tab->table)
           {
-            COND *push_cond= 
-              make_cond_for_table(tmp, current_map, current_map, FALSE);
-            if (push_cond)
+            tab->table->file->pushed_cond= NULL;
+            if ((thd->variables.optimizer_switch &
+                               OPTIMIZER_SWITCH_ENGINE_CONDITION_PUSHDOWN) &&
+                !first_inner_tab)
             {
-              /* Push condition to handler */
-              if (!tab->table->file->cond_push(push_cond))
-                tab->table->file->pushed_cond= push_cond;
+              COND *push_cond= 
+              make_cond_for_table(thd, tmp, current_map, current_map,
+                                  MAX_TABLES, FALSE, FALSE);
+              if (push_cond)
+              {
+                /* Push condition to handler */
+                if (!tab->table->file->cond_push(push_cond))
+                  tab->table->file->pushed_cond= push_cond;
+              }
             }
           }
         }
@@ -6962,15 +8197,19 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
         }
 
 	sel->head=tab->table;
-        DBUG_EXECUTE("where",print_where(tmp,tab->table->alias, QT_ORDINARY););
+        DBUG_EXECUTE("where",
+                     print_where(tmp, 
+                                 tab->table ? tab->table->alias.c_ptr() :
+                                   "(sjm-nest)",
+                                 QT_ORDINARY););
 	if (tab->quick)
 	{
 	  /* Use quick key read if it's a constant and it's not used
 	     with key reading */
-          if (tab->needed_reg.is_clear_all() && tab->type != JT_EQ_REF &&
+          if ((tab->needed_reg.is_clear_all() && tab->type != JT_EQ_REF &&
               tab->type != JT_FT &&
               ((tab->type != JT_CONST && tab->type != JT_REF) ||
-               (uint)tab->ref.key == tab->quick->index))
+               (uint) tab->ref.key == tab->quick->index)) || is_hj)
           {
             DBUG_ASSERT(tab->quick->is_valid());
 	    sel->quick=tab->quick;		// Use value from get_quick_...
@@ -6983,7 +8222,7 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
 	  }
 	  tab->quick=0;
 	}
-	uint ref_key=(uint) sel->head->reginfo.join_tab->ref.key+1;
+	uint ref_key= sel->head? (uint) sel->head->reginfo.join_tab->ref.key+1 : 0;
 	if (i == join->const_tables && ref_key)
 	{
 	  if (!tab->const_keys.is_clear_all() &&
@@ -7002,12 +8241,12 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
 	    the index if we are using limit and this is the first table
 	  */
 
-	  if ((cond &&
-               (!tab->keys.is_subset(tab->const_keys) && i > 0)) ||
-	      (!tab->const_keys.is_clear_all() && i == join->const_tables &&
-	       join->unit->select_limit_cnt <
-	       join->best_positions[i].records_read &&
-	       !(join->select_options & OPTION_FOUND_ROWS)))
+	  if (!tab->table->is_filled_at_execution() &&
+              ((cond && (!tab->keys.is_subset(tab->const_keys) && i > 0)) ||
+               (!tab->const_keys.is_clear_all() && i == join->const_tables &&
+                join->unit->select_limit_cnt <
+                join->best_positions[i].records_read &&
+                !(join->select_options & OPTION_FOUND_ROWS))))
 	  {
 	    /* Join with outer join condition */
 	    COND *orig_cond=sel->cond;
@@ -7024,7 +8263,8 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
 	      sel->cond->quick_fix_field();
 
 	    if (sel->test_quick_select(thd, tab->keys,
-				       used_tables & ~ current_map,
+				       ((used_tables & ~ current_map) |
+                                        OUTER_REF_TABLE_BIT),
 				       (join->select_options &
 					OPTION_FOUND_ROWS ?
 					HA_POS_ERROR :
@@ -7073,24 +8313,14 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
 	  if (i != join->const_tables && tab->use_quick != 2 &&
               !tab->first_inner)
 	  {					/* Read with cache */
-	    if (cond &&
-                (tmp=make_cond_for_table(cond,
-					 join->const_table_map |
-					 current_map,
-					 current_map, FALSE)))
-	    {
-              DBUG_EXECUTE("where",print_where(tmp,"cache", QT_ORDINARY););
-	      tab->cache_select=(SQL_SELECT*)
-		thd->memdup((uchar*) sel, sizeof(SQL_SELECT));
-	      tab->cache_select->cond=tmp;
-	      tab->cache_select->read_tables=join->const_table_map;
-	    }
-	  }
+            if (tab->make_scan_filter())
+              DBUG_RETURN(1);
+          }
 	}
       }
       
       /* 
-        Push down conditions from all on expressions.
+        Push down conditions from all ON expressions.
         Each of these conditions are guarded by a variable
         that turns if off just before null complemented row for
         outer joins is formed. Thus, the condition from an
@@ -7098,16 +8328,32 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
         the null complemented row.
       */ 
 
-      /* First push down constant conditions from on expressions */
-      for (JOIN_TAB *join_tab= join->join_tab+join->const_tables;
-           join_tab < join->join_tab+join->tables ; join_tab++)
+      /* 
+        First push down constant conditions from ON expressions. 
+         - Each pushed-down condition is wrapped into trigger which is 
+           enabled only for non-NULL-complemented record
+         - The condition is attached to the first_inner_table.
+        
+        With regards to join nests:
+         - if we start at top level, don't walk into nests
+         - if we start inside a nest, stay within that nest.
+      */
+      JOIN_TAB *start_from= tab->bush_root_tab? 
+                               tab->bush_root_tab->bush_children->start : 
+                               join->join_tab + join->const_tables;
+      JOIN_TAB *end_with= tab->bush_root_tab? 
+                               tab->bush_root_tab->bush_children->end : 
+                               join->join_tab + join->top_join_tab_count;
+      for (JOIN_TAB *join_tab= start_from;
+           join_tab != end_with;
+           join_tab++)
       {
         if (*join_tab->on_expr_ref)
         {
           JOIN_TAB *cond_tab= join_tab->first_inner;
-          COND *tmp= make_cond_for_table(*join_tab->on_expr_ref,
+          COND *tmp= make_cond_for_table(thd, *join_tab->on_expr_ref,
                                          join->const_table_map,
-                                         (table_map) 0, FALSE);
+                                         (table_map) 0, MAX_TABLES, FALSE, FALSE);
           if (!tmp)
             continue;
           tmp= new Item_func_trig_cond(tmp, &cond_tab->not_null_compl);
@@ -7125,10 +8371,16 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
         }       
       }
 
-      /* Push down non-constant conditions from on expressions */
+
+      /* Push down non-constant conditions from ON expressions */
       JOIN_TAB *last_tab= tab;
+
+      /*
+        while we're inside of an outer join and last_tab is 
+        the last of its tables ... 
+      */
       while (first_inner_tab && first_inner_tab->last_inner == last_tab)
-      {  
+      { 
         /* 
           Table tab is the last inner table of an outer join.
           An on expression is always attached to it.
@@ -7137,15 +8389,29 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
 
         table_map used_tables2= (join->const_table_map |
                                  OUTER_REF_TABLE_BIT | RAND_TABLE_BIT);
-	for (tab= join->join_tab+join->const_tables; tab <= last_tab ; tab++)
+
+        start_from= tab->bush_root_tab? 
+                      tab->bush_root_tab->bush_children->start : 
+                      join->join_tab + join->const_tables;
+        for (JOIN_TAB *tab= start_from; tab <= last_tab; tab++)
         {
+          DBUG_ASSERT(tab->table);
           current_map= tab->table->map;
           used_tables2|= current_map;
-          COND *tmp_cond= make_cond_for_table(on_expr, used_tables2,
-                                              current_map, FALSE);
+          /*
+            psergey: have put the MAX_TABLES below. It's bad, will need to fix it.
+          */
+          COND *tmp_cond= make_cond_for_table(thd, on_expr, used_tables2,
+                                              current_map, /*(tab - first_tab)*/ MAX_TABLES,
+					      FALSE, FALSE);
+          if (tab == first_inner_tab && tab->on_precond)
+            add_cond_and_fix(thd, &tmp_cond, tab->on_precond);
           if (tmp_cond)
           {
             JOIN_TAB *cond_tab= tab < first_inner_tab ? first_inner_tab : tab;
+            Item **sel_cond_ref= tab < first_inner_tab ?
+                                   &first_inner_tab->on_precond :
+                                   &tab->select_cond;
             /*
               First add the guards for match variables of
               all embedding outer join operations.
@@ -7168,51 +8434,196 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
               tmp_cond->quick_fix_field();
 	    /* Add the predicate to other pushed down predicates */
             DBUG_PRINT("info", ("Item_cond_and"));
-            cond_tab->select_cond= !cond_tab->select_cond ? tmp_cond :
-	                          new Item_cond_and(cond_tab->select_cond,
-                                                    tmp_cond);
+            *sel_cond_ref= !(*sel_cond_ref) ? 
+                             tmp_cond :
+                             new Item_cond_and(*sel_cond_ref, tmp_cond);
             DBUG_PRINT("info", ("Item_cond_and 0x%lx",
-                                (ulong)cond_tab->select_cond));
-            if (!cond_tab->select_cond)
-	      DBUG_RETURN(1);
-            cond_tab->select_cond->quick_fix_field();
-            cond_tab->select_cond->update_used_tables();
+                                (ulong)(*sel_cond_ref)));
+            if (!(*sel_cond_ref))
+              DBUG_RETURN(1);
+            (*sel_cond_ref)->quick_fix_field();
+            (*sel_cond_ref)->update_used_tables();
             if (cond_tab->select)
               cond_tab->select->cond= cond_tab->select_cond;
-          }              
+          }
         }
         first_inner_tab= first_inner_tab->first_upper;       
       }
+    }
+  }
+  DBUG_RETURN(0);
+}
+
+
+static
+uint get_next_field_for_derived_key(uchar *arg)
+{
+  KEYUSE *keyuse= *(KEYUSE **) arg;
+  if (!keyuse)
+    return (uint) (-1);
+  TABLE *table= keyuse->table;
+  uint key= keyuse->key;
+  uint fldno= keyuse->keypart; 
+  uint keypart= keyuse->keypart_map == (key_part_map) 1 ?
+                                         0 : (keyuse-1)->keypart+1;
+  for ( ; 
+        keyuse->table == table && keyuse->key == key && keyuse->keypart == fldno;
+        keyuse++)
+    keyuse->keypart= keypart;
+  if (keyuse->key != key)
+    keyuse= 0;
+  *((KEYUSE **) arg)= keyuse;
+  return fldno;
+}
+
+
+static 
+bool generate_derived_keys_for_table(KEYUSE *keyuse, uint count, uint keys)
+{
+  TABLE *table= keyuse->table;
+  if (table->alloc_keys(keys))
+    return TRUE;
+  uint keyno= 0;
+  KEYUSE *first_keyuse= keyuse;
+  uint prev_part= keyuse->keypart;
+  uint parts= 0;
+  uint i= 0;
+
+  for ( ; i < count && keyno < keys; )
+  {
+    do
+    {
+      keyuse->key= keyno;
+      keyuse->keypart_map= (key_part_map) (1 << parts);     
+      keyuse++;
+      i++;
+    } 
+    while (i < count && keyuse->used_tables == first_keyuse->used_tables &&
+           keyuse->keypart == prev_part);
+    parts++;
+    if (i < count && keyuse->used_tables == first_keyuse->used_tables)
+    {
+      prev_part= keyuse->keypart;
+    }
+    else
+    {
+      if (table->add_tmp_key(keyno, parts, 
+                             get_next_field_for_derived_key, 
+                             (uchar *) &first_keyuse,
+                             FALSE))
+        return TRUE;
+      table->reginfo.join_tab->keys.set_bit(keyno);
+      first_keyuse= keyuse;
+      keyno++;
+      parts= 0;
+      prev_part= keyuse->keypart;
+    }
+  }             
+
+  return FALSE;
+}
+   
 
-      if (save_used_tables && !(used_tables & 
-                                ~(tab->emb_sj_nest->sj_inner_tables |
-                                  join->const_table_map | PSEUDO_TABLE_BITS)))
+static
+bool generate_derived_keys(DYNAMIC_ARRAY *keyuse_array)
+{
+  KEYUSE *keyuse= dynamic_element(keyuse_array, 0, KEYUSE*);
+  uint elements= keyuse_array->elements;
+  TABLE *prev_table= 0;
+  for (uint i= 0; i < elements; i++, keyuse++)
+  {
+    if (!keyuse->table)
+      break;
+    KEYUSE *first_table_keyuse= NULL;
+    table_map last_used_tables= 0;
+    uint count= 0;
+    uint keys= 0;
+    TABLE_LIST *derived= NULL;
+    if (keyuse->table != prev_table)
+      derived= keyuse->table->pos_in_table_list;
+    while (derived && derived->is_materialized_derived() && 
+           keyuse->key == MAX_KEY)
+    {
+      if (keyuse->table != prev_table)
       {
-        /*
-          We have reached the end of semi join nest. That is, the join order
-          looks like this:
-
-           outer_tbl1 SJ-Materialize(inner_tbl1 ... inner_tblN) outer_tbl ...
-                                                               ^
-                                                                \-we're here
-          At this point, we need to produce two conditions
-           - A condition that can be checked when we have all of the sj-inner
-             tables (inner_tbl1 ... inner_tblN). This will be used while doing
-             materialization.
-           - A condition that can be checked when we have all of the tables
-             in the prefix (both inner and outer).
-        */
-        tab->emb_sj_nest->sj_mat_info->join_cond= 
-          cond ?
-             make_cond_after_sjm(cond, cond, save_used_tables, used_tables):
-            NULL;
-        used_tables= save_used_tables | used_tables;
-        save_used_tables= 0;
+        prev_table= keyuse->table;
+        first_table_keyuse= keyuse;
+        last_used_tables= keyuse->used_tables;
+        count= 0;
+        keys= 0;
+      }
+      else if (keyuse->used_tables != last_used_tables)
+      {
+        keys++;
+        last_used_tables= keyuse->used_tables;
+      }
+      count++;
+      keyuse++;
+      if (keyuse->table != prev_table)
+      {
+        if (generate_derived_keys_for_table(first_table_keyuse, count, ++keys))
+          return TRUE;
+        keyuse--;
+	derived= NULL;
       }
-
     }
   }
-  DBUG_RETURN(0);
+  return FALSE;
+}
+
+
+/*
+  @brief
+  Drops unused keys for each materialized derived table/view
+
+  @details
+  For materialized derived tables only ref access can be used, and it employs
+  only one index, thus we don't need the rest. For each materialized derived
+  table/view call TABLE::use_index to save one index chosen by the optimizer
+  and free the others. If no key is chosen then all keys will be dropped.
+*/
+
+void JOIN::drop_unused_derived_keys()
+{
+  JOIN_TAB *tab;
+  for (tab= first_linear_tab(this, WITHOUT_CONST_TABLES); 
+       tab; 
+       tab= next_linear_tab(this, tab, WITHOUT_BUSH_ROOTS))
+  {
+    
+    TABLE *table=tab->table;
+    if (!table)
+      continue;
+    if (!table->pos_in_table_list->is_materialized_derived() ||
+        table->max_keys <= 1)
+      continue;
+    table->use_index(tab->ref.key);
+    if (table->s->keys)
+      tab->ref.key= 0;
+  }
+}
+
+
+/*
+  Evaluate the bitmap of used tables for items from the select list
+*/
+
+inline void JOIN::eval_select_list_used_tables()
+{
+  select_list_used_tables= 0;
+  Item *item;
+  List_iterator_fast<Item> it(fields_list);
+  while ((item= it++))
+  {
+    select_list_used_tables|= item->used_tables();
+  }
+  Item_outer_ref *ref;
+  List_iterator_fast<Item_outer_ref> ref_it(select_lex->inner_refs_list);
+  while ((ref= ref_it++))
+  {
+    item= ref->outer_ref;
+    select_list_used_tables|= item->used_tables();
+  }
 }
 
 
@@ -7240,11 +8651,17 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
 
 static uint make_join_orderinfo(JOIN *join)
 {
+  /*
+    This function needs to be fixed to take into account that we now have SJM
+    nests.
+  */
+  DBUG_ASSERT(0);
+
   JOIN_TAB *tab;
   if (join->need_tmp)
-    return join->tables;
+    return join->table_count;
   tab= join->get_sort_by_join_tab();
-  return tab ? tab-join->join_tab : join->tables;
+  return tab ? tab-join->join_tab : join->table_count;
 }
 
 /*
@@ -7268,17 +8685,37 @@ void set_join_cache_denial(JOIN_TAB *join_tab)
 {
   if (join_tab->cache)
   {
+    /* 
+      If there is a previous cache linked to this cache through the
+      next_cache pointer: remove the link. 
+    */
+    if (join_tab->cache->prev_cache)
+      join_tab->cache->prev_cache->next_cache= 0;
+    /*
+      No need to do the same for next_cache since cache denial is done
+      backwards starting from the latest cache in the linked list (see
+      revise_cache_usage()).
+    */
+    DBUG_ASSERT(!join_tab->cache->next_cache);
+
     join_tab->cache->free();
     join_tab->cache= 0;
   }
   if (join_tab->use_join_cache)
   {
     join_tab->use_join_cache= FALSE;
+    join_tab->used_join_cache_level= 0;
     /*
       It could be only sub_select(). It could not be sub_seject_sjm because we
       don't do join buffering for the first table in sjm nest. 
     */
     join_tab[-1].next_select= sub_select;
+    if (join_tab->type == JT_REF && join_tab->is_ref_for_hash_join())
+    {
+      join_tab->type= JT_ALL;
+      join_tab->ref.key_parts= 0;
+    }
+    join_tab->join->return_tab= join_tab;
   }
 }
 
@@ -7398,7 +8835,7 @@ void revise_cache_usage(JOIN_TAB *join_tab)
   SYNOPSIS
     end_sj_materialize()
       join            The join 
-      join_tab        Last join table
+      join_tab        Points to right after the last join_tab in materialization bush
       end_of_records  FALSE <=> This call is made to pass another record 
                                 combination
                       TRUE  <=> EOF (no action)
@@ -7416,7 +8853,7 @@ void revise_cache_usage(JOIN_TAB *join_tab)
     NESTED_LOOP_ERROR
 */
 
-static enum_nested_loop_state 
+enum_nested_loop_state 
 end_sj_materialize(JOIN *join, JOIN_TAB *join_tab, bool end_of_records)
 {
   int error;
@@ -7437,7 +8874,7 @@ end_sj_materialize(JOIN *join, JOIN_TAB *join_tab, bool end_of_records)
     fill_record(thd, table->field, sjm->sjm_table_cols, TRUE, FALSE);
     if (thd->is_error())
       DBUG_RETURN(NESTED_LOOP_ERROR); /* purecov: inspected */
-    if ((error= table->file->ha_write_row(table->record[0])))
+    if ((error= table->file->ha_write_tmp_row(table->record[0])))
     {
       /* create_myisam_from_heap will generate error if needed */
       if (table->file->is_fatal_error(error, HA_CHECK_DUP) &&
@@ -7457,11 +8894,9 @@ end_sj_materialize(JOIN *join, JOIN_TAB *join_tab, bool end_of_records)
   SYNOPSIS
     check_join_cache_usage()
       tab                 joined table to check join buffer usage for
-      join                join for which the check is performed
       options             options of the join
       no_jbuf_after       don't use join buffering after table with this number
-      icp_other_tables_ok OUT TRUE if condition pushdown supports
-                          other tables presence
+      prev_tab            previous join table
 
   DESCRIPTION
     The function finds out whether the table 'tab' can be joined using a join
@@ -7473,24 +8908,65 @@ end_sj_materialize(JOIN *join, JOIN_TAB *join_tab, bool end_of_records)
     depend on:
       - the access method to access rows of the joined table
       - whether the join table is an inner table of an outer join or semi-join
+      - whether the optimizer switches
+          outer_join_with_cache, semijoin_with_cache, join_cache_incremental,
+          join_cache_hashed, join_cache_bka,
+        are set on or off
       - the join cache level set for the query
       - the join 'options'.
+
     In any case join buffer is not used if the number of the joined table is
     greater than 'no_jbuf_after'. It's also never used if the value of
     join_cache_level is equal to 0.
-    The other valid settings of join_cache_level lay in the interval 1..8.
-    If join_cache_level==1|2 then join buffer is used only for inner joins
-    with 'JT_ALL' access method.  
-    If join_cache_level==3|4 then join buffer is used for any join operation
-    (inner join, outer join, semi-join) with 'JT_ALL' access method.
-    If 'JT_ALL' access method is used to read rows of the joined table then
-    always a JOIN_CACHE_BNL object is employed.
+    If the optimizer switch outer_join_with_cache is off no join buffer is
+    used for outer join operations.
+    If the optimizer switch semijoin_with_cache is off no join buffer is used
+    for semi-join operations.
+    If the optimizer switch join_cache_incremental is off no incremental join
+    buffers are used.
+    If the optimizer switch join_cache_hashed is off then the optimizer uses
+    neither BNLH algorithm, nor BKAH algorithm to perform join operations.
+
+    If the optimizer switch join_cache_bka is off then the optimizer uses
+    neither BKA algorithm, nor BKAH algorithm to perform join operation.
+    The valid settings for join_cache_level lie in the interval 0..8.
+    If it is set to 0 no join buffers are used to perform join operations.
+    Currently we differentiate between join caches of 8 levels:
+      1 : non-incremental join cache used for BNL join algorithm
+      2 : incremental join cache used for BNL join algorithm
+      3 : non-incremental join cache used for BNLH join algorithm
+      4 : incremental join cache used for BNLH join algorithm
+      5 : non-incremental join cache used for BKA join algorithm
+      6 : incremental join cache used for BKA join algorithm 
+      7 : non-incremental join cache used for BKAH join algorithm 
+      8 : incremental join cache used for BKAH join algorithm
+    If the value of join_cache_level is set to n then no join caches of
+    levels higher than n can be employed.
+
+    If the optimizer switches outer_join_with_cache, semijoin_with_cache,
+    join_cache_incremental, join_cache_hashed, join_cache_bka are all on
+    the following rules are applied.
+    If join_cache_level==1|2 then join buffer is used for inner joins, outer
+    joins and semi-joins with 'JT_ALL' access method. In this case a
+    JOIN_CACHE_BNL object is employed.
+    If join_cache_level==3|4 and join buffer is used for a join operation
+    (inner join, outer join, semi-join) with 'JT_REF'/'JT_EQ_REF' access method
+    then a JOIN_CACHE_BNLH object is employed.
     If an index is used to access rows of the joined table and the value of
     join_cache_level==5|6 then a JOIN_CACHE_BKA object is employed. 
     If an index is used to access rows of the joined table and the value of
-    join_cache_level==7|8 then a JOIN_CACHE_BKA_UNIQUE object is employed. 
+    join_cache_level==7|8 then a JOIN_CACHE_BKAH object is employed. 
     If the value of join_cache_level is odd then creation of a non-linked 
     join cache is forced.
+
+    Currently for any join operation a join cache of the highest allowed
+    and applicable level is used.
+    For example, if join_cache_level is set to 6 and the optimizer switch
+    join_cache_bka is off, while the optimizer switch join_cache_hashed is
+    on then for any inner join operation with JT_REF/JT_EQ_REF access method
+    to the joined table the BNLH join algorithm will be used, while for
+    the table accessed by the JT_ALL method the BNL algorithm will be used.
+
     If the function decides that a join buffer can be used to join the table
     'tab' then it sets the value of tab->use_join_buffer to TRUE and assigns
     the selected join cache object to the field 'cache' of the previous
@@ -7498,19 +8974,31 @@ end_sj_materialize(JOIN *join, JOIN_TAB *join_tab, bool end_of_records)
     If the function creates a join cache object it tries to initialize it. The
     failure to do this results in an invocation of the function that destructs
     the created object.
+    If the function decides that for some reason no join buffer can be used
+    for a table it calls the function revise_cache_usage that checks
+    whether join cache should be denied for some previous tables. In this case
+    a pointer to the first table for which join cache usage has been denied
+    is passed in join->return_tab (see the function set_join_cache_denial).
+    
+    The function changes the values of the fields tab->icp_other_tables_ok and
+    tab->idx_cond_fact_out to FALSE if the chosen join cache algorithm 
+    requires it.
  
   NOTES
     An inner table of a nested outer join or a nested semi-join can be currently
     joined only when a linked cache object is employed. In these cases setting
-    join cache level to an odd number results in denial of usage of any join
+    join_cache_incremental to 'off' results in denial of usage of any join
     buffer when joining the table.
     For a nested outer join/semi-join, currently, we either use join buffers for
     all inner tables or for none of them. 
     Some engines (e.g. Falcon) currently allow to use only a join cache
-    of the type JOIN_CACHE_BKA_UNIQUE when the joined table is accessed through
+    of the type JOIN_CACHE_BKAH when the joined table is accessed through
     an index. For these engines setting the value of join_cache_level to 5 or 6
     results in that no join buffer is used to join the table. 
   
+  RETURN VALUE
+    cache level if cache is used, otherwise returns 0
+
   TODO
     Support BKA inside SJ-Materialization nests. When doing this, we'll need
     to only store sj-inner tables in the join buffer.
@@ -7534,56 +9022,76 @@ end_sj_materialize(JOIN *join, JOIN_TAB *join_tab, bool end_of_records)
           first_tab= join->join_tab + first_sjm_table;
         }
 #endif
-
-  RETURN
-
-    cache level if cache is used, otherwise returns 0
 */
 
 static
 uint check_join_cache_usage(JOIN_TAB *tab,
-                            JOIN *join, ulonglong options,
+                            ulonglong options,
                             uint no_jbuf_after,
-                            bool *icp_other_tables_ok)
+                            uint table_index,
+                            JOIN_TAB *prev_tab)
 {
-  uint flags;
   COST_VECT cost;
-  ha_rows rows;
+  uint flags= 0;
+  ha_rows rows= 0;
   uint bufsz= 4096;
   JOIN_CACHE *prev_cache=0;
-  uint cache_level= join->thd->variables.join_cache_level;
-  bool force_unlinked_cache= test(cache_level & 1);
-  uint i= tab - join->join_tab;
+  JOIN *join= tab->join;
+  uint cache_level= tab->used_join_cache_level;
+  bool force_unlinked_cache=
+         !(join->allowed_join_cache_types & JOIN_CACHE_INCREMENTAL_BIT);
+  bool no_hashed_cache=
+         !(join->allowed_join_cache_types & JOIN_CACHE_HASHED_BIT);
+  bool no_bka_cache= 
+         !(join->allowed_join_cache_types & JOIN_CACHE_BKA_BIT);
 
-  *icp_other_tables_ok= TRUE;
-  if (cache_level == 0 || i == join->const_tables)
+  join->return_tab= 0;
+
+  /*
+    Don't use join cache if @@join_cache_level==0 or this table is the first
+    one in a join suborder (either at top level or inside a bush)
+  */
+  if (cache_level == 0 || !prev_tab)
     return 0;
 
+  if (force_unlinked_cache && (cache_level%2 == 0))
+    cache_level--;
+
   if (options & SELECT_NO_JOIN_CACHE)
     goto no_join_cache;
-  /* 
-    psergey-todo: why the below when execution code seems to handle the
-    "range checked for each record" case?
-  */
+
   if (tab->use_quick == 2)
     goto no_join_cache;
+
+  if (tab->is_inner_table_of_semi_join_with_first_match() &&
+      !join->allowed_semijoin_with_cache)
+    goto no_join_cache;
+  if (tab->is_inner_table_of_outer_join() &&
+      !join->allowed_outer_join_with_cache)
+    goto no_join_cache;
+
   /*
     Non-linked join buffers can't guarantee one match
   */
-   if (force_unlinked_cache && 
-       (!tab->type == JT_ALL || cache_level <= 4) && 
-       ((tab->is_inner_table_of_semi_join_with_first_match() &&
-         !tab->is_single_inner_of_semi_join_with_first_match()) ||
-        (tab->is_inner_table_of_outer_join() &&
-         !tab->is_single_inner_of_outer_join())))
+  if (tab->is_nested_inner())
+  {
+    if (force_unlinked_cache || cache_level == 1)
+      goto no_join_cache;
+    if (cache_level & 1)
+      cache_level--;
+  }
+    
+  /*
+    Don't use join buffering if we're dictated not to by no_jbuf_after
+    (This is not meaningfully used currently)
+  */
+  if (table_index > no_jbuf_after)
     goto no_join_cache;
-
+  
   /*
-    Don't use join buffering if we're dictated not to by no_jbuf_after (this
-    ...)
+    TODO: BNL join buffer should be perfectly ok with tab->bush_children.
   */
-  if (!(i <= no_jbuf_after) || tab->loosescan_match_tab || 
-      sj_is_materialize_strategy(join->best_positions[i].sj_strategy))
+  if (tab->loosescan_match_tab || tab->bush_children)
     goto no_join_cache;
 
   for (JOIN_TAB *first_inner= tab->first_inner; first_inner;
@@ -7595,7 +9103,7 @@ uint check_join_cache_usage(JOIN_TAB *tab,
   if (tab->first_sj_inner_tab && tab->first_sj_inner_tab != tab &&
       !tab->first_sj_inner_tab->use_join_cache)
     goto no_join_cache;
-  if (!tab[-1].use_join_cache)
+  if (!prev_tab->use_join_cache)
   {
     /* 
       Check whether table tab and the previous one belong to the same nest of
@@ -7616,52 +9124,191 @@ uint check_join_cache_usage(JOIN_TAB *tab,
       goto no_join_cache; 
   }       
 
-  if (!force_unlinked_cache)
-    prev_cache= tab[-1].cache;
+  prev_cache= prev_tab->cache;
 
   switch (tab->type) {
   case JT_ALL:
-    if (cache_level <= 2 && (tab->first_inner || tab->first_sj_inner_tab))
-      goto no_join_cache;
-    if ((options & SELECT_DESCRIBE) ||
-        (((tab->cache= new JOIN_CACHE_BNL(join, tab, prev_cache))) &&
-         !tab->cache->init()))
+    if (cache_level == 1)
+      prev_cache= 0;
+    if ((tab->cache= new JOIN_CACHE_BNL(join, tab, prev_cache)) &&
+        ((options & SELECT_DESCRIBE) || !tab->cache->init()))
     {
-      *icp_other_tables_ok= FALSE;
-      return cache_level;
+      tab->icp_other_tables_ok= FALSE;
+      return (2-test(!prev_cache));
     }
     goto no_join_cache;
   case JT_SYSTEM:
   case JT_CONST:
   case JT_REF:
   case JT_EQ_REF:
-    if (cache_level <= 4)
-      return 0;
-    flags= HA_MRR_NO_NULL_ENDPOINTS;
-    if (tab->table->covering_keys.is_set(tab->ref.key))
-      flags|= HA_MRR_INDEX_ONLY;
-    rows= tab->table->file->multi_range_read_info(tab->ref.key, 10, 20,
-                                                  &bufsz, &flags, &cost);
-    if ((rows != HA_POS_ERROR) && !(flags & HA_MRR_USE_DEFAULT_IMPL) &&
-        (!(flags & HA_MRR_NO_ASSOCIATION) || cache_level > 6) &&
-        ((options & SELECT_DESCRIBE) ||
-         (((cache_level <= 6 && 
-           (tab->cache= new JOIN_CACHE_BKA(join, tab, flags, prev_cache))) ||
-	  (cache_level > 6 &&  
-           (tab->cache= new JOIN_CACHE_BKA_UNIQUE(join, tab, flags, prev_cache)))
-           ) && !tab->cache->init())))
-      return cache_level;
+    if (cache_level <=2 || (no_hashed_cache && no_bka_cache))
+      goto no_join_cache;
+    if (!tab->is_ref_for_hash_join())
+    {
+      flags= HA_MRR_NO_NULL_ENDPOINTS | HA_MRR_SINGLE_POINT;
+      if (tab->table->covering_keys.is_set(tab->ref.key))
+        flags|= HA_MRR_INDEX_ONLY;
+      rows= tab->table->file->multi_range_read_info(tab->ref.key, 10, 20,
+                                                    tab->ref.key_parts,
+                                                    &bufsz, &flags, &cost);
+    }
+
+    if ((cache_level <=4 && !no_hashed_cache) || no_bka_cache ||
+        tab->is_ref_for_hash_join() ||
+	((flags & HA_MRR_NO_ASSOCIATION) && cache_level <=6))
+    {
+      if (!tab->hash_join_is_possible() ||
+          tab->make_scan_filter())
+        goto no_join_cache;
+      if (cache_level == 3)
+        prev_cache= 0;
+      if ((tab->cache= new JOIN_CACHE_BNLH(join, tab, prev_cache)) &&
+          ((options & SELECT_DESCRIBE) || !tab->cache->init()))
+      {
+        tab->icp_other_tables_ok= FALSE;        
+        return (4-test(!prev_cache));
+      }
+      goto no_join_cache;
+    }
+    if (cache_level > 4 && no_bka_cache)
+      goto no_join_cache;
+    
+    if ((flags & HA_MRR_NO_ASSOCIATION) &&
+	(cache_level <= 6 || no_hashed_cache))
+      goto no_join_cache;
+
+    if ((rows != HA_POS_ERROR) && !(flags & HA_MRR_USE_DEFAULT_IMPL))
+    {
+      if (cache_level <= 6 || no_hashed_cache)
+      {
+        if (cache_level == 5)
+          prev_cache= 0;
+        if ((tab->cache= new JOIN_CACHE_BKA(join, tab, flags, prev_cache)) &&
+            ((options & SELECT_DESCRIBE) || !tab->cache->init()))
+          return (6-test(!prev_cache));
+        goto no_join_cache;
+      }
+      else
+      {
+        if (cache_level == 7)
+          prev_cache= 0;
+        if ((tab->cache= new JOIN_CACHE_BKAH(join, tab, flags, prev_cache)) &&
+            ((options & SELECT_DESCRIBE) || !tab->cache->init()))
+	{
+         tab->idx_cond_fact_out= FALSE;
+          return (8-test(!prev_cache));
+        }
+        goto no_join_cache;
+      }
+    }
     goto no_join_cache;
   default : ;
   }
 
 no_join_cache:
-  if (cache_level>2)
-    revise_cache_usage(tab); 
+  if (tab->type != JT_ALL && tab->is_ref_for_hash_join())
+    tab->type= JT_ALL;
+  revise_cache_usage(tab); 
   return 0;
 }
 
 
+/* 
+  Check whether join buffers can be used to join tables of a join   
+
+  SYNOPSIS
+    check_join_cache_usage_for_tables()
+      join                join whose tables are to be checked             
+      options             options of the join
+      no_jbuf_after       don't use join buffering after table with this number
+                          (The tables are assumed to be numbered in
+                          first_linear_tab(join, WITHOUT_CONST_TABLES),
+                          next_linear_tab(join, WITH_CONST_TABLES) order).
+
+  DESCRIPTION
+    For each table after the first non-constant table the function checks
+    whether the table can be joined using a join buffer. If the function decides
+    that a join buffer can be employed then it selects the most appropriate join
+    cache object that contains this join buffer whose level is not greater
+    than join_cache_level set for the join. To make this check the function
+    calls the function check_join_cache_usage for every non-constant table.
+
+  NOTES
+    In some situations (e.g. for nested outer joins, for nested semi-joins) only
+    incremental buffers can be used. If it turns out that for some inner table
+    no join buffer can be used then any inner table of an outer/semi-join nest
+    cannot use join buffer. In the case when already chosen buffer must be
+    denied for a table the function calls check_join_cache_usage() again
+    starting from this table. The pointer to the table from which the check
+    has to be restarted is returned in join->return_tab (see the description
+    of check_join_cache_usage).
+*/
+
+void check_join_cache_usage_for_tables(JOIN *join, ulonglong options,
+                                       uint no_jbuf_after)
+{
+  JOIN_TAB *tab;
+  JOIN_TAB *prev_tab;
+
+  for (tab= first_linear_tab(join, WITHOUT_CONST_TABLES); 
+       tab; 
+       tab= next_linear_tab(join, tab, WITH_BUSH_ROOTS))
+  {
+    tab->used_join_cache_level= join->max_allowed_join_cache_level;  
+  }
+  
+  uint idx= join->const_tables;
+  for (tab= first_linear_tab(join, WITHOUT_CONST_TABLES); 
+       tab; 
+       tab= next_linear_tab(join, tab, WITH_BUSH_ROOTS))
+  {
+restart:
+    tab->icp_other_tables_ok= TRUE;
+    tab->idx_cond_fact_out= TRUE;
+    
+    /* 
+      Check if we have a preceding join_tab, as something that will feed us
+      records that we could buffer. We don't have it, if 
+       - this is the first non-const table in the join order,
+       - this is the first table inside an SJM nest.
+    */
+    prev_tab= tab - 1;
+    if (tab == join->join_tab + join->const_tables ||
+        (tab->bush_root_tab && tab->bush_root_tab->bush_children->start == tab))
+      prev_tab= NULL;
+
+    switch (tab->type) {
+    case JT_SYSTEM:
+    case JT_CONST:
+    case JT_EQ_REF:
+    case JT_REF:
+    case JT_REF_OR_NULL:
+    case JT_ALL:
+      tab->used_join_cache_level= check_join_cache_usage(tab, options,
+                                                         no_jbuf_after,
+                                                         idx,
+                                                         prev_tab);
+      tab->use_join_cache= test(tab->used_join_cache_level);
+      /*
+        psergey-merge: todo: raise the question that this is really stupid that
+        we can first allocate a join buffer, then decide not to use it and free
+        it.
+      */
+      if (join->return_tab)
+      {
+        tab= join->return_tab;
+        goto restart;
+      }
+      break; 
+    default:
+      tab->used_join_cache_level= 0;
+    }
+    if (!tab->bush_children)
+      idx++;
+  }
+}
+
+
 /*
   Plan refinement stage: do various setup things for the executor
 
@@ -7687,29 +9334,78 @@ no_join_cache:
 static bool
 make_join_readinfo(JOIN *join, ulonglong options, uint no_jbuf_after)
 {
+  JOIN_TAB *tab;
   uint i;
-  bool statistics= test(!(join->select_options & SELECT_DESCRIBE));
-  bool sorted= 1;
-  uint first_sjm_table= MAX_TABLES;
-  uint last_sjm_table= MAX_TABLES;
   DBUG_ENTER("make_join_readinfo");
 
+  bool statistics= test(!(join->select_options & SELECT_DESCRIBE));
+  bool sorted= 1;
 
   if (!join->select_lex->sj_nests.is_empty() &&
       setup_semijoin_dups_elimination(join, options, no_jbuf_after))
     DBUG_RETURN(TRUE); /* purecov: inspected */
+  
+  /* For const tables, set partial_join_cardinality to 1. */
+  for (tab= join->join_tab; tab != join->join_tab + join->const_tables; tab++)
+    tab->partial_join_cardinality= 1; 
+
+  JOIN_TAB *prev_tab= NULL;
+  for (tab= first_linear_tab(join, WITHOUT_CONST_TABLES), i= join->const_tables; 
+       tab; 
+       prev_tab=tab, tab= next_linear_tab(join, tab, WITH_BUSH_ROOTS))
+  {
+    /*
+      The approximation below for partial join cardinality is not good because
+        - it does not take into account some pushdown predicates
+        - it does not differentiate between inner joins, outer joins and
+        semi-joins.
+      Later it should be improved.
+    */
+
+    if (tab->bush_root_tab && tab->bush_root_tab->bush_children->start == tab)
+      prev_tab= NULL;
+    DBUG_ASSERT(tab->bush_children || tab->table == join->best_positions[i].table->table);
 
-  for (i=join->const_tables ; i < join->tables ; i++)
+    tab->partial_join_cardinality= join->best_positions[i].records_read *
+                                   (prev_tab? prev_tab->partial_join_cardinality : 1);
+    if (!tab->bush_children)
+      i++;
+  }
+ 
+  check_join_cache_usage_for_tables(join, options, no_jbuf_after);
+  
+  JOIN_TAB *first_tab;
+  for (tab= first_tab= first_linear_tab(join, WITHOUT_CONST_TABLES); 
+       tab; 
+       tab= next_linear_tab(join, tab, WITH_BUSH_ROOTS))
   {
-    JOIN_TAB *tab=join->join_tab+i;
+    if (tab->bush_children)
+    {
+      if (setup_sj_materialization_part2(tab))
+        return TRUE;
+    }
+
     TABLE *table=tab->table;
-    bool icp_other_tables_ok;
+    uint jcl= tab->used_join_cache_level;
     tab->read_record.table= table;
     tab->read_record.file=table->file;
     tab->read_record.unlock_row= rr_unlock_row;
-    tab->next_select=sub_select;		/* normal select */
     tab->sorted= sorted;
     sorted= 0;                                  // only first must be sorted
+    
+
+    /*
+      We should not set tab->next_select for the last table in the
+      SJM-nest, as setup_sj_materialization() has already set it to
+      end_sj_materialize.
+    */
+    if (!(tab->bush_root_tab && 
+          tab->bush_root_tab->bush_children->end == tab + 1))
+    {
+      tab->next_select=sub_select;		/* normal select */
+    }
+
+
     if (tab->loosescan_match_tab)
     {
       if (!(tab->loosescan_buf= (uchar*)join->thd->alloc(tab->
@@ -7717,61 +9413,41 @@ make_join_readinfo(JOIN *join, ulonglong options, uint no_jbuf_after)
         return TRUE; /* purecov: inspected */
       tab->sorted= TRUE;
     }
-    if (sj_is_materialize_strategy(join->best_positions[i].sj_strategy))
-    {
-      /* This is a start of semi-join nest */
-      first_sjm_table= i;
-      last_sjm_table= i + join->best_positions[i].n_sj_tables;
-      if (i == join->const_tables)
-        join->first_select= sub_select_sjm;
-      else
-       tab[-1].next_select= sub_select_sjm;
-
-      if (setup_sj_materialization(tab))
-        return TRUE;
-    }
     table->status=STATUS_NO_RECORD;
     pick_table_access_method (tab);
 
+    if (jcl)
+       tab[-1].next_select=sub_select_cache;
+
+    if (tab->cache && tab->cache->get_join_alg() == JOIN_CACHE::BNLH_JOIN_ALG)
+      tab->type= JT_HASH;
+      
     switch (tab->type) {
     case JT_SYSTEM:				// Only happens with left join 
     case JT_CONST:				// Only happens with left join
       /* Only happens with outer joins */
       tab->read_first_record= tab->type == JT_SYSTEM ?
                                 join_read_system :join_read_const;
-      if (check_join_cache_usage(tab, join, options, no_jbuf_after,
-                                 &icp_other_tables_ok))
-      {
-        tab->use_join_cache= TRUE;
-        tab[-1].next_select=sub_select_cache;
-      }
-      else
       if (table->covering_keys.is_set(tab->ref.key) &&
           !table->no_keyread)
       {
         table->key_read=1;
         table->file->extra(HA_EXTRA_KEYREAD);
       }
-      else
-        push_index_cond(tab, tab->ref.key, icp_other_tables_ok);
+      else if (!jcl || jcl > 4) 
+        push_index_cond(tab, tab->ref.key);
         break;
     case JT_EQ_REF:
       tab->read_record.unlock_row= join_read_key_unlock_row;
       /* fall through */
-      if (check_join_cache_usage(tab, join, options, no_jbuf_after,
-                                 &icp_other_tables_ok))
-      {
-        tab->use_join_cache= TRUE;
-        tab[-1].next_select=sub_select_cache;
-      }
       if (table->covering_keys.is_set(tab->ref.key) &&
 	  !table->no_keyread)
       {
 	table->key_read=1;
 	table->file->extra(HA_EXTRA_KEYREAD);
       }
-      else
-        push_index_cond(tab, tab->ref.key, icp_other_tables_ok );
+      else if (!jcl || jcl > 4) 
+        push_index_cond(tab, tab->ref.key);
       break;
     case JT_REF_OR_NULL:
     case JT_REF:
@@ -7782,31 +9458,20 @@ make_join_readinfo(JOIN *join, ulonglong options, uint no_jbuf_after)
       }
       delete tab->quick;
       tab->quick=0;
-      if (check_join_cache_usage(tab, join, options, no_jbuf_after,
-                                 &icp_other_tables_ok))
-      {
-        tab->use_join_cache= TRUE;
-        tab[-1].next_select=sub_select_cache;
-      }
       if (table->covering_keys.is_set(tab->ref.key) &&
 	  !table->no_keyread)
         table->enable_keyread();
-      else
-        push_index_cond(tab, tab->ref.key, icp_other_tables_ok);
+      else if (!jcl || jcl > 4)
+        push_index_cond(tab, tab->ref.key);
       break;
     case JT_ALL:
+    case JT_HASH:
       /*
 	If previous table use cache
         If the incoming data set is already sorted don't use cache.
         Also don't use cache if this is the first table in semi-join
           materialization nest.
       */
-      if (check_join_cache_usage(tab, join, options, no_jbuf_after,
-                                 &icp_other_tables_ok))
-      {
-        tab->use_join_cache= TRUE;
-        tab[-1].next_select=sub_select_cache;
-      }
       /* These init changes read_record */
       if (tab->use_quick == 2)
       {
@@ -7817,8 +9482,9 @@ make_join_readinfo(JOIN *join, ulonglong options, uint no_jbuf_after)
       }
       else
       {
-	tab->read_first_record= join_init_read_record;
-	if (i == join->const_tables)
+        if (!tab->bush_children)
+          tab->read_first_record= join_init_read_record;
+	if (tab == first_tab)
 	{
 	  if (tab->select && tab->select->quick)
 	  {
@@ -7840,14 +9506,16 @@ make_join_readinfo(JOIN *join, ulonglong options, uint no_jbuf_after)
 	  if (tab->select && tab->select->quick)
 	  {
 	    if (statistics)
-	      status_var_increment(join->thd->status_var.select_full_range_join_count);
+	      status_var_increment(join->thd->status_var.
+                                   select_full_range_join_count);
 	  }
 	  else
 	  {
 	    join->thd->server_status|=SERVER_QUERY_NO_INDEX_USED;
 	    if (statistics)
 	    {
-	      status_var_increment(join->thd->status_var.select_full_join_count);
+	      status_var_increment(join->thd->status_var.
+                                   select_full_join_count);
 	      join->thd->query_plan_flags|= QPLAN_FULL_JOIN;
 	    }
 	  }
@@ -7861,50 +9529,61 @@ make_join_readinfo(JOIN *join, ulonglong options, uint no_jbuf_after)
 	  else if (!table->covering_keys.is_clear_all() &&
 		   !(tab->select && tab->select->quick))
 	  {					// Only read index tree
+#ifdef BAD_OPTIMIZATION
 	    /*
-            It has turned out that the below change, while speeding things
-            up for disk-bound loads, slows them down for cases when the data
-            is in disk cache (see BUG#35850):
-	    //  See bug #26447: "Using the clustered index for a table scan
-	    //  is always faster than using a secondary index".
+              It has turned out that the below change, while speeding things
+              up for disk-bound loads, slows them down for cases when the data
+              is in disk cache (see BUG#35850):
+              See bug #26447: "Using the clustered index for a table scan
+              is always faster than using a secondary index".
+            */
             if (table->s->primary_key != MAX_KEY &&
                 table->file->primary_key_is_clustered())
               tab->index= table->s->primary_key;
             else
-	    */
+#endif
               tab->index=find_shortest_key(table, & table->covering_keys);
 	    tab->read_first_record= join_read_first;
-	    tab->type=JT_NEXT;		// Read with index_first / index_next
+            /* Read with index_first / index_next */
+	    tab->type= tab->type == JT_ALL ? JT_NEXT : JT_HASH_NEXT;		
 	  }
 	}
         if (tab->select && tab->select->quick &&
             tab->select->quick->index != MAX_KEY && ! tab->table->key_read)
-          push_index_cond(tab, tab->select->quick->index, icp_other_tables_ok);
+          push_index_cond(tab, tab->select->quick->index);
       }
       break;
     case JT_FT:
       break;
+      /* purecov: begin deadcode */
     default:
-      DBUG_PRINT("error",("Table type %d found",tab->type)); /* purecov: deadcode */
-      break;					/* purecov: deadcode */
+      DBUG_PRINT("error",("Table type %d found",tab->type));
+      break;
     case JT_UNKNOWN:
     case JT_MAYBE_REF:
-      abort();					/* purecov: deadcode */
+      abort();
+      /* purecov: end */
     }
   }
-  join->join_tab[join->tables-1].next_select=0; /* Set by do_select */
+  uint n_top_tables= join->join_tab_ranges.head()->end -  
+                     join->join_tab_ranges.head()->start;
+
+  join->join_tab[n_top_tables - 1].next_select=0;  /* Set by do_select */
   
-/*
+  /*
     If a join buffer is used to join a table the ordering by an index
     for the first non-constant table cannot be employed anymore.
   */
-  for (i=join->const_tables ; i < join->tables ; i++)
+  for (tab= join->join_tab + join->const_tables ; 
+       tab != join->join_tab + n_top_tables ; tab++)
   {
-    JOIN_TAB *tab=join->join_tab+i;
     if (tab->use_join_cache)
     {
-      JOIN_TAB *sort_by_tab= join->get_sort_by_join_tab();
-      if (sort_by_tab && !join->need_tmp)
+       JOIN_TAB *sort_by_tab= join->group && join->simple_group &&
+                              join->group_list ?
+			       join->join_tab+join->const_tables :
+                               join->get_sort_by_join_tab();
+     if (sort_by_tab)
       {
         join->need_tmp= 1;
         join->simple_order= join->simple_group= 0;
@@ -7913,6 +9592,11 @@ make_join_readinfo(JOIN *join, ulonglong options, uint no_jbuf_after)
           sort_by_tab->type= JT_ALL;
           sort_by_tab->read_first_record= join_init_read_record;
         }
+        else if (sort_by_tab->type == JT_HASH_NEXT)
+        {
+          sort_by_tab->type= JT_HASH;
+          sort_by_tab->read_first_record= join_init_read_record;
+        }
       }
       break;
     }
@@ -7938,9 +9622,8 @@ make_join_readinfo(JOIN *join, ulonglong options, uint no_jbuf_after)
 
 bool error_if_full_join(JOIN *join)
 {
-  for (JOIN_TAB *tab=join->join_tab, *end=join->join_tab+join->tables;
-       tab < end;
-       tab++)
+  for (JOIN_TAB *tab=first_top_level_tab(join, WITH_CONST_TABLES); tab;
+       tab= next_top_level_tab(join, tab))
   {
     if (tab->type == JT_ALL && (!tab->select || !tab->select->quick))
     {
@@ -7957,10 +9640,17 @@ bool error_if_full_join(JOIN *join)
 
 /**
   cleanup JOIN_TAB.
+
+  DESCRIPTION 
+    This is invoked when we've finished all join executions.
 */
 
 void JOIN_TAB::cleanup()
 {
+  DBUG_ENTER("JOIN_TAB::cleanup");
+  DBUG_PRINT("enter", ("table %s.%s",
+                       (table ? table->s->db.str : "?"),
+                       (table ? table->s->table_name.str : "?")));
   delete select;
   select= 0;
   delete quick;
@@ -7975,6 +9665,15 @@ void JOIN_TAB::cleanup()
   {
     table->disable_keyread();
     table->file->ha_index_or_rnd_end();
+    preread_init_done= FALSE;
+    if (table->pos_in_table_list && 
+        table->pos_in_table_list->jtbm_subselect)
+    {
+      end_read_record(&read_record);
+      //psergey-merge:
+      table->pos_in_table_list->jtbm_subselect->cleanup();
+      DBUG_VOID_RETURN;
+    }
     /*
       We need to reset this for next select
       (Tested in part_of_refkey)
@@ -7982,15 +9681,79 @@ void JOIN_TAB::cleanup()
     table->reginfo.join_tab= 0;
   }
   end_read_record(&read_record);
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Estimate the time to get rows of the joined table
+*/
+
+double JOIN_TAB::scan_time()
+{
+  double res;
+  if (table->created)
+  {
+    if (table->is_filled_at_execution())
+    {
+      get_delayed_table_estimates(table, &records, &read_time,
+                                    &startup_cost);
+      found_records= records;
+      table->quick_condition_rows= records;
+    }
+    else
+    {
+      found_records= records= table->file->stats.records;
+      read_time= table->file->scan_time();
+      /*
+        table->quick_condition_rows has already been set to
+        table->file->stats.records
+      */
+    }
+    res= read_time;
+  }
+  else
+  {
+    found_records= records=table->file->stats.records;
+    read_time= found_records ? (double)found_records: 10.0;// TODO:fix this stub
+    res= read_time;
+  }
+  return res;
+}
+
+/**
+  Initialize the join_tab before reading.
+  Currently only derived table/view materialization is done here.
+
+  TODO: consider moving this together with join_tab_execution_startup
+*/
+bool JOIN_TAB::preread_init()
+{
+  TABLE_LIST *derived= table->pos_in_table_list;
+  if (!derived || !derived->is_materialized_derived())
+  {
+    preread_init_done= TRUE;
+    return FALSE;
+  }
+
+  /* Materialize derived table/view. */
+  if (!derived->get_unit()->executed &&
+      mysql_handle_single_derived(join->thd->lex,
+                                    derived, DT_CREATE | DT_FILL))
+      return TRUE;
+  preread_init_done= TRUE;
+  return FALSE;
 }
 
 
+
 /**
   Build a TABLE_REF structure for index lookup in the temporary table
 
   @param thd             Thread handle
   @param tmp_key         The temporary table key
   @param it              The iterator of items for lookup in the key
+  @param skip            Number of fields from the beginning to skip
 
   @details
   Build TABLE_REF object for lookup in the key 'tmp_key' using items
@@ -8003,9 +9766,11 @@ void JOIN_TAB::cleanup()
 bool TABLE_REF::tmp_table_index_lookup_init(THD *thd,
                                             KEY *tmp_key,
                                             Item_iterator &it,
-                                            bool value)
+                                            bool value,
+                                            uint skip)
 {
   uint tmp_key_parts= tmp_key->key_parts;
+  uint i;
   DBUG_ENTER("TABLE_REF::tmp_table_index_lookup_init");
 
   key= 0; /* The only temp table index. */
@@ -8026,7 +9791,8 @@ bool TABLE_REF::tmp_table_index_lookup_init(THD *thd,
   uchar *cur_ref_buff= key_buff;
 
   it.open();
-  for (uint i= 0; i < tmp_key_parts; i++, cur_key_part++, ref_key++)
+  for (i= 0; i < skip; i++) it.next();
+  for (i= 0; i < tmp_key_parts; i++, cur_key_part++, ref_key++)
   {
     Item *item= it.next();
     DBUG_ASSERT(item);
@@ -8102,7 +9868,7 @@ void JOIN::join_free()
     Optimization: if not EXPLAIN and we are done with the JOIN,
     free all tables.
   */
-  bool full= (!select_lex->uncacheable && !thd->lex->describe);
+  bool full= !(select_lex->uncacheable);
   bool can_unlock= full;
   DBUG_ENTER("JOIN::join_free");
 
@@ -8166,31 +9932,39 @@ void JOIN::join_free()
 void JOIN::cleanup(bool full)
 {
   DBUG_ENTER("JOIN::cleanup");
+  DBUG_PRINT("enter", ("full %u", (uint) full));
 
-  if (all_tables)
+  if (table)
   {
-    JOIN_TAB *tab,*end;
+    JOIN_TAB *tab;
     /*
       Only a sorted table may be cached.  This sorted table is always the
-      first non const table in join->all_tables
+      first non const table in join->table
     */
-    if (tables > const_tables) // Test for not-const tables
+    if (table_count > const_tables) // Test for not-const tables
     {
-      free_io_cache(all_tables[const_tables]);
-      filesort_free_buffers(all_tables[const_tables],full);
+      free_io_cache(table[const_tables]);
+      filesort_free_buffers(table[const_tables],full);
     }
 
     if (full)
     {
-      for (tab= join_tab, end= tab+tables; tab != end; tab++)
+      for (tab= first_linear_tab(this, WITH_CONST_TABLES); tab; 
+           tab= next_linear_tab(this, tab, WITH_BUSH_ROOTS))
 	tab->cleanup();
+      table= 0;
     }
     else
     {
-      for (tab= join_tab, end= tab+tables; tab != end; tab++)
+      for (tab= first_linear_tab(this, WITH_CONST_TABLES); tab; 
+           tab= next_linear_tab(this, tab, WITH_BUSH_ROOTS))
       {
 	if (tab->table)
+        {
+          DBUG_PRINT("info", ("close index: %s.%s", tab->table->s->db.str,
+                              tab->table->s->table_name.str));
           tab->table->file->ha_index_or_rnd_end();
+        }
       }
     }
   }
@@ -8252,6 +10026,8 @@ void JOIN::cleanup(bool full)
   SELECT * FROM t1,t2 WHERE t1.a=t2.a AND t1.b=t2.b ORDER BY t1.a,t2.c
   SELECT * FROM t1,t2 WHERE t1.a=t2.a ORDER BY t2.b,t1.a
   @endcode
+
+  TODO: this function checks ORDER::used, which can only have a value of 0.
 */
 
 static bool
@@ -8325,9 +10101,8 @@ only_eq_ref_tables(JOIN *join,ORDER *order,table_map tables)
 
 static void update_depend_map(JOIN *join)
 {
-  JOIN_TAB *join_tab=join->join_tab, *end=join_tab+join->tables;
-
-  for (; join_tab != end ; join_tab++)
+  for (JOIN_TAB *join_tab= first_linear_tab(join, WITH_CONST_TABLES); join_tab;
+       join_tab= next_linear_tab(join, join_tab, WITH_BUSH_ROOTS))
   {
     TABLE_REF *ref= &join_tab->ref;
     table_map depend_map=0;
@@ -8338,11 +10113,11 @@ static void update_depend_map(JOIN *join)
     ref->depend_map=depend_map & ~OUTER_REF_TABLE_BIT;
     depend_map&= ~OUTER_REF_TABLE_BIT;
     for (JOIN_TAB **tab=join->map2table;
-	 depend_map ;
-	 tab++,depend_map>>=1 )
+         depend_map ;
+         tab++,depend_map>>=1 )
     {
       if (depend_map & 1)
-	ref->depend_map|=(*tab)->ref.depend_map;
+        ref->depend_map|=(*tab)->ref.depend_map;
     }
   }
 }
@@ -8350,7 +10125,7 @@ static void update_depend_map(JOIN *join)
 
 /** Update the dependency map for the sort order. */
 
-static void update_depend_map(JOIN *join, ORDER *order)
+static void update_depend_map_for_order(JOIN *join, ORDER *order)
 {
   for (; order ; order=order->next)
   {
@@ -8397,21 +10172,30 @@ static ORDER *
 remove_const(JOIN *join,ORDER *first_order, COND *cond,
              bool change_list, bool *simple_order)
 {
-  if (join->tables == join->const_tables)
+  if (join->table_count == join->const_tables)
     return change_list ? 0 : first_order;		// No need to sort
 
   ORDER *order,**prev_ptr;
-  table_map first_table= join->join_tab[join->const_tables].table->map;
+  table_map first_table;
   table_map not_const_tables= ~join->const_table_map;
   table_map ref;
+  bool first_is_base_table= FALSE;
   DBUG_ENTER("remove_const");
+  
+  LINT_INIT(first_table); /* protected by first_is_base_table */
+  if (join->join_tab[join->const_tables].table)
+  {
+    first_table= join->join_tab[join->const_tables].table->map;
+    first_is_base_table= TRUE;
+  }
+  
 
   prev_ptr= &first_order;
   *simple_order= *join->join_tab[join->const_tables].on_expr_ref ? 0 : 1;
 
   /* NOTE: A variable of not_const_tables ^ first_table; breaks gcc 2.7 */
 
-  update_depend_map(join, first_order);
+  update_depend_map_for_order(join, first_order);
   for (order=first_order; order ; order=order->next)
   {
     table_map order_tables=order->item[0]->used_tables();
@@ -8426,16 +10210,21 @@ remove_const(JOIN *join,ORDER *first_order, COND *cond,
           table for all queries containing more than one table, ROLLUP, and an
           outer join.
          */
-        (join->tables > 1 && join->rollup.state == ROLLUP::STATE_INITED &&
+        (join->table_count > 1 && join->rollup.state == ROLLUP::STATE_INITED &&
         join->outer_join))
       *simple_order=0;				// Must do a temp table to sort
     else if (!(order_tables & not_const_tables))
     {
-      if (order->item[0]->with_subselect && 
-          !(join->select_lex->options & SELECT_DESCRIBE))
-        order->item[0]->val_str(&order->item[0]->str_value);
+      if (order->item[0]->with_subselect)
+      {
+        /*
+          Delay the evaluation of constant ORDER and/or GROUP expressions that
+          contain subqueries until the execution phase.
+        */
+        join->exec_const_order_group_cond.push_back(order->item[0]);
+      }
       DBUG_PRINT("info",("removing: %s", order->item[0]->full_name()));
-      continue;					// skip const item
+      continue;
     }
     else
     {
@@ -8448,7 +10237,7 @@ remove_const(JOIN *join,ORDER *first_order, COND *cond,
 	  DBUG_PRINT("info",("removing: %s", order->item[0]->full_name()));
 	  continue;
 	}
-	if ((ref=order_tables & (not_const_tables ^ first_table)))
+	if (first_is_base_table && (ref=order_tables & (not_const_tables ^ first_table)))
 	{
 	  if (!(order_tables & first_table) &&
               only_eq_ref_tables(join,first_order, ref))
@@ -8514,7 +10303,7 @@ ORDER *simple_remove_const(ORDER *order, COND *where)
 
 
 static int
-return_zero_rows(JOIN *join, select_result *result,TABLE_LIST *tables,
+return_zero_rows(JOIN *join, select_result *result, List<TABLE_LIST> &tables,
 		 List<Item> &fields, bool send_row, ulonglong select_options,
 		 const char *info, Item *having)
 {
@@ -8530,9 +10319,13 @@ return_zero_rows(JOIN *join, select_result *result,TABLE_LIST *tables,
 
   if (send_row)
   {
-    for (TABLE_LIST *table= tables; table; table= table->next_leaf)
+    List_iterator<TABLE_LIST> ti(tables);
+    TABLE_LIST *table;
+    while ((table= ti++))
       mark_as_null_row(table->table);		// All fields are NULL
-    if (having && having->val_int() == 0)
+    if (having &&
+        !having->walk(&Item::clear_sum_processor, FALSE, NULL) &&
+        having->val_int() == 0)
       send_row=0;
   }
   if (!(result->send_result_set_metadata(fields,
@@ -8545,7 +10338,7 @@ return_zero_rows(JOIN *join, select_result *result,TABLE_LIST *tables,
       Item *item;
       while ((item= it++))
 	item->no_rows_in_result();
-      send_error= result->send_data(fields);
+      send_error= result->send_data(fields) > 0;
     }
     if (!send_error)
       result->send_eof();				// Should be safe
@@ -8564,8 +10357,11 @@ static void clear_tables(JOIN *join)
     must clear only the non-const tables, as const tables
     are not re-calculated.
   */
-  for (uint i=join->const_tables ; i < join->tables ; i++)
-    mark_as_null_row(join->all_tables[i]);		// All fields are NULL
+  for (uint i= 0 ; i < join->table_count ; i++)
+  {
+    if (!(join->table[i]->map & join->const_table_map))
+      mark_as_null_row(join->table[i]);		// All fields are NULL
+  }
 }
 
 /*****************************************************************************
@@ -8727,24 +10523,26 @@ finish:
 static bool check_simple_equality(Item *left_item, Item *right_item,
                                   Item *item, COND_EQUAL *cond_equal)
 {
+  Item *orig_left_item= left_item;
+  Item *orig_right_item= right_item;
   if (left_item->type() == Item::REF_ITEM &&
       ((Item_ref*)left_item)->ref_type() == Item_ref::VIEW_REF)
   {
-    if (((Item_ref*)left_item)->depended_from)
+    if (((Item_ref*)left_item)->get_depended_from())
       return FALSE;
     left_item= left_item->real_item();
   }
   if (right_item->type() == Item::REF_ITEM &&
       ((Item_ref*)right_item)->ref_type() == Item_ref::VIEW_REF)
   {
-    if (((Item_ref*)right_item)->depended_from)
+    if (((Item_ref*)right_item)->get_depended_from())
       return FALSE;
     right_item= right_item->real_item();
   }
   if (left_item->type() == Item::FIELD_ITEM &&
       right_item->type() == Item::FIELD_ITEM &&
-      !((Item_field*)left_item)->depended_from &&
-      !((Item_field*)right_item)->depended_from)
+      !((Item_field*)left_item)->get_depended_from() &&
+      !((Item_field*)right_item)->get_depended_from())
   {
     /* The predicate the form field1=field2 is processed */
 
@@ -8793,7 +10591,7 @@ static bool check_simple_equality(Item *left_item, Item *right_item,
     { 
       /* left item was found in the current or one of the upper levels */
       if (! right_item_equal)
-        left_item_equal->add((Item_field *) right_item);
+        left_item_equal->add(orig_right_item);
       else
       {
         /* Merge two multiple equalities forming a new one */
@@ -8808,12 +10606,13 @@ static bool check_simple_equality(Item *left_item, Item *right_item,
     { 
       /* left item was not found neither the current nor in upper levels  */
       if (right_item_equal)
-        right_item_equal->add((Item_field *) left_item);
+        right_item_equal->add(orig_left_item);
       else 
       {
         /* None of the fields was found in multiple equalities */
-        Item_equal *item_equal= new Item_equal((Item_field *) left_item,
-                                               (Item_field *) right_item);
+        Item_equal *item_equal= new Item_equal(orig_left_item,
+                                               orig_right_item,
+                                               FALSE);
         cond_equal->current_level.push_back(item_equal);
       }
     }
@@ -8824,18 +10623,21 @@ static bool check_simple_equality(Item *left_item, Item *right_item,
     /* The predicate of the form field=const/const=field is processed */
     Item *const_item= 0;
     Item_field *field_item= 0;
+    Item *orig_field_item= 0;
     if (left_item->type() == Item::FIELD_ITEM &&
-        !((Item_field*)left_item)->depended_from &&
-        right_item->const_item())
+        !((Item_field*)left_item)->get_depended_from() &&
+        right_item->const_item() && !right_item->is_expensive())
     {
-      field_item= (Item_field*) left_item;
+      orig_field_item= orig_left_item;
+      field_item= (Item_field *) left_item;
       const_item= right_item;
     }
     else if (right_item->type() == Item::FIELD_ITEM &&
-             !((Item_field*)right_item)->depended_from &&
-             left_item->const_item())
+             !((Item_field*)right_item)->get_depended_from() &&
+             left_item->const_item() && !left_item->is_expensive())
     {
-      field_item= (Item_field*) right_item;
+      orig_field_item= orig_right_item;
+      field_item= (Item_field *) right_item;
       const_item= left_item;
     }
 
@@ -8844,13 +10646,13 @@ static bool check_simple_equality(Item *left_item, Item *right_item,
     {
       bool copyfl;
 
-      if (field_item->result_type() == STRING_RESULT)
+      if (field_item->cmp_type() == STRING_RESULT)
       {
         CHARSET_INFO *cs= ((Field_str*) field_item->field)->charset();
         if (!item)
         {
           Item_func_eq *eq_item;
-          if ((eq_item= new Item_func_eq(left_item, right_item)))
+          if ((eq_item= new Item_func_eq(orig_left_item, orig_right_item)))
             return FALSE;
           eq_item->set_cmp_func();
           eq_item->quick_fix_field();
@@ -8875,11 +10677,11 @@ static bool check_simple_equality(Item *left_item, Item *right_item,
           already contains a constant and its value is  not equal to
           the value of const_item.
         */
-        item_equal->add(const_item, field_item);
+        item_equal->add_const(const_item, orig_field_item);
       }
       else
       {
-        item_equal= new Item_equal(const_item, field_item);
+        item_equal= new Item_equal(const_item, orig_field_item, TRUE);
         cond_equal->current_level.push_back(item_equal);
       }
       return TRUE;
@@ -9121,10 +10923,10 @@ static COND *build_equal_items_for_cond(THD *thd, COND *cond,
       List_iterator_fast<Item_equal> it(cond_equal.current_level);
       while ((item_equal= it++))
       {
-        item_equal->fix_length_and_dec();
+        item_equal->fix_fields(thd, NULL);
         item_equal->update_used_tables();
         set_if_bigger(thd->lex->current_select->max_equal_elems,
-                      item_equal->members());  
+                      item_equal->n_field_items());  
       }
 
       ((Item_cond_and*)cond)->cond_equal= cond_equal;
@@ -9155,7 +10957,8 @@ static COND *build_equal_items_for_cond(THD *thd, COND *cond,
       args->concat((List<Item> *)&cond_equal.current_level);
     }
   }
-  else if (cond->type() == Item::FUNC_ITEM)
+  else if (cond->type() == Item::FUNC_ITEM ||
+           cond->real_item()->type() == Item::FIELD_ITEM)
   {
     List<Item> eq_list;
     /*
@@ -9177,10 +10980,10 @@ static COND *build_equal_items_for_cond(THD *thd, COND *cond,
       {
         if ((item_equal= cond_equal.current_level.pop()))
         {
-          item_equal->fix_length_and_dec();
+          item_equal->fix_fields(thd, NULL);
           item_equal->update_used_tables();
           set_if_bigger(thd->lex->current_select->max_equal_elems,
-                        item_equal->members());  
+                        item_equal->n_field_items());  
           return item_equal;
 	}
 
@@ -9201,7 +11004,7 @@ static COND *build_equal_items_for_cond(THD *thd, COND *cond,
           item_equal->fix_length_and_dec();
           item_equal->update_used_tables();
           set_if_bigger(thd->lex->current_select->max_equal_elems,
-                        item_equal->members());  
+                        item_equal->n_field_items());  
         }
         and_cond->cond_equal= cond_equal;
         args->concat((List<Item> *)&cond_equal.current_level);
@@ -9215,7 +11018,7 @@ static COND *build_equal_items_for_cond(THD *thd, COND *cond,
       as soon the field is not of a string type or the field reference is
       an argument of a comparison predicate. 
     */ 
-    uchar *is_subst_valid= (uchar *) 1;
+    uchar* is_subst_valid= (uchar *) Item::ANY_SUBST;
     cond= cond->compile(&Item::subst_argument_checker,
                         &is_subst_valid, 
                         &Item::equal_fields_propagator,
@@ -9349,10 +11152,14 @@ static COND *build_equal_items(THD *thd, COND *cond, COND_EQUAL *inherited,
 /**
   Compare field items by table order in the execution plan.
 
+    If field1 and field2 belong to different tables then
     field1 considered as better than field2 if the table containing
     field1 is accessed earlier than the table containing field2.   
     The function finds out what of two fields is better according
     this criteria.
+    If field1 and field2 belong to the same table then the result
+    of comparison depends on whether the fields are parts of
+    the key that are used to access this table.  
 
   @param field1          first field item to compare
   @param field2          second field item to compare
@@ -9366,18 +11173,24 @@ static COND *build_equal_items(THD *thd, COND *cond, COND_EQUAL *inherited,
     0  otherwise
 */
 
-static int compare_fields_by_table_order(Item_field *field1,
-                                  Item_field *field2,
-                                  void *table_join_idx)
+static int compare_fields_by_table_order(Item *field1,
+                                         Item *field2,
+                                         void *table_join_idx)
 {
   int cmp= 0;
   bool outer_ref= 0;
-  if (field2->used_tables() & OUTER_REF_TABLE_BIT)
+  Item_field *f1= (Item_field *) (field1->real_item());
+  Item_field *f2= (Item_field *) (field2->real_item());
+  if (f1->const_item())
+    return 1;
+  if (f2->const_item())
+    return -1;
+  if (f2->used_tables() & OUTER_REF_TABLE_BIT)
   {  
     outer_ref= 1;
     cmp= -1;
   }
-  if (field2->used_tables() & OUTER_REF_TABLE_BIT)
+  if (f1->used_tables() & OUTER_REF_TABLE_BIT)
   {
     outer_ref= 1;
     cmp++;
@@ -9385,13 +11198,68 @@ static int compare_fields_by_table_order(Item_field *field1,
   if (outer_ref)
     return cmp;
   JOIN_TAB **idx= (JOIN_TAB **) table_join_idx;
-  cmp= idx[field2->field->table->tablenr]-idx[field1->field->table->tablenr];
+  
+  JOIN_TAB *tab1= idx[f1->field->table->tablenr];
+  JOIN_TAB *tab2= idx[f2->field->table->tablenr];
+  
+  /* 
+    if one of the tables is inside a merged SJM nest and the other one isn't,
+    compare SJM bush roots of the tables.
+  */
+  if (tab1->bush_root_tab != tab2->bush_root_tab)
+  {
+    if (tab1->bush_root_tab)
+      tab1= tab1->bush_root_tab;
+
+    if (tab2->bush_root_tab)
+      tab2= tab2->bush_root_tab;
+  }
+  
+  cmp= tab2 - tab1;
+
+  if (!cmp)
+  {
+    JOIN_TAB *tab= idx[f1->field->table->tablenr];
+    uint keyno= MAX_KEY;
+    if (tab->ref.key_parts)
+      keyno= tab->ref.key;
+    else if (tab->select && tab->select->quick)
+       keyno = tab->select->quick->index;
+    if (keyno != MAX_KEY)
+    {
+      if (f2->field->part_of_key.is_set(keyno))
+        cmp= -1;
+      if (f1->field->part_of_key.is_set(keyno))
+        cmp++;
+      if (!cmp)
+      {
+        KEY *key_info= tab->table->key_info + keyno;
+        for (uint i= 0; i < key_info->key_parts; i++)
+	{
+          Field *fld= key_info->key_part[i].field;
+          if (fld->eq(f2->field))
+	  {
+	    cmp= -1;
+            break;
+          }
+          if (fld->eq(f1->field))
+	  {
+	    cmp= 1;
+            break;
+          }
+        }
+      }              
+    }              
+    else   
+      cmp= f2->field->field_index-f1->field->field_index;
+  }
   return cmp < 0 ? -1 : (cmp ? 1 : 0);
 }
 
 
-static TABLE_LIST* embedding_sjm(Item_field *item_field)
+static TABLE_LIST* embedding_sjm(Item *item)
 {
+  Item_field *item_field= (Item_field *) (item->real_item());
   TABLE_LIST *nest= item_field->field->table->pos_in_table_list->embedding;
   if (nest && nest->sj_mat_info && nest->sj_mat_info->is_used)
     return nest;
@@ -9464,7 +11332,7 @@ Item *eliminate_item_equal(COND *cond, COND_EQUAL *upper_levels,
   if (((Item *) item_equal)->const_item() && !item_equal->val_int())
     return new Item_int((longlong) 0,1); 
   Item *item_const= item_equal->get_const();
-  Item_equal_iterator it(*item_equal);
+  Item_equal_fields_iterator it(*item_equal);
   Item *head;
   DBUG_ASSERT(!cond || cond->type() == Item::COND_ITEM);
 
@@ -9473,34 +11341,40 @@ Item *eliminate_item_equal(COND *cond, COND_EQUAL *upper_levels,
 
   /* 
     Pick the "head" item: the constant one or the first in the join order
-    that's not inside some SJM nest.
+    (if the first in the join order happens to be inside an SJM nest, that's
+    ok, because this is where the value will be unpacked after
+    materialization).
   */
   if (item_const)
     head= item_const;
   else
   {
     TABLE_LIST *emb_nest;
-    Item_field *item_field;
-    head= item_field= item_equal->get_first(NULL);
+    head= item_equal->get_first(NULL);
     it++;
-    if ((emb_nest= embedding_sjm(item_field)))
+    if ((emb_nest= embedding_sjm(head)))
     {
       current_sjm= emb_nest;
       current_sjm_head= head;
     }
   }
 
-  Item_field *item_field;
+  Item *field_item;
   /*
     For each other item, generate "item=head" equality (except the tables that 
     are within SJ-Materialization nests, for those "head" is defined
     differently)
   */
-  while ((item_field= it++))
+  while ((field_item= it++))
   {
-    Item_equal *upper= item_field->find_item_equal(upper_levels);
-    Item_field *item= item_field;
-    TABLE_LIST *field_sjm= embedding_sjm(item_field);
+    Item_equal *upper= field_item->find_item_equal(upper_levels);
+    Item *item= field_item;
+    TABLE_LIST *field_sjm= embedding_sjm(field_item);
+    if (!field_sjm)
+    { 
+      current_sjm= NULL;
+      current_sjm_head= NULL;
+    }      
 
     /* 
       Check if "item_field=head" equality is already guaranteed to be true 
@@ -9512,8 +11386,8 @@ Item *eliminate_item_equal(COND *cond, COND_EQUAL *upper_levels,
         item= 0;
       else
       {
-        Item_equal_iterator li(*item_equal);
-        while ((item= li++) != item_field)
+        Item_equal_fields_iterator li(*item_equal);
+        while ((item= li++) != field_item)
         {
           if (item->find_item_equal(upper_levels) == upper)
             break;
@@ -9521,11 +11395,11 @@ Item *eliminate_item_equal(COND *cond, COND_EQUAL *upper_levels,
       }
     }
     
-    bool produce_equality= test(item == item_field);
+    bool produce_equality= test(item == field_item);
     if (!item_const && field_sjm && field_sjm != current_sjm)
     {
       /* Entering an SJM nest */
-      current_sjm_head= item_field;
+      current_sjm_head= field_item;
       if (!field_sjm->sj_mat_info->is_sj_scan)
         produce_equality= FALSE;
     }
@@ -9535,7 +11409,20 @@ Item *eliminate_item_equal(COND *cond, COND_EQUAL *upper_levels,
       if (eq_item)
         eq_list.push_back(eq_item);
       
-      eq_item= new Item_func_eq(item_field, current_sjm? current_sjm_head: head);
+      /*
+        If we're inside an SJM-nest (current_sjm!=NULL), and the multi-equality
+        doesn't include a constant, we should produce equality with the first
+        of the equals in this SJM.
+
+        In other cases, get the "head" item, which is either first of the
+        equals on top level, or the constant.
+      */
+      Item *head_item= (!item_const && current_sjm)? current_sjm_head: head;
+      Item *head_real_item=  head_item->real_item();
+      if (head_real_item->type() == Item::FIELD_ITEM)
+        head_item= head_real_item;
+      
+      eq_item= new Item_func_eq(field_item->real_item(), head_item);
 
       if (!eq_item)
         return 0;
@@ -9669,8 +11556,18 @@ static COND* substitute_for_best_equal_field(COND *cond,
     cond= eliminate_item_equal(0, cond_equal, item_equal);
     return cond ? cond : org_cond;
   }
-  else
-    cond->transform(&Item::replace_equal_field, 0);
+  else 
+  {
+    while (cond_equal)
+    {
+      List_iterator_fast<Item_equal> it(cond_equal->current_level);
+      while((item_equal= it++))
+      {
+        cond= cond->transform(&Item::replace_equal_field, (uchar *) item_equal);
+      }
+      cond_equal= cond_equal->upper_levels;
+    }
+  }
   return cond;
 }
 
@@ -9710,11 +11607,10 @@ static void update_const_equal_items(COND *cond, JOIN_TAB *tab)
     if (!contained_const && item_equal->get_const())
     {
       /* Update keys for range analysis */
-      Item_equal_iterator it(*item_equal);
-      Item_field *item_field;
-      while ((item_field= it++))
+      Item_equal_fields_iterator it(*item_equal);
+      while (it++)
       {
-        Field *field= item_field->field;
+        Field *field= it.get_curr_field();
         JOIN_TAB *stat= field->table->reginfo.join_tab;
         key_map possible_keys= field->key_start;
         possible_keys.intersect(field->table->keys_in_use_for_query);
@@ -9730,7 +11626,7 @@ static void update_const_equal_items(COND *cond, JOIN_TAB *tab)
           TABLE *tab= field->table;
           KEYUSE *use;
           for (use= stat->keyuse; use && use->table == tab; use++)
-            if (possible_keys.is_set(use->key) && 
+            if (!use->is_for_hash_join() && possible_keys.is_set(use->key) && 
                 tab->key_info[use->key].key_part[use->keypart].field ==
                 field)
               tab->const_key_parts[use->key]|= use->keypart_map;
@@ -9859,10 +11755,10 @@ propagate_cond_constants(THD *thd, I_List<COND_CMP> *save_list,
     {
       Item_func_eq *func=(Item_func_eq*) cond;
       Item **args= func->arguments();
-      bool left_const= args[0]->const_item();
-      bool right_const= args[1]->const_item();
+      bool left_const= args[0]->const_item() && !args[0]->is_expensive();
+      bool right_const= args[1]->const_item() && !args[1]->is_expensive();
       if (!(left_const && right_const) &&
-          args[0]->result_type() == args[1]->result_type())
+          args[0]->cmp_type() == args[1]->cmp_type())
       {
 	if (right_const)
 	{
@@ -10082,6 +11978,8 @@ simplify_joins(JOIN *join, List<TABLE_LIST> *join_list, COND *conds, bool top,
         For some of the inner tables there are conjunctive predicates
         that reject nulls => the outer join can be replaced by an inner join.
       */
+      if (table->outer_join && !table->embedding && table->table)
+        table->table->maybe_null= FALSE;
       table->outer_join= 0;
       if (table->on_expr)
       {
@@ -10171,17 +12069,33 @@ simplify_joins(JOIN *join, List<TABLE_LIST> *join_list, COND *conds, bool top,
          leave it intact (otherwise it is flattened)
        */
       join->select_lex->sj_nests.push_back(table);
+
+      /* 
+        Also, walk through semi-join children and mark those that are now
+        top-level
+      */
+      TABLE_LIST *tbl;
+      List_iterator<TABLE_LIST> it(nested_join->join_list);
+      while ((tbl= it++))
+      {
+        if (!tbl->on_expr && tbl->table)
+          tbl->table->maybe_null= FALSE;
+      }
     }
     else if (nested_join && !table->on_expr)
     {
       TABLE_LIST *tbl;
       List_iterator<TABLE_LIST> it(nested_join->join_list);
+      List<TABLE_LIST> repl_list;  
       while ((tbl= it++))
       {
         tbl->embedding= table->embedding;
+        if (!tbl->embedding && !tbl->on_expr && tbl->table)
+          tbl->table->maybe_null= FALSE;
         tbl->join_list= table->join_list;
+        repl_list.push_back(tbl);
       }
-      li.replace(nested_join->join_list);
+      li.replace(repl_list);
       /* Need to update the name resolution table chain when flattening joins */
       fix_name_res= TRUE;
       table= *li.ref();
@@ -10277,8 +12191,8 @@ static uint reset_nj_counters(JOIN *join, List<TABLE_LIST> *join_list)
       if (!nested_join->n_tables)
         is_eliminated_nest= TRUE;
     }
-    if ((!table->table && !is_eliminated_nest) || 
-        (table->table && (table->table->map & ~join->eliminated_tables)))
+    if ((table->nested_join && !is_eliminated_nest) || 
+        (!table->nested_join && (table->table->map & ~join->eliminated_tables)))
       n++;
   }
   DBUG_RETURN(n);
@@ -10517,8 +12431,6 @@ static void restore_prev_nj_state(JOIN_TAB *last)
                               table
       reopt_rec_count     OUT New output record count
       reopt_cost          OUT New join prefix cost
-      sj_inner_fanout     OUT Fanout in the [first_tab; last_tab] range that
-                              is produced by semi-join-inner tables.
 
   DESCRIPTION
     Given a join prefix [0; ... first_tab], change the access to the tables
@@ -10535,10 +12447,9 @@ static void restore_prev_nj_state(JOIN_TAB *last)
 void optimize_wo_join_buffering(JOIN *join, uint first_tab, uint last_tab, 
                                 table_map last_remaining_tables, 
                                 bool first_alt, uint no_jbuf_before,
-                                double *reopt_rec_count, double *reopt_cost,
-                                double *sj_inner_fanout)
+                                double *outer_rec_count, double *reopt_cost)
 {
-  double cost, rec_count, inner_fanout= 1.0;
+  double cost, rec_count;
   table_map reopt_remaining_tables= last_remaining_tables;
   uint i;
 
@@ -10553,8 +12464,23 @@ void optimize_wo_join_buffering(JOIN *join, uint first_tab, uint last_tab,
     rec_count= 1;
   }
 
+  *outer_rec_count= rec_count;
   for (i= first_tab; i <= last_tab; i++)
     reopt_remaining_tables |= join->positions[i].table->table->map;
+  
+  /*
+    best_access_path() optimization depends on the value of 
+    join->cur_sj_inner_tables. Our goal in this function is to do a
+    re-optimization with disabled join buffering, but no other changes.
+    In order to achieve this, cur_sj_inner_tables needs have the same 
+    value it had during the original invocations of best_access_path. 
+
+    We know that this function, optimize_wo_join_buffering() is called to
+    re-optimize a semi-join join order range, which allows us to conclude that
+    the "original" value of cur_sj_inner_tables was 0.
+  */
+  table_map save_cur_sj_inner_tables= join->cur_sj_inner_tables;
+  join->cur_sj_inner_tables= 0;
 
   for (i= first_tab; i <= last_tab; i++)
   {
@@ -10565,7 +12491,7 @@ void optimize_wo_join_buffering(JOIN *join, uint first_tab, uint last_tab,
     {
       /* Find the best access method that would not use join buffering */
       best_access_path(join, rs, reopt_remaining_tables, i, 
-                       test(i < no_jbuf_before), rec_count,
+                       TRUE, rec_count,
                        &pos, &loose_scan_pos);
     }
     else 
@@ -10578,13 +12504,12 @@ void optimize_wo_join_buffering(JOIN *join, uint first_tab, uint last_tab,
     rec_count *= pos.records_read;
     cost += pos.read_time;
 
-    if (rs->emb_sj_nest)
-      inner_fanout *= pos.records_read;
+    if (!rs->emb_sj_nest)
+      *outer_rec_count *= pos.records_read;
   }
+  join->cur_sj_inner_tables= save_cur_sj_inner_tables;
 
-  *reopt_rec_count= rec_count;
   *reopt_cost= cost;
-  *sj_inner_fanout= inner_fanout;
 }
 
 
@@ -10630,7 +12555,7 @@ optimize_cond(JOIN *join, COND *conds, List<TABLE_LIST> *join_list,
 
 
 /**
-  Handles the reqursive job for remove_eq_conds()
+  Handles the recursive job for remove_eq_conds()
 
   Remove const and eq items. Return new item, or NULL if no condition
   cond_value is set to according:
@@ -10638,8 +12563,8 @@ optimize_cond(JOIN *join, COND *conds, List<TABLE_LIST> *join_list,
   COND_TRUE  always true	( 1 = 1 )
   COND_FALSE always false	( 1 = 2 )
 
-  SYNPOSIS
-    remove_eq_conds()
+  SYNOPSIS
+    internal_remove_eq_conds()
     thd 			THD environment
     cond                        the condition to handle
     cond_value                  the resulting value of the condition
@@ -10756,23 +12681,13 @@ internal_remove_eq_conds(THD *thd, COND *cond, Item::cond_result *cond_value)
         cond->fix_fields(thd, &cond);
       }
     }
-    if (cond->const_item())
+    if (cond->const_item() && !cond->is_expensive())
     {
       *cond_value= eval_const_cond(cond) ? Item::COND_TRUE : Item::COND_FALSE;
       return (COND*) 0;
     }
   }
   else if (cond->const_item() && !cond->is_expensive())
-  /*
-    DontEvaluateMaterializedSubqueryTooEarly:
-    TODO: 
-    Excluding all expensive functions is too restritive we should exclude only
-    materialized IN subquery predicates because they can't yet be evaluated
-    here (they need additional initialization that is done later on).
-
-    The proper way to exclude the subqueries would be to walk the cond tree and
-    check for materialized subqueries there.
-  */
   {
     *cond_value= eval_const_cond(cond) ? Item::COND_TRUE : Item::COND_FALSE;
     return (COND*) 0;
@@ -10792,7 +12707,6 @@ internal_remove_eq_conds(THD *thd, COND *cond, Item::cond_result *cond_value)
   return cond;					// Point at next and level
 }
 
-
 /**
   Remove const and eq items. Return new item, or NULL if no condition
   cond_value is set to according:
@@ -10874,37 +12788,33 @@ remove_eq_conds(THD *thd, COND *cond, Item::cond_result *cond_value)
 /* 
   Check if equality can be used in removing components of GROUP BY/DISTINCT
   
-  SYNOPSIS
-    test_if_equality_guarantees_uniqueness()
-      l          the left comparison argument (a field if any)
-      r          the right comparison argument (a const of any)
-  
-  DESCRIPTION    
-    Checks if an equality predicate can be used to take away 
-    DISTINCT/GROUP BY because it is known to be true for exactly one 
-    distinct value (e.g. <expr> == <const>).
-    Arguments must be of the same type because e.g. 
-    <string_field> = <int_const> may match more than 1 distinct value from 
-    the column. 
-    We must take into consideration and the optimization done for various 
-    string constants when compared to dates etc (see Item_int_with_ref) as
-    well as the collation of the arguments.
+  @param    l          the left comparison argument (a field if any)
+  @param    r          the right comparison argument (a const of any)
   
-  RETURN VALUE  
-    TRUE    can be used
-    FALSE   cannot be used
+  @details
+  Checks if an equality predicate can be used to take away 
+  DISTINCT/GROUP BY because it is known to be true for exactly one 
+  distinct value (e.g. <expr> == <const>).
+  Arguments must be compared in the native type of the left argument
+  and (for strings) in the native collation of the left argument.
+  Otherwise, for example,
+  <string_field> = <int_const> may match more than 1 distinct value of
+  the <string_field>.
+
+  @note We don't need to aggregate l and r collations here, because r -
+  the constant item - has already been converted to a proper collation
+  for comparison. We only need to compare this collation with field's collation.
+
+  @retval true    can be used
+  @retval false   cannot be used
 */
 static bool
 test_if_equality_guarantees_uniqueness(Item *l, Item *r)
 {
   return r->const_item() &&
-    /* elements must be compared as dates */
-     (Arg_comparator::can_compare_as_dates(l, r, 0) ||
-      /* or of the same result type */
-      (r->result_type() == l->result_type() &&
-       /* and must have the same collation if compared as strings */
-       (l->result_type() != STRING_RESULT ||
-        l->collation.collation == r->collation.collation)));
+    item_cmp_type(l->cmp_type(), r->cmp_type()) == l->cmp_type() &&
+    (l->cmp_type() != STRING_RESULT ||
+     l->collation.collation == r->collation.collation);
 }
 
 
@@ -11125,15 +13035,12 @@ static Field *create_tmp_field_from_item(THD *thd, Item *item, TABLE *table,
   case STRING_RESULT:
     DBUG_ASSERT(item->collation.collation);
   
-    enum enum_field_types type;
     /*
       DATE/TIME and GEOMETRY fields have STRING_RESULT result type. 
       To preserve type they needed to be handled separately.
     */
-    if ((type= item->field_type()) == MYSQL_TYPE_DATETIME ||
-        type == MYSQL_TYPE_TIME || type == MYSQL_TYPE_DATE ||
-        type == MYSQL_TYPE_NEWDATE ||
-        type == MYSQL_TYPE_TIMESTAMP || type == MYSQL_TYPE_GEOMETRY)
+    if (item->cmp_type() == TIME_RESULT ||
+        item->field_type() == MYSQL_TYPE_GEOMETRY)
       new_field= item->tmp_table_field_from_field_type(table, 1);
     /* 
       Make sure that the blob fits into a Field_varstring which has 
@@ -11274,13 +13181,30 @@ Field *create_tmp_field(THD *thd, TABLE *table,Item *item, Item::Type type,
       If item have to be able to store NULLs but underlaid field can't do it,
       create_tmp_field_from_field() can't be used for tmp field creation.
     */
-    if (field->maybe_null && !field->field->maybe_null())
+    if (((field->maybe_null && field->in_rollup) ||      
+	(thd->create_tmp_table_for_derived  &&    /* for mat. view/dt */
+	 orig_item && orig_item->maybe_null)) &&         
+        !field->field->maybe_null())
     {
+      bool save_maybe_null= FALSE;
+      /*
+        The item the ref points to may have maybe_null flag set while
+        the ref doesn't have it. This may happen for outer fields
+        when the outer query decided at some point after name resolution phase
+        that this field might be null. Take this into account here.
+      */
+      if (orig_item)
+      {
+        save_maybe_null= item->maybe_null;
+        item->maybe_null= orig_item->maybe_null;
+      }
       result= create_tmp_field_from_item(thd, item, table, NULL,
                                          modify_item, convert_blob_length);
       *from_field= field->field;
       if (result && modify_item)
         field->result_field= result;
+      if (orig_item)
+        item->maybe_null= save_maybe_null;
     } 
     else if (table_cant_handle_bit_fields && field->field->type() ==
              MYSQL_TYPE_BIT)
@@ -11303,7 +13227,7 @@ Field *create_tmp_field(THD *thd, TABLE *table,Item *item, Item::Type type,
       ((Item_ref*)orig_item)->set_result_field(result);
     /*
       Fields that are used as arguments to the DEFAULT() function already have
-      their data pointers set to the default value during name resulotion. See
+      their data pointers set to the default value during name resolution. See
       Item_default_value::fix_fields.
     */
     if (orig_type != Item::DEFAULT_VALUE_ITEM && field->field->eq_def(result))
@@ -11393,7 +13317,9 @@ void setup_tmp_table_column_bitmaps(TABLE *table, uchar *bitmaps)
   bitmap_init(&table->tmp_set,
               (my_bitmap_map*) (bitmaps+ 2*bitmap_buffer_size(field_count)),
               field_count, FALSE);
-
+  bitmap_init(&table->eq_join_set,
+              (my_bitmap_map*) (bitmaps+ 3*bitmap_buffer_size(field_count)),
+              field_count, FALSE);
   /* write_set and all_set are copies of read_set */
   table->def_write_set= table->def_read_set;
   table->s->all_set= table->def_read_set;
@@ -11429,10 +13355,10 @@ void setup_tmp_table_column_bitmaps(TABLE *table, uchar *bitmaps)
 */
 
 TABLE *
-create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
+create_tmp_table(THD *thd, TMP_TABLE_PARAM *param, List<Item> &fields,
 		 ORDER *group, bool distinct, bool save_sum_fields,
 		 ulonglong select_options, ha_rows rows_limit,
-		 const char *table_alias)
+                 const char *table_alias, bool do_not_open)
 {
   MEM_ROOT *mem_root_save, own_root;
   TABLE *table;
@@ -11447,7 +13373,7 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
   bool  using_unique_constraint= 0;
   bool  use_packed_rows= 0;
   bool  not_all_columns= !(select_options & TMP_TABLE_ALL_COLUMNS);
-  char  *tmpname,path[FN_REFLEN], tmp_table_name[50];
+  char  *tmpname,path[FN_REFLEN];
   uchar	*pos, *group_buff, *bitmaps;
   uchar *null_flags;
   Field **reg_field, **from_field, **default_field;
@@ -11478,12 +13404,12 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
     temp_pool_slot = bitmap_lock_set_next(&temp_pool);
 
   if (temp_pool_slot != MY_BIT_NONE) // we got a slot
-    sprintf(tmp_table_name, "%s_%lx_%i", tmp_file_prefix,
+    sprintf(path, "%s_%lx_%i", tmp_file_prefix,
             current_pid, temp_pool_slot);
   else
   {
     /* if we run out of slots or we are not using tempool */
-    sprintf(tmp_table_name, "%s%lx_%lx_%x", tmp_file_prefix,current_pid,
+    sprintf(path, "%s%lx_%lx_%x", tmp_file_prefix,current_pid,
             thd->thread_id, thd->tmp_table++);
   }
 
@@ -11491,7 +13417,7 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
     No need to change table name to lower case as we are only creating
     MyISAM, Aria or HEAP tables here
   */
-  fn_format(path, tmp_table_name, mysql_tmpdir, "",
+  fn_format(path, path, mysql_tmpdir, "",
             MY_REPLACE_EXT|MY_UNPACK_FILENAME);
 
   if (group)
@@ -11544,10 +13470,10 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
                         sizeof(*key_part_info)*(param->group_parts+1),
                         &param->start_recinfo,
                         sizeof(*param->recinfo)*(field_count*2+4),
-                        &tmpname, (uint) strlen(tmp_table_name)+1,
+                        &tmpname, (uint) strlen(path)+1,
                         &group_buff, (group && ! using_unique_constraint ?
                                       param->group_length : 0),
-                        &bitmaps, bitmap_buffer_size(field_count)*3,
+                        &bitmaps, bitmap_buffer_size(field_count)*4,
                         NullS))
   {
     if (temp_pool_slot != MY_BIT_NONE)
@@ -11563,7 +13489,7 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
     DBUG_RETURN(NULL);				/* purecov: inspected */
   }
   param->items_to_copy= copy_func;
-  strmov(tmpname, tmp_table_name);
+  strmov(tmpname, path);
   /* make table according to fields */
 
   bzero((char*) table,sizeof(*table));
@@ -11576,9 +13502,9 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
   thd->mem_root= &table->mem_root;
 
   table->field=reg_field;
-  table->alias= table_alias;
+  table->alias.set(table_alias, strlen(table_alias), table_alias_charset);
+
   table->reginfo.lock_type=TL_WRITE;	/* Will be updated */
-  table->db_stat=HA_OPEN_KEYFILE+HA_OPEN_RNDFILE;
   table->map=1;
   table->temp_pool_slot = temp_pool_slot;
   table->copy_blobs= 1;
@@ -11586,13 +13512,13 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
   table->quick_keys.init();
   table->covering_keys.init();
   table->merge_keys.init();
+  table->intersect_keys.init();
   table->keys_in_use_for_query.init();
 
   table->s= share;
   init_tmp_table_share(thd, share, "", 0, tmpname, tmpname);
   share->blob_field= blob_field;
   share->blob_ptr_size= portable_sizeof_char_ptr;
-  share->db_low_byte_first=1;                // True for HEAP, MyISAM and Maria
   share->table_charset= param->table_charset;
   share->primary_key= MAX_KEY;               // Indicate no primary key
   share->keys_for_keyread.init();
@@ -11618,9 +13544,7 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
       {
         if (item->used_tables() & OUTER_REF_TABLE_BIT)
           item->update_used_tables();
-        if (type == Item::SUBSELECT_ITEM ||
-            (item->get_cached_item() &&
-             item->get_cached_item()->type() == Item::SUBSELECT_ITEM ) ||
+        if ((item->real_type() == Item::SUBSELECT_ITEM) ||
             (item->used_tables() & ~OUTER_REF_TABLE_BIT))
         {
 	  /*
@@ -11713,7 +13637,7 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
                            for distinct, as we want the distinct index to be
                            usable in this case too.
                          */
-                         item->marker == 4  || param->bit_fields_as_long, // psergey-feb17
+                         item->marker == 4  || param->bit_fields_as_long,
                          force_copy_fields,
                          param->convert_blob_length);
 
@@ -11721,10 +13645,30 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
       {
 	if (thd->is_fatal_error)
 	  goto err;				// Got OOM
-	continue;				// Some kindf of const item
+	continue;				// Some kind of const item
       }
       if (type == Item::SUM_FUNC_ITEM)
-	((Item_sum *) item)->result_field= new_field;
+      {
+        Item_sum *agg_item= (Item_sum *) item;
+        /*
+          Update the result field only if it has never been set, or if the
+          created temporary table is not to be used for subquery
+          materialization.
+
+          The reason is that for subqueries that require materialization as part
+          of their plan, we create the 'external' temporary table needed for IN
+          execution, after the 'internal' temporary table needed for grouping.
+          Since both the external and the internal temporary tables are created
+          for the same list of SELECT fields of the subquery, setting
+          'result_field' for each invocation of create_tmp_table overrides the
+          previous value of 'result_field'.
+
+          The condition below prevents the creation of the external temp
+          table from overriding the 'result_field' set for the internal one.
+        */
+        if (!agg_item->result_field || !param->materialized_subquery)
+          agg_item->result_field= new_field;
+      }
       tmp_from_field++;
       reclength+=new_field->pack_length();
       if (!(new_field->flags & NOT_NULL_FLAG))
@@ -11779,6 +13723,7 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
   *reg_field= 0;
   *blob_field= 0;				// End marker
   share->fields= field_count;
+  share->column_bitmap_size= bitmap_buffer_size(share->fields);
 
   /* If result table is small; use a heap */
   /* future: storage engine selection can be made dynamic? */
@@ -11954,11 +13899,11 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
       null_count=(null_count+7) & ~7;		// move to next byte
 
     // fix table name in field entry
-    field->table_name= &table->alias;
+    field->set_table_name(&table->alias);
   }
 
   param->copy_field_end=copy;
-  param->recinfo=recinfo;
+  param->recinfo= recinfo;              	// Pointer to after last field
   store_record(table,s->default_values);        // Make empty default record
 
   if (thd->variables.tmp_table_size == ~ (ulonglong) 0)		// No limit
@@ -11987,8 +13932,10 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
     share->keys=1;
     share->uniques= test(using_unique_constraint);
     table->key_info= table->s->key_info= keyinfo;
+    table->keys_in_use_for_query.set_bit(0);
+    share->keys_in_use.set_bit(0);
     keyinfo->key_part=key_part_info;
-    keyinfo->flags=HA_NOSAME;
+    keyinfo->flags=HA_NOSAME | HA_BINARY_PACK_KEY | HA_PACK_KEY;
     keyinfo->usable_key_parts=keyinfo->key_parts= param->group_parts;
     keyinfo->key_length=0;
     keyinfo->rec_per_key=0;
@@ -12002,6 +13949,8 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
       bool maybe_null=(*cur_group->item)->maybe_null;
       key_part_info->null_bit=0;
       key_part_info->field=  field;
+      if (cur_group == group)
+        field->key_start.set_bit(0);
       key_part_info->offset= field->offset(table->record[0]);
       key_part_info->length= (uint16) field->key_length();
       key_part_info->type=   (uint8) field->key_type();
@@ -12014,12 +13963,26 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
       if (!using_unique_constraint)
       {
 	cur_group->buff=(char*) group_buff;
+
+        if (maybe_null && !field->null_bit)
+        {
+          /*
+            This can only happen in the unusual case where an outer join
+            table was found to be not-nullable by the optimizer and
+            the item can't really be null.
+            We solve this by marking the item as !maybe_null to ensure
+            that the key,field and item definition match.
+          */
+          (*cur_group->item)->maybe_null= maybe_null= 0;
+        }
+
 	if (!(cur_group->field= field->new_key_field(thd->mem_root,table,
                                                      group_buff +
                                                      test(maybe_null),
                                                      field->null_ptr,
                                                      field->null_bit)))
 	  goto err; /* purecov: inspected */
+
 	if (maybe_null)
 	{
 	  /*
@@ -12041,6 +14004,12 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
       }
       keyinfo->key_length+=  key_part_info->length;
     }
+    /*
+      Ensure we didn't overrun the group buffer. The < is only true when
+      some maybe_null fields were changed to be not null fields.
+    */
+    DBUG_ASSERT(using_unique_constraint ||
+                group_buff <= param->group_buff + param->group_length);
   }
 
   if (distinct && field_count != param->hidden_field_count)
@@ -12060,7 +14029,6 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
         indexes on blobs with arbitrary length. Such indexes cannot be
         used for lookups.
       */
-      //// psergey-merge: using_unique_constraint=1;
       share->uniques= 1;
     }
     null_pack_length-=hidden_null_pack_length;
@@ -12073,9 +14041,11 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
                      keyinfo->key_parts * sizeof(KEY_PART_INFO))))
       goto err;
     bzero((void*) key_part_info, keyinfo->key_parts * sizeof(KEY_PART_INFO));
+    table->keys_in_use_for_query.set_bit(0);
+    share->keys_in_use.set_bit(0);
     table->key_info= table->s->key_info= keyinfo;
     keyinfo->key_part=key_part_info;
-    keyinfo->flags=HA_NOSAME | HA_NULL_ARE_EQUAL;
+    keyinfo->flags=HA_NOSAME | HA_NULL_ARE_EQUAL | HA_BINARY_PACK_KEY | HA_PACK_KEY;
     keyinfo->key_length= 0;  // Will compute the sum of the parts below.
     keyinfo->name= (char*) "distinct_key";
     keyinfo->algorithm= HA_KEY_ALG_UNDEF;
@@ -12109,8 +14079,14 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
 	 i < field_count;
 	 i++, reg_field++, key_part_info++)
     {
-      key_part_info->null_bit=0;
       key_part_info->field=    *reg_field;
+      (*reg_field)->flags |= PART_KEY_FLAG;
+      if (key_part_info == keyinfo->key_part)
+        (*reg_field)->key_start.set_bit(0);
+      key_part_info->null_bit= (*reg_field)->null_bit;
+      key_part_info->null_offset= (uint) ((*reg_field)->null_ptr -
+                                          (uchar*) table->record[0]);
+
       key_part_info->offset=   (*reg_field)->offset(table->record[0]);
       key_part_info->length=   (uint16) (*reg_field)->pack_length();
       /* TODO:
@@ -12143,16 +14119,21 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
   if (thd->is_fatal_error)				// If end of memory
     goto err;					 /* purecov: inspected */
   share->db_record_offset= 1;
-  if (share->db_type() == TMP_ENGINE_HTON)
+  table->used_for_duplicate_elimination= (param->sum_func_count == 0 &&
+                                          (table->group || table->distinct));
+
+  if (!do_not_open)
   {
-    if (create_internal_tmp_table(table, param->keyinfo, param->start_recinfo,
-                                  &param->recinfo, select_options,
-                                  thd->variables.big_tables))
+    if (share->db_type() == TMP_ENGINE_HTON)
+    {
+      if (create_internal_tmp_table(table, param->keyinfo, param->start_recinfo,
+                                    &param->recinfo, select_options,
+                                    thd->variables.big_tables))
+        goto err;
+    }
+    if (open_tmp_table(table))
       goto err;
   }
-  DBUG_PRINT("info", ("skip_create_table: %d", (int)param->skip_create_table));
-  if (!param->skip_create_table && open_tmp_table(table))
-    goto err;
 
   thd->mem_root= mem_root_save;
 
@@ -12208,7 +14189,7 @@ TABLE *create_virtual_tmp_table(THD *thd, List<Create_field> &field_list)
                         &share, sizeof(*share),
                         &field, (field_count + 1) * sizeof(Field*),
                         &blob_field, (field_count+1) *sizeof(uint),
-                        &bitmaps, bitmap_buffer_size(field_count)*3,
+                        &bitmaps, bitmap_buffer_size(field_count)*4,
                         NullS))
     return 0;
 
@@ -12220,7 +14201,6 @@ TABLE *create_virtual_tmp_table(THD *thd, List<Create_field> &field_list)
   share->blob_field= blob_field;
   share->fields= field_count;
   share->blob_ptr_size= portable_sizeof_char_ptr;
-  share->db_low_byte_first=1;                // True for HEAP and MyISAM
   setup_tmp_table_column_bitmaps(table, bitmaps);
 
   /* Create all fields and calculate the total length of record */
@@ -12321,7 +14301,9 @@ bool open_tmp_table(TABLE *table)
     table->db_stat=0;
     return(1);
   }
+  table->db_stat= HA_OPEN_KEYFILE+HA_OPEN_RNDFILE;
   (void) table->file->extra(HA_EXTRA_QUICK);		/* Faster */
+  table->created= TRUE;
   return(0);
 }
 
@@ -12451,10 +14433,23 @@ bool create_internal_tmp_table(TABLE *table, KEY *keyinfo,
   if (big_tables && !(options & SELECT_SMALL_RESULT))
     create_info.data_file_length= ~(ulonglong) 0;
 
+  /*
+    The logic for choosing the record format:
+    The STATIC_RECORD format is the fastest one, because it's so simple,
+    so we use this by default for short rows.
+    BLOCK_RECORD caches both row and data, so this is generally faster than
+    DYNAMIC_RECORD. The one exception is when we write to a tmp table and
+    want to use keys for duplicate elimination, as with BLOCK_RECORD
+    we first write the row, then check for key conflicts and then we have to
+    delete the row.  The cases when this can happen are when there is
+    a group by and no sum functions or if distinct is used.
+  */
   if ((error= maria_create(share->table_name.str,
-                           share->reclength < 64 &&
-                           !share->blob_fields ? STATIC_RECORD :
-                           BLOCK_RECORD,
+                           table->no_rows ? NO_RECORD :
+                           (share->reclength < 64 &&
+                            !share->blob_fields ? STATIC_RECORD :
+                            table->used_for_duplicate_elimination ?
+                            DYNAMIC_RECORD : BLOCK_RECORD),
                            share->keys, &keydef,
                            (uint) (*recinfo-start_recinfo),
                            start_recinfo,
@@ -12627,6 +14622,7 @@ bool create_internal_tmp_table(TABLE *table, KEY *keyinfo,
   }
   status_var_increment(table->in_use->status_var.created_tmp_disk_tables);
   share->db_record_offset= 1;
+  table->created= TRUE;
   DBUG_RETURN(0);
  err:
   DBUG_RETURN(1);
@@ -12671,7 +14667,7 @@ create_internal_tmp_table_from_heap2(THD *thd, TABLE *table,
   TABLE new_table;
   TABLE_SHARE share;
   const char *save_proc_info;
-  int write_err;
+  int write_err= 0;
   DBUG_ENTER("create_internal_tmp_table_from_heap2");
 
   if (table->s->db_type() != heap_hton || 
@@ -12695,8 +14691,10 @@ create_internal_tmp_table_from_heap2(THD *thd, TABLE *table,
   save_proc_info=thd->proc_info;
   thd_proc_info(thd, proc_info);
 
+  new_table.no_rows= table->no_rows;
   if (create_internal_tmp_table(&new_table, table->key_info, start_recinfo,
-                                recinfo, thd->lex->select_lex.options | 
+                                recinfo,
+                                thd->lex->select_lex.options | 
 			        thd->variables.option_bits,
                                 thd->variables.big_tables))
     goto err2;
@@ -12707,24 +14705,15 @@ create_internal_tmp_table_from_heap2(THD *thd, TABLE *table,
   table->file->ha_index_or_rnd_end();
   if (table->file->ha_rnd_init_with_error(1))
     DBUG_RETURN(1);
-  if (table->no_rows)
-  {
+  if (new_table.no_rows)
     new_table.file->extra(HA_EXTRA_NO_ROWS);
-    new_table.no_rows=1;
+  else
+  {
+    /* update table->file->stats.records */
+    table->file->info(HA_STATUS_VARIABLE);
+    new_table.file->ha_start_bulk_insert(table->file->stats.records);
   }
 
-#ifdef TO_BE_DONE_LATER_IN_4_1
-  /*
-    To use start_bulk_insert() (which is new in 4.1) we need to find
-    all places where a corresponding end_bulk_insert() should be put.
-  */
-  table->file->info(HA_STATUS_VARIABLE); /* update table->file->stats.records */
-  new_table.file->ha_start_bulk_insert(table->file->stats.records);
-#else
-  /* HA_EXTRA_WRITE_CACHE can stay until close, no need to disable it */
-  new_table.file->extra(HA_EXTRA_WRITE_CACHE);
-#endif
-
   /*
     copy all old rows from heap table to MyISAM table
     This is the only code that uses record[1] to read/write but this
@@ -12733,13 +14722,20 @@ create_internal_tmp_table_from_heap2(THD *thd, TABLE *table,
   */
   while (!table->file->ha_rnd_next(new_table.record[1]))
   {
-    write_err= new_table.file->ha_write_row(new_table.record[1]);
+    write_err= new_table.file->ha_write_tmp_row(new_table.record[1]);
     DBUG_EXECUTE_IF("raise_error", write_err= HA_ERR_FOUND_DUPP_KEY ;);
     if (write_err)
       goto err;
+    if (thd->killed)
+    {
+      thd->send_kill_message();
+      goto err_killed;
+    }
   }
+  if (!new_table.no_rows && new_table.file->ha_end_bulk_insert())
+    goto err;
   /* copy row that filled HEAP table */
-  if ((write_err=new_table.file->ha_write_row(table->record[0])))
+  if ((write_err=new_table.file->ha_write_tmp_row(table->record[0])))
   {
     if (new_table.file->is_fatal_error(write_err, HA_CHECK_DUP) ||
 	!ignore_last_dupp_key_error)
@@ -12748,7 +14744,7 @@ create_internal_tmp_table_from_heap2(THD *thd, TABLE *table,
 
   /* remove heap table and change to use myisam table */
   (void) table->file->ha_rnd_end();
-  (void) table->file->close();                  // This deletes the table !
+  (void) table->file->ha_close();          // This deletes the table !
   delete table->file;
   table->file=0;
   plugin_unlock(0, table->s->db_plugin);
@@ -12760,15 +14756,16 @@ create_internal_tmp_table_from_heap2(THD *thd, TABLE *table,
   table->file->change_table_ptr(table, table->s);
   table->use_all_columns();
   if (save_proc_info)
-    thd_proc_info(thd, (!strcmp(save_proc_info,"Copying to tmp table") ?
-                     "Copying to tmp table on disk" : save_proc_info));
+    thd_proc_info(thd, save_proc_info == copy_to_tmp_table ?
+                  "Copying to tmp table on disk" : save_proc_info);
   DBUG_RETURN(0);
 
  err:
   DBUG_PRINT("error",("Got error: %d",write_err));
   table->file->print_error(write_err, MYF(0));
+err_killed:
   (void) table->file->ha_rnd_end();
-  (void) new_table.file->close();
+  (void) new_table.file->ha_close();
  err1:
   new_table.file->ha_delete_table(new_table.s->table_name.str);
  err2:
@@ -12785,12 +14782,12 @@ free_tmp_table(THD *thd, TABLE *entry)
   MEM_ROOT own_root= entry->mem_root;
   const char *save_proc_info;
   DBUG_ENTER("free_tmp_table");
-  DBUG_PRINT("enter",("table: %s",entry->alias));
+  DBUG_PRINT("enter",("table: %s",entry->alias.c_ptr()));
 
   save_proc_info=thd->proc_info;
   thd_proc_info(thd, "removing tmp table");
 
-  if (entry->file)
+  if (entry->file && entry->created)
   {
     if (entry->db_stat)
       entry->file->ha_drop_table(entry->s->table_name.str);
@@ -12808,6 +14805,7 @@ free_tmp_table(THD *thd, TABLE *entry)
     bitmap_lock_clear_bit(&temp_pool, entry->temp_pool_slot);
 
   plugin_unlock(0, entry->s->db_plugin);
+  entry->alias.free();
 
   free_root(&own_root, MYF(0)); /* the table is allocated in its own root */
   thd_proc_info(thd, save_proc_info);
@@ -12935,14 +14933,13 @@ do_select(JOIN *join,List<Item> *fields,TABLE *table,Procedure *procedure)
   }
   /* Set up select_end */
   Next_select_func end_select= setup_end_select_func(join);
-  if (join->tables)
+  if (join->table_count)
   {
-    join->join_tab[join->tables-1].next_select= end_select;
-
+    join->join_tab[join->top_join_tab_count - 1].next_select= end_select;
     join_tab=join->join_tab+join->const_tables;
   }
   join->send_records=0;
-  if (join->tables == join->const_tables)
+  if (join->table_count == join->const_tables)
   {
     /*
       HAVING will be checked after processing aggregate functions,
@@ -12968,7 +14965,7 @@ do_select(JOIN *join,List<Item> *fields,TABLE *table,Procedure *procedure)
       {
         List<Item> *columns_list= (procedure ? &join->procedure_fields_list :
                                    fields);
-        rc= join->result->send_data(*columns_list);
+        rc= join->result->send_data(*columns_list) > 0;
       }
     }
     /*
@@ -12981,17 +14978,19 @@ do_select(JOIN *join,List<Item> *fields,TABLE *table,Procedure *procedure)
   }
   else
   {
-    DBUG_ASSERT(join->tables);
-    error= join->first_select(join,join_tab,0);
+    DBUG_ASSERT(join->table_count);
+    if (join->outer_ref_cond && !join->outer_ref_cond->val_int())
+      error= NESTED_LOOP_NO_MORE_ROWS;
+    else
+      error= sub_select(join,join_tab,0);
     if (error == NESTED_LOOP_OK || error == NESTED_LOOP_NO_MORE_ROWS)
-      error= join->first_select(join,join_tab,1);
+      error= sub_select(join,join_tab,1);
     if (error == NESTED_LOOP_QUERY_LIMIT)
       error= NESTED_LOOP_OK;                    /* select_limit used */
   }
   if (error == NESTED_LOOP_NO_MORE_ROWS)
     error= NESTED_LOOP_OK;
 
-
   if (table)
   {
     int tmp, new_errno= 0;
@@ -13055,134 +15054,6 @@ int rr_sequential_and_unpack(READ_RECORD *info)
 
 
 /*
-  Semi-join materialization join function
-
-  SYNOPSIS
-    sub_select_sjm()
-      join            The join
-      join_tab        The first table in the materialization nest
-      end_of_records  FALSE <=> This call is made to pass another record 
-                                combination
-                      TRUE  <=> EOF
-
-  DESCRIPTION
-    This is a join execution function that does materialization of a join
-    suborder before joining it to the rest of the join.
-
-    The table pointed by join_tab is the first of the materialized tables.
-    This function first creates the materialized table and then switches to
-    joining the materialized table with the rest of the join.
-
-    The materialized table can be accessed in two ways:
-     - index lookups
-     - full table scan
-
-  RETURN
-    One of enum_nested_loop_state values
-*/
-
-enum_nested_loop_state
-sub_select_sjm(JOIN *join, JOIN_TAB *join_tab, bool end_of_records)
-{
-  int res;
-  enum_nested_loop_state rc;
-
-  DBUG_ENTER("sub_select_sjm");
-
-  if (!join_tab->emb_sj_nest)
-  {
-    /*
-      We're handling GROUP BY/ORDER BY, this is the first table, and we've
-      actually executed the join already and now we're just reading the
-      result of the join from the temporary table.
-      Bypass to regular join handling.
-      Yes, it would be nicer if sub_select_sjm wasn't called at all in this
-      case but there's no easy way to arrange this.
-    */
-    rc= sub_select(join, join_tab, end_of_records);
-    DBUG_RETURN(rc);
-  }
-
-  SJ_MATERIALIZATION_INFO *sjm= join_tab->emb_sj_nest->sj_mat_info;
-  if (end_of_records)
-  {
-    rc= (*join_tab[sjm->tables - 1].next_select)(join,
-                                                 join_tab + sjm->tables,
-                                                 end_of_records);
-    DBUG_RETURN(rc);
-  }
-  if (!sjm->materialized)
-  {
-    /*
-      Do the materialization. First, put end_sj_materialize after the last
-      inner table so we can catch record combinations of sj-inner tables.
-    */
-    Next_select_func next_func= join_tab[sjm->tables - 1].next_select;
-    join_tab[sjm->tables - 1].next_select= end_sj_materialize;
-
-    /*
-      Now run the join for the inner tables. The first call is to run the
-      join, the second one is to signal EOF (this is essential for some
-      join strategies, e.g. it will make join buffering flush the records)
-    */
-    if ((rc= sub_select(join, join_tab, FALSE)) < 0 ||
-        (rc= sub_select(join, join_tab, TRUE/*EOF*/)) < 0)
-    {
-      join_tab[sjm->tables - 1].next_select= next_func;
-      DBUG_RETURN(rc); /* it's NESTED_LOOP_(ERROR|KILLED)*/
-    }
-    join_tab[sjm->tables - 1].next_select= next_func;
-
-    /*
-      Ok, materialization finished. Initialize the access to the temptable
-    */
-    sjm->materialized= TRUE;
-    join_tab->read_record.read_record= join_no_more_records;
-    if (sjm->is_sj_scan)
-    {
-      /* Initialize full scan */
-      JOIN_TAB *last_tab= join_tab + (sjm->tables - 1);
-      init_read_record(&last_tab->read_record, join->thd,
-                       sjm->table, NULL, TRUE, TRUE, FALSE);
-
-      DBUG_ASSERT(last_tab->read_record.read_record == rr_sequential);
-      last_tab->read_first_record= join_read_record_no_init;
-      last_tab->read_record.copy_field= sjm->copy_field;
-      last_tab->read_record.copy_field_end= sjm->copy_field +
-                                            sjm->sjm_table_cols.elements;
-      last_tab->read_record.read_record= rr_sequential_and_unpack;
-    }
-  }
-
-  if (sjm->is_sj_scan)
-  {
-    /* Do full scan of the materialized table */
-    JOIN_TAB *last_tab= join_tab + (sjm->tables - 1);
-
-    Item *save_cond= last_tab->select_cond;
-    last_tab->set_select_cond(sjm->join_cond, __LINE__);
-
-    rc= sub_select(join, last_tab, end_of_records);
-    last_tab->set_select_cond(save_cond, __LINE__);
-    DBUG_RETURN(rc);
-  }
-  else
-  {
-    /* Do index lookup in the materialized table */
-    if ((res= join_read_key2(join_tab->join->thd, join_tab,
-                             sjm->table, sjm->tab_ref)) == 1)
-      DBUG_RETURN(NESTED_LOOP_ERROR); /* purecov: inspected */
-    if (res || !sjm->in_equality->val_int())
-      DBUG_RETURN(NESTED_LOOP_NO_MORE_ROWS);
-  }
-  rc= (*join_tab[sjm->tables - 1].next_select)(join,
-                                               join_tab + sjm->tables,
-                                               end_of_records);
-  DBUG_RETURN(rc);
-}
-
-
-/*
   Fill the join buffer with partial records, retrieve all full  matches for them   
 
   SYNOPSIS
@@ -13396,7 +15267,7 @@ sub_select(JOIN *join,JOIN_TAB *join_tab,bool end_of_records)
     DBUG_RETURN(nls);
   }
   int error;
-  enum_nested_loop_state rc;
+  enum_nested_loop_state rc= NESTED_LOOP_OK;
   READ_RECORD *info= &join_tab->read_record;
 
   if (join_tab->flush_weedout_table)
@@ -13404,6 +15275,9 @@ sub_select(JOIN *join,JOIN_TAB *join_tab,bool end_of_records)
     do_sj_reset(join_tab->flush_weedout_table);
   }
 
+  if (!join_tab->preread_init_done && join_tab->preread_init())
+    DBUG_RETURN(NESTED_LOOP_ERROR);
+
   join->return_tab= join_tab;
 
   if (join_tab->last_inner)
@@ -13416,19 +15290,26 @@ sub_select(JOIN *join,JOIN_TAB *join_tab,bool end_of_records)
 
     /* Set first_unmatched for the last inner table of this group */
     join_tab->last_inner->first_unmatched= join_tab;
+    if (join_tab->on_precond && !join_tab->on_precond->val_int())
+      rc= NESTED_LOOP_NO_MORE_ROWS;
   }
   join->thd->warning_info->reset_current_row_for_warning();
+
+  if (rc != NESTED_LOOP_NO_MORE_ROWS && 
+      (rc= join_tab_execution_startup(join_tab)) < 0)
+    DBUG_RETURN(rc);
   
   if (join_tab->loosescan_match_tab)
     join_tab->loosescan_match_tab->found_match= FALSE;
 
-  error= (*join_tab->read_first_record)(join_tab);
-
-  if (join_tab->keep_current_rowid)
-    join_tab->table->file->position(join_tab->table->record[0]);
+  if (rc != NESTED_LOOP_NO_MORE_ROWS)
+  {
+    error= (*join_tab->read_first_record)(join_tab);
+    if (join_tab->keep_current_rowid)
+      join_tab->table->file->position(join_tab->table->record[0]);    
+    rc= evaluate_join_record(join, join_tab, error);
+  }
 
-  rc= evaluate_join_record(join, join_tab, error);
-  
   /* 
     Note: psergey has added the 2nd part of the following condition; the 
     change should probably be made in 5.1, too.
@@ -13573,7 +15454,9 @@ evaluate_join_record(JOIN *join, JOIN_TAB *join_tab,
           /* The condition attached to table tab is false */
 
           if (tab == join_tab)
+          {
             found= 0;
+          }            
           else
           {
             /*
@@ -13738,33 +15621,32 @@ evaluate_null_complemented_join_record(JOIN *join, JOIN_TAB *join_tab)
   /*
     The row complemented by nulls satisfies all conditions
     attached to inner tables.
+  */
+  if (join_tab->check_weed_out_table)
+  {
+    int res= do_sj_dups_weedout(join->thd, join_tab->check_weed_out_table);
+    if (res == -1)
+      return NESTED_LOOP_ERROR;
+    else if (res == 1)
+      return NESTED_LOOP_OK;
+  }
+  else if (join_tab->do_firstmatch)
+  {
+    /* 
+      We should return to the join_tab->do_firstmatch after we have 
+      enumerated all the suffixes for current prefix row combination
+    */
+    if (join_tab->do_firstmatch < join->return_tab)
+      join->return_tab= join_tab->do_firstmatch;
+  }
+
+  /*
     Send the row complemented by nulls to be joined with the
     remaining tables.
   */
   return (*join_tab->next_select)(join, join_tab+1, 0);
 }
 
-#ifdef MERGE_JUNK
-//psergey3-merge: remove:
-  SQL_SELECT *select;
-  select= join_tab->select;
-
-    int err= 0;
-         (err= join_tab->cache.select->skip_record(join->thd)) != 0 ))
-      {
-        reset_cache_write(&join_tab->cache);
-        return NESTED_LOOP_ERROR;
-      }
-
-	if (!select || (err= select->skip_record(join->thd)) != 0)
-          if (err < 0)
-          {
-            reset_cache_write(&join_tab->cache);
-            return NESTED_LOOP_ERROR;
-          }
-      
-    rc= NESTED_LOOP_OK;
-#endif
 /*****************************************************************************
   The different ways to read a record
   Returns -1 if row was not found, 0 if row was found and 1 on errors
@@ -13808,13 +15690,21 @@ static int
 join_read_const_table(JOIN_TAB *tab, POSITION *pos)
 {
   int error;
+  TABLE_LIST *tbl;
   DBUG_ENTER("join_read_const_table");
   TABLE *table=tab->table;
   table->const_table=1;
   table->null_row=0;
   table->status=STATUS_NO_RECORD;
   
-  if (tab->type == JT_SYSTEM)
+  if (tab->table->pos_in_table_list->is_materialized_derived() &&
+      !tab->table->pos_in_table_list->fill_me)
+  {
+    //TODO: don't get here at all
+    /* Skip materialized derived tables/views. */
+    DBUG_RETURN(0);
+  }
+  else if (tab->type == JT_SYSTEM)
   {
     if ((error=join_read_system(tab)))
     {						// Info for DESCRIBE
@@ -13852,7 +15742,14 @@ join_read_const_table(JOIN_TAB *tab, POSITION *pos)
 	DBUG_RETURN(error);
     }
   }
-  if (*tab->on_expr_ref && !table->null_row)
+  /* 
+     Evaluate an on-expression only if it is not considered expensive.
+     This mainly prevents executing subqueries in optimization phase.
+     This is necessary since proper setup for such execution has not been
+     done at this stage.
+  */
+  if (*tab->on_expr_ref && !table->null_row && 
+      !(*tab->on_expr_ref)->is_expensive())
   {
 #if !defined(DBUG_OFF) && defined(NOT_USING_ITEM_EQUAL)
     /*
@@ -13871,26 +15768,27 @@ join_read_const_table(JOIN_TAB *tab, POSITION *pos)
   if (!table->null_row)
     table->maybe_null=0;
 
-  /* Check appearance of new constant items in Item_equal objects */
-  JOIN *join= tab->join;
-  if (join->conds)
-    update_const_equal_items(join->conds, tab);
-  TABLE_LIST *tbl;
-  for (tbl= join->select_lex->leaf_tables; tbl; tbl= tbl->next_leaf)
   {
-    TABLE_LIST *embedded;
-    TABLE_LIST *embedding= tbl;
-    do
-    {
-      embedded= embedding;
-      if (embedded->on_expr)
-         update_const_equal_items(embedded->on_expr, tab);
-      embedding= embedded->embedding;
+    JOIN *join= tab->join;
+    List_iterator<TABLE_LIST> ti(join->select_lex->leaf_tables);
+    /* Check appearance of new constant items in Item_equal objects */
+    if (join->conds)
+      update_const_equal_items(join->conds, tab);
+    while ((tbl= ti++))
+    {
+      TABLE_LIST *embedded;
+      TABLE_LIST *embedding= tbl;
+      do
+      {
+        embedded= embedding;
+        if (embedded->on_expr)
+           update_const_equal_items(embedded->on_expr, tab);
+        embedding= embedded->embedding;
+      }
+      while (embedding &&
+             embedding->nested_join->join_list.head() == embedded);
     }
-    while (embedding &&
-           embedding->nested_join->join_list.head() == embedded);
   }
-
   DBUG_RETURN(0);
 }
 
@@ -14103,13 +16001,6 @@ join_read_always_key(JOIN_TAB *tab)
     }
   }
 
-  /* Perform "Late NULLs Filtering" (see internals manual for explanations) */
-  for (uint i= 0 ; i < tab->ref.key_parts ; i++)
-  {
-    if ((tab->ref.null_rejecting & 1 << i) && tab->ref.items[i]->is_null())
-        return -1;
-  }
-
   if (cp_buffer_from_ref(tab->join->thd, table, &tab->ref))
     return -1;
   if ((error= table->file->ha_index_read_map(table->record[0],
@@ -14243,19 +16134,36 @@ int join_init_read_record(JOIN_TAB *tab)
 {
   if (tab->select && tab->select->quick && tab->select->quick->reset())
     return 1;
+  if (!tab->preread_init_done && tab->preread_init())
+    return 1;
   if (init_read_record(&tab->read_record, tab->join->thd, tab->table,
                        tab->select,1,1, FALSE))
     return 1;
   return (*tab->read_record.read_record)(&tab->read_record);
 }
 
-static int
+int
 join_read_record_no_init(JOIN_TAB *tab)
 {
+  Copy_field *save_copy, *save_copy_end;
+  
+  /*
+    init_read_record() resets all elements of tab->read_record.
+    Remember the things that we don't want to have reset.
+  */
+  save_copy=     tab->read_record.copy_field;
+  save_copy_end= tab->read_record.copy_field_end;
+  
+  init_read_record(&tab->read_record, tab->join->thd, tab->table,
+		   tab->select,1,1, FALSE);
+
+  tab->read_record.copy_field=     save_copy;
+  tab->read_record.copy_field_end= save_copy_end;
+  tab->read_record.read_record= rr_sequential_and_unpack;
+
   return (*tab->read_record.read_record)(&tab->read_record);
 }
 
-
 static int
 join_read_first(JOIN_TAB *tab)
 {
@@ -14428,8 +16336,7 @@ end_send(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
   DBUG_ENTER("end_send");
   if (!end_of_records)
   {
-    int error;
-    if (join->tables &&
+    if (join->table_count &&
         join->join_tab->is_using_loose_index_scan())
     {
       /* Copy non-aggregated fields when loose index scan is used. */
@@ -14443,18 +16350,20 @@ end_send(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
         DBUG_RETURN(NESTED_LOOP_ERROR);
       DBUG_RETURN(NESTED_LOOP_OK);
     }
-    error=0;
     if (join->do_send_rows)
-      error=join->result->send_data(*join->fields);
-    if (error)
-      DBUG_RETURN(NESTED_LOOP_ERROR); /* purecov: inspected */
+    {
+      int error;
+      /* result < 0 if row was not accepted and should not be counted */
+      if ((error= join->result->send_data(*join->fields)))
+        DBUG_RETURN(error < 0 ? NESTED_LOOP_OK : NESTED_LOOP_ERROR);
+    }
     if (++join->send_records >= join->unit->select_limit_cnt &&
 	join->do_send_rows)
     {
       if (join->select_options & OPTION_FOUND_ROWS)
       {
 	JOIN_TAB *jt=join->join_tab;
-	if ((join->tables == 1) && !join->tmp_table && !join->sort_and_group
+	if ((join->table_count == 1) && !join->tmp_table && !join->sort_and_group
 	    && !join->send_group_parts && !join->having && !jt->select_cond &&
 	    !(jt->select && jt->select->quick) &&
 	    (jt->table->file->ha_table_flags() & HA_STATS_RECORDS_IS_EXACT) &&
@@ -14557,7 +16466,15 @@ end_send_group(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
 	  else
 	  {
 	    if (join->do_send_rows)
-	      error=join->result->send_data(*join->fields) ? 1 : 0;
+            {
+	      error= join->result->send_data(*join->fields);
+              if (error < 0)
+              {
+                /* Duplicate row, don't count */
+                join->send_records--;
+                error= 0;
+              }
+            }
 	    join->send_records++;
 	  }
 	  if (join->rollup.state != ROLLUP::STATE_NONE && error <= 0)
@@ -14644,7 +16561,7 @@ end_write(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
     {
       int error;
       join->found_records++;
-      if ((error= table->file->ha_write_row(table->record[0])))
+      if ((error= table->file->ha_write_tmp_row(table->record[0])))
       {
         if (!table->file->is_fatal_error(error, HA_CHECK_DUP))
 	  goto end;
@@ -14708,8 +16625,8 @@ end_update(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
   {						/* Update old record */
     restore_record(table,record[1]);
     update_tmptable_sum_func(join->sum_funcs,table);
-    if ((error= table->file->ha_update_row(table->record[1],
-                                           table->record[0])))
+    if ((error= table->file->ha_update_tmp_row(table->record[1],
+                                               table->record[0])))
     {
       table->file->print_error(error,MYF(0));	/* purecov: inspected */
       DBUG_RETURN(NESTED_LOOP_ERROR);            /* purecov: inspected */
@@ -14733,7 +16650,7 @@ end_update(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
   init_tmptable_sum_functions(join->sum_funcs);
   if (copy_funcs(join->tmp_table_param.items_to_copy, join->thd))
     DBUG_RETURN(NESTED_LOOP_ERROR);           /* purecov: inspected */
-  if ((error= table->file->ha_write_row(table->record[0])))
+  if ((error= table->file->ha_write_tmp_row(table->record[0])))
   {
     if (create_internal_tmp_table_from_heap(join->thd, table,
                                             join->tmp_table_param.start_recinfo,
@@ -14746,7 +16663,7 @@ end_update(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
       table->file->print_error(error, MYF(0));/* purecov: inspected */
       DBUG_RETURN(NESTED_LOOP_ERROR);         /* purecov: inspected */
     }
-    join->join_tab[join->tables-1].next_select=end_unique_update;
+    join->join_tab[join->top_join_tab_count-1].next_select=end_unique_update;
   }
   join->send_records++;
   DBUG_RETURN(NESTED_LOOP_OK);
@@ -14776,7 +16693,7 @@ end_unique_update(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
   if (copy_funcs(join->tmp_table_param.items_to_copy, join->thd))
     DBUG_RETURN(NESTED_LOOP_ERROR);           /* purecov: inspected */
 
-  if (!(error= table->file->ha_write_row(table->record[0])))
+  if (!(error= table->file->ha_write_tmp_row(table->record[0])))
     join->send_records++;			// New group
   else
   {
@@ -14792,8 +16709,8 @@ end_unique_update(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
     }
     restore_record(table,record[1]);
     update_tmptable_sum_func(join->sum_funcs,table);
-    if ((error= table->file->ha_update_row(table->record[1],
-                                           table->record[0])))
+    if ((error= table->file->ha_update_tmp_row(table->record[1],
+                                               table->record[0])))
     {
       table->file->print_error(error,MYF(0));	/* purecov: inspected */
       DBUG_RETURN(NESTED_LOOP_ERROR);            /* purecov: inspected */
@@ -14836,7 +16753,7 @@ end_write_group(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
                        join->sum_funcs_end[send_group_parts]);
 	if (!join->having || join->having->val_int())
 	{
-          int error= table->file->ha_write_row(table->record[0]);
+          int error= table->file->ha_write_tmp_row(table->record[0]);
           if (error && 
               create_internal_tmp_table_from_heap(join->thd, table,
                                                   join->tmp_table_param.start_recinfo,
@@ -14889,8 +16806,26 @@ end_write_group(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
 *****************************************************************************/
 
 /**
-  @return
-    1 if right_item is used removable reference key on left_item
+  Check if "left_item=right_item" equality is guaranteed to be true by use of
+  [eq]ref access on left_item->field->table.
+
+  SYNOPSIS
+    test_if_ref()
+      root_cond
+      left_item
+      right_item
+
+  DESCRIPTION
+    Check if the given "left_item = right_item" equality is guaranteed to be
+    true by use of [eq_]ref access method.
+
+    We need root_cond as we can't remove ON expressions even if the employed
+    ref access guarantees that they are true. (TODO: document the reason.)
+
+  RETURN
+    TRUE   if right_item is used removable reference key on left_item
+    FALSE  Otherwise
+    
 */
 
 bool test_if_ref(Item *root_cond, Item_field *left_item,Item *right_item)
@@ -14899,10 +16834,15 @@ bool test_if_ref(Item *root_cond, Item_field *left_item,Item *right_item)
   JOIN_TAB *join_tab= field->table->reginfo.join_tab;
   // No need to change const test
   if (!field->table->const_table && join_tab &&
+      !join_tab->is_ref_for_hash_join() &&
       (!join_tab->first_inner ||
        *join_tab->first_inner->on_expr_ref == root_cond))
   {
-    // Cond guards
+    /*
+      If ref access uses "Full scan on NULL key" (i.e. it actually alternates
+      between ref access and full table scan), then no equality can be
+      guaranteed to be true.
+    */
     for (uint i = 0; i < join_tab->ref.key_parts; i++)
     {
       if (join_tab->ref.cond_guards[i])
@@ -14910,9 +16850,10 @@ bool test_if_ref(Item *root_cond, Item_field *left_item,Item *right_item)
         return FALSE;
       }
     }
-    //
+
     Item *ref_item=part_of_refkey(field->table,field);
-    if (ref_item && ref_item->eq(right_item,1))
+    if (ref_item && (ref_item->eq(right_item,1) || 
+		     ref_item->real_item()->eq(right_item,1)))
     {
       right_item= right_item->real_item();
       if (right_item->type() == Item::FIELD_ITEM)
@@ -14924,7 +16865,7 @@ bool test_if_ref(Item *root_cond, Item_field *left_item,Item *right_item)
       {
 	/*
 	  We can remove binary fields and numerical fields except float,
-	  as float comparison isn't 100 % secure
+	  as float comparison isn't 100 % safe
 	  We have to keep normal strings to be able to check for end spaces
 	*/
 	if (field->binary() &&
@@ -14940,14 +16881,24 @@ bool test_if_ref(Item *root_cond, Item_field *left_item,Item *right_item)
   return 0;					// keep test
 }
 
+
 /**
    Extract a condition that can be checked after reading given table
+   @fn make_cond_for_table()
 
    @param cond       Condition to analyze
    @param tables     Tables for which "current field values" are available
-   @param used_table Table that we're extracting the condition for (may
-                     also include PSEUDO_TABLE_BITS, and may be zero)
-   @param exclude_expensive_cond  Do not push expensive conditions
+   @param used_table Table that we're extracting the condition for
+                     (may also include PSEUDO_TABLE_BITS, and may be zero).
+                     Note that the 'tables' bitmap of available tables
+                     includes used_table.
+   @param join_tab_idx_arg
+		     The index of the JOIN_TAB this Item is being extracted
+                     for. MAX_TABLES if there is no corresponding JOIN_TAB.
+   @param exclude_expensive_cond
+		     Do not push expensive conditions
+   @param retain_ref_cond
+                     Retain ref conditions
 
    @retval <>NULL Generated condition
    @retval =NULL  Already checked, OR error
@@ -14977,68 +16928,32 @@ bool test_if_ref(Item *root_cond, Item_field *left_item,Item *right_item)
      make_cond_for_info_schema() uses similar algorithm as well.
 */ 
 
-
-/*
-  Extract a condition that can be checked after reading given table
-  
-  SYNOPSIS
-    make_cond_for_table()
-      cond         Condition to analyze
-      tables       Tables for which "current field values" are available
-      used_table   Table that we're extracting the condition for (may 
-                   also include PSEUDO_TABLE_BITS
-      exclude_expensive_cond  Do not push expensive conditions
-
-  DESCRIPTION
-    Extract the condition that can be checked after reading the table
-    specified in 'used_table', given that current-field values for tables
-    specified in 'tables' bitmap are available.
-
-    The function assumes that
-      - Constant parts of the condition has already been checked.
-      - Condition that could be checked for tables in 'tables' has already 
-        been checked.
-        
-    The function takes into account that some parts of the condition are
-    guaranteed to be true by employed 'ref' access methods (the code that
-    does this is located at the end, search down for "EQ_FUNC").
-
-
-  SEE ALSO 
-    make_cond_for_info_schema uses similar algorithm
-
-  RETURN
-    Extracted condition
-*/
-
 static Item *
-make_cond_for_table(Item *cond, table_map tables, table_map used_table,
-                    bool exclude_expensive_cond)
+make_cond_for_table(THD *thd, Item *cond, table_map tables,
+                    table_map used_table,
+                    uint join_tab_idx_arg,
+                    bool exclude_expensive_cond __attribute__((unused)),
+		    bool retain_ref_cond)
 {
-  return make_cond_for_table_from_pred(cond, cond, tables, used_table,
-                                       exclude_expensive_cond);
+  return make_cond_for_table_from_pred(thd, cond, cond, tables, used_table,
+                                       join_tab_idx_arg,
+                                       exclude_expensive_cond,
+                                       retain_ref_cond);
 }
-               
+
+
 static Item *
-make_cond_for_table_from_pred(Item *root_cond, Item *cond,
+make_cond_for_table_from_pred(THD *thd, Item *root_cond, Item *cond,
                               table_map tables, table_map used_table,
-                              bool exclude_expensive_cond)
+                              uint join_tab_idx_arg,
+                              bool exclude_expensive_cond __attribute__
+                              ((unused)),
+                              bool retain_ref_cond)
 
 {
-  if (used_table && !(cond->used_tables() & used_table) &&
-      /*
-        Exclude constant conditions not checked at optimization time if
-        the table we are pushing conditions to is the first one.
-        As a result, such conditions are not considered as already checked
-        and will be checked at execution time, attached to the first table.
-
-        psergey: TODO: "used_table & 1" doesn't make sense in nearly any
-        context. Look at setup_table_map(), table bits reflect the order 
-        the tables were encountered by the parser. Check what we should
-        replace this condition with.
-      */
-      !((used_table & 1) && cond->is_expensive()))
+  if (used_table && !(cond->used_tables() & used_table))
     return (COND*) 0;				// Already checked
+
   if (cond->type() == Item::COND_ITEM)
   {
     if (((Item_cond*) cond)->functype() == Item_func::COND_AND_FUNC)
@@ -15051,9 +16966,11 @@ make_cond_for_table_from_pred(Item *root_cond, Item *cond,
       Item *item;
       while ((item=li++))
       {
-	Item *fix=make_cond_for_table_from_pred(root_cond, item, 
+	Item *fix=make_cond_for_table_from_pred(thd, root_cond, item, 
                                                 tables, used_table,
-                                                exclude_expensive_cond);
+						join_tab_idx_arg,
+                                                exclude_expensive_cond,
+                                                retain_ref_cond);
 	if (fix)
 	  new_cond->argument_list()->push_back(fix);
       }
@@ -15064,10 +16981,11 @@ make_cond_for_table_from_pred(Item *root_cond, Item *cond,
 	return new_cond->argument_list()->head();
       default:
 	/*
-	  Item_cond_and do not need fix_fields for execution, its parameters
-	  are fixed or do not need fix_fields, too
+          Call fix_fields to propagate all properties of the children to
+          the new parent Item. This should not be expensive because all
+	  children of Item_cond_and should be fixed by now.
 	*/
-	new_cond->quick_fix_field();
+	new_cond->fix_fields(thd, 0);
 	new_cond->used_tables_cache=
 	  ((Item_cond_and*) cond)->used_tables_cache &
 	  tables;
@@ -15083,18 +17001,21 @@ make_cond_for_table_from_pred(Item *root_cond, Item *cond,
       Item *item;
       while ((item=li++))
       {
-	Item *fix=make_cond_for_table_from_pred(root_cond, item,
+	Item *fix=make_cond_for_table_from_pred(thd, root_cond, item,
                                                 tables, 0L,
-                                                exclude_expensive_cond);
+                                                join_tab_idx_arg,
+                                                exclude_expensive_cond,
+                                                retain_ref_cond);
 	if (!fix)
 	  return (COND*) 0;			// Always true
 	new_cond->argument_list()->push_back(fix);
       }
       /*
-	Item_cond_and do not need fix_fields for execution, its parameters
-	are fixed or do not need fix_fields, too
+        Call fix_fields to propagate all properties of the children to
+        the new parent Item. This should not be expensive because all
+        children of Item_cond_or should be fixed by now.
       */
-      new_cond->quick_fix_field();
+      new_cond->fix_fields(thd, 0);
       new_cond->used_tables_cache= ((Item_cond_or*) cond)->used_tables_cache;
       new_cond->top_level_item();
       return new_cond;
@@ -15106,28 +17027,28 @@ make_cond_for_table_from_pred(Item *root_cond, Item *cond,
     table_count times, we mark each item that we have examined with the result
     of the test
   */
-  if (cond->marker == 3 || (cond->used_tables() & ~tables) ||
-      /*
-        When extracting constant conditions, treat expensive conditions as
-        non-constant, so that they are not evaluated at optimization time.
-      */
-      (!used_table && exclude_expensive_cond && cond->is_expensive()))
+  if ((cond->marker == 3 && !retain_ref_cond) ||
+      (cond->used_tables() & ~tables))
     return (COND*) 0;				// Can't check this yet
+
   if (cond->marker == 2 || cond->eq_cmp_result() == Item::COND_OK)
+  {
+    cond->set_join_tab_idx(join_tab_idx_arg);
     return cond;				// Not boolean op
+  }
 
   if (cond->type() == Item::FUNC_ITEM && 
       ((Item_func*) cond)->functype() == Item_func::EQ_FUNC)
   {
     Item *left_item=	((Item_func*) cond)->arguments()[0]->real_item();
     Item *right_item= ((Item_func*) cond)->arguments()[1]->real_item();
-    if (left_item->type() == Item::FIELD_ITEM &&
+    if (left_item->type() == Item::FIELD_ITEM && !retain_ref_cond &&
 	test_if_ref(root_cond, (Item_field*) left_item,right_item))
     {
       cond->marker=3;			// Checked when read
       return (COND*) 0;
     }
-    if (right_item->type() == Item::FIELD_ITEM &&
+    if (right_item->type() == Item::FIELD_ITEM && !retain_ref_cond &&
 	test_if_ref(root_cond, (Item_field*) right_item,left_item))
     {
       cond->marker=3;			// Checked when read
@@ -15135,18 +17056,33 @@ make_cond_for_table_from_pred(Item *root_cond, Item *cond,
     }
   }
   cond->marker=2;
+  cond->set_join_tab_idx(join_tab_idx_arg);
   return cond;
 }
 
 
+/*
+  This differs from make_cond_for_table() in that we're in the
+  following state:
+    1. Conditions referring only to 'tables' have been checked.
+    2. Conditions referring only to sjm_tables have been checked, too.
+    3. We need the condition that couldn't be checked in #1 or #2 but
+       can be checked when we have both (tables | sjm_tables).
 
+*/
 static COND *
 make_cond_after_sjm(Item *root_cond, Item *cond, table_map tables, 
                     table_map sjm_tables)
 {
+  /*
+    We assume that conditions that refer to only join prefix tables or 
+    sjm_tables have already been checked.
+  */
   if ((!(cond->used_tables() & ~tables) || 
        !(cond->used_tables() & ~sjm_tables)))
     return (COND*) 0;				// Already checked
+
+  /* AND/OR recursive descent */
   if (cond->type() == Item::COND_ITEM)
   {
     if (((Item_cond*) cond)->functype() == Item_func::COND_AND_FUNC)
@@ -15242,22 +17178,61 @@ make_cond_after_sjm(Item *root_cond, Item *cond, table_map tables,
 }
 
 
+/*
+  @brief
+
+  Check if
+   - @table uses "ref"-like access 
+   - it is based on "@field=certain_item" equality
+   - the equality will be true for any record returned by the access method
+  and return the certain_item if yes.
+  
+  @detail
+  
+  Equality won't necessarily hold if:
+   - The used index covers only part of the @field.
+     Suppose we have a CHAR(5) field and INDEX(field(3)). If you make a lookup
+     for 'abc', you will get records with both 'abc' and 'abcde'.
+   - The type of access is actually ref_or_null, and so @field can be either 
+     a value or NULL.
+
+  @return 
+    Item that the field will be equal to
+    NULL if no such item 
+*/
+
 static Item *
 part_of_refkey(TABLE *table,Field *field)
 {
-  if (!table->reginfo.join_tab)
+  JOIN_TAB *join_tab= table->reginfo.join_tab;
+  if (!join_tab)
     return (Item*) 0;             // field from outer non-select (UPDATE,...)
 
-  uint ref_parts=table->reginfo.join_tab->ref.key_parts;
-  if (ref_parts)
+  uint ref_parts= join_tab->ref.key_parts;
+  if (ref_parts) /* if it's ref/eq_ref/ref_or_null */
   {
-    KEY_PART_INFO *key_part=
-      table->key_info[table->reginfo.join_tab->ref.key].key_part;
+    uint key= join_tab->ref.key;
+    KEY *key_info= join_tab->get_keyinfo_by_key_no(key);
+    KEY_PART_INFO *key_part= key_info->key_part;
 
     for (uint part=0 ; part < ref_parts ; part++,key_part++)
-      if (field->eq(key_part->field) &&
-	  !(key_part->key_part_flag & (HA_PART_KEY_SEG | HA_NULL_PART)))
-	return table->reginfo.join_tab->ref.items[part];
+    {
+      if (field->eq(key_part->field))
+      {
+        /*
+          Found the field in the key. Check that 
+           1. ref_or_null doesn't alternate this component between a value and
+              a NULL
+           2. the index key part covers the whole field (not just a prefix)
+        */
+        if (part != join_tab->ref.null_ref_part &&            // (1)
+            !(key_part->key_part_flag & HA_PART_KEY_SEG))     // (2)
+        {
+          return join_tab->ref.items[part];
+        }
+        break;
+      }
+    }
   }
   return (Item*) 0;
 }
@@ -15519,8 +17494,6 @@ static bool
 list_contains_unique_index(TABLE *table,
                           bool (*find_func) (Field *, void *), void *data)
 {
-  if (table->pos_in_table_list->outer_join)
-    return 0;
   for (uint keynr= 0; keynr < table->s->keys; keynr++)
   {
     if (keynr == table->s->primary_key ||
@@ -15534,7 +17507,7 @@ list_contains_unique_index(TABLE *table,
            key_part < key_part_end;
            key_part++)
       {
-        if (key_part->field->real_maybe_null() || 
+        if (key_part->field->maybe_null() ||
             !find_func(key_part->field, data))
           break;
       }
@@ -15646,8 +17619,10 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
   SQL_SELECT *select=tab->select;
   key_map usable_keys;
   QUICK_SELECT_I *save_quick= select ? select->quick : 0;
-  COND *orig_select_cond= 0;
+  Item *orig_cond= 0;
+  bool orig_cond_saved= false;
   int best_key= -1;
+  bool changed_key= false;
   DBUG_ENTER("test_if_skip_sort_order");
 
   /*
@@ -15666,7 +17641,7 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
     }
     usable_keys.intersect(((Item_field*) item)->field->part_of_sortkey);
     if (usable_keys.is_clear_all())
-      goto use_filesort;					// No usable keys
+      goto use_filesort;                        // No usable keys
   }
 
   ref_key= -1;
@@ -15687,7 +17662,8 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
       by clustered PK values.
     */
   
-    if (quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE || 
+    if (quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE ||
+        quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_INTERSECT || 
         quick_type == QUICK_SELECT_I::QS_TYPE_ROR_UNION || 
         quick_type == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT)
       goto use_filesort;
@@ -15713,14 +17689,14 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
       if (table->covering_keys.is_set(ref_key))
 	usable_keys.intersect(table->covering_keys);
       if (tab->pre_idx_push_select_cond)
-        orig_select_cond= tab->set_cond(tab->pre_idx_push_select_cond);
+      {
+        orig_cond= tab->set_cond(tab->pre_idx_push_select_cond);
+        orig_cond_saved= true;
+      }
 
       if ((new_ref_key= test_if_subkey(order, table, ref_key, ref_key_parts,
 				       &usable_keys)) < MAX_KEY)
       {
-	/* Found key that can be used to retrieve data in sorted order */
-        //psergey-mrr:if (tab->pre_idx_push_select_cond)
-        //  tab->select_cond= tab->select->cond= tab->pre_idx_push_select_cond;
 	if (tab->ref.key >= 0)
 	{
           /*
@@ -15747,26 +17723,41 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
             The range optimizer constructed QUICK_RANGE for ref_key, and
             we want to use instead new_ref_key as the index. We can't
             just change the index of the quick select, because this may
-            result in an incosistent QUICK_SELECT object. Below we
+            result in an inconsistent QUICK_SELECT object. Below we
             create a new QUICK_SELECT from scratch so that all its
-            parameres are set correctly by the range optimizer.
+            parameters are set correctly by the range optimizer.
            */
           key_map new_ref_key_map;
+          COND *save_cond;
+          bool res;
           new_ref_key_map.clear_all();  // Force the creation of quick select
           new_ref_key_map.set_bit(new_ref_key); // only for new_ref_key.
 
+          /* Reset quick;  This will be restored in 'use_filesort' if needed */
           select->quick= 0;
-          if (select->test_quick_select(tab->join->thd, new_ref_key_map, 0,
-                                        (tab->join->select_options &
-                                         OPTION_FOUND_ROWS) ?
-                                        HA_POS_ERROR :
-                                        tab->join->unit->select_limit_cnt,0,
-                                        TRUE) <=
-              0)
+          save_cond= select->cond;
+          if (select->pre_idx_push_select_cond)
+            select->cond= select->pre_idx_push_select_cond;
+          res= select->test_quick_select(tab->join->thd, new_ref_key_map, 0,
+                                         (tab->join->select_options &
+                                          OPTION_FOUND_ROWS) ?
+                                         HA_POS_ERROR :
+                                         tab->join->unit->select_limit_cnt,0,
+                                         TRUE) <= 0;
+          if (res)
+          {
+            select->cond= save_cond;
             goto use_filesort;
+          }
+          /*
+            We don't restore select->cond as we want to use the
+            original condition as index condition pushdown is not
+            active for the new index.
+          */
 	}
         ref_key= new_ref_key;
-      }
+        changed_key= true;
+     }
     }
     /* Check if we get the rows in requested sorted order by using the key */
     if (usable_keys.is_set(ref_key) &&
@@ -15790,48 +17781,45 @@ test_if_skip_sort_order(JOIN_TAB *tab,ORDER *order,ha_rows select_limit,
     /*
       filesort() and join cache are usually faster than reading in 
       index order and not using join cache, except in case that chosen
-      index is clustered primary key.
+      index is clustered key.
     */
-    if ((select_limit >= table_records) &&
-        (tab->type == JT_ALL &&
-         tab->join->tables > tab->join->const_tables + 1) &&
-         ((unsigned) best_key != table->s->primary_key ||
-          !table->file->primary_key_is_clustered()))
+    if (best_key < 0 ||
+        ((select_limit >= table_records) &&
+         (tab->type == JT_ALL &&
+         tab->join->table_count > tab->join->const_tables + 1) &&
+         !(table->file->index_flags(best_key, 0, 1) & HA_CLUSTERED_INDEX)))
       goto use_filesort;
 
-    if (best_key >= 0)
+    if (table->quick_keys.is_set(best_key) && best_key != ref_key)
     {
-      if (table->quick_keys.is_set(best_key) && best_key != ref_key)
-      {
-        key_map map;
-        map.clear_all();       // Force the creation of quick select
-        map.set_bit(best_key); // only best_key.
-        select->quick= 0;
-        select->test_quick_select(join->thd, map, 0,
-                                  join->select_options & OPTION_FOUND_ROWS ?
-                                  HA_POS_ERROR :
-                                  join->unit->select_limit_cnt,
-                                  TRUE, FALSE);
-      }
-      order_direction= best_key_direction;
-      /*
-        saved_best_key_parts is actual number of used keyparts found by the
-        test_if_order_by_key function. It could differ from keyinfo->key_parts,
-        thus we have to restore it in case of desc order as it affects
-        QUICK_SELECT_DESC behaviour.
-      */
-      used_key_parts= (order_direction == -1) ?
-        saved_best_key_parts :  best_key_parts;
+      key_map map;
+      map.clear_all();       // Force the creation of quick select
+      map.set_bit(best_key); // only best_key.
+      select->quick= 0;
+      select->test_quick_select(join->thd, map, 0,
+                                join->select_options & OPTION_FOUND_ROWS ?
+                                HA_POS_ERROR :
+                                join->unit->select_limit_cnt,
+                                TRUE, FALSE);
     }
-    else
-      goto use_filesort;
-  } 
+    order_direction= best_key_direction;
+    /*
+      saved_best_key_parts is actual number of used keyparts found by the
+      test_if_order_by_key function. It could differ from keyinfo->key_parts,
+      thus we have to restore it in case of desc order as it affects
+      QUICK_SELECT_DESC behaviour.
+    */
+    used_key_parts= (order_direction == -1) ?
+      saved_best_key_parts :  best_key_parts;
+    changed_key= true;
+  }
 
 check_reverse_order:                  
   DBUG_ASSERT(order_direction != 0);
 
   if (order_direction == -1)		// If ORDER BY ... DESC
   {
+    int quick_type;
     if (select && select->quick)
     {
       /*
@@ -15840,25 +17828,23 @@ check_reverse_order:
       */
       if (select->quick->reverse_sorted())
         goto skipped_filesort;
-      else
+
+      quick_type= select->quick->get_type();
+      if (quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE ||
+          quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_INTERSECT ||
+          quick_type == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT ||
+          quick_type == QUICK_SELECT_I::QS_TYPE_ROR_UNION ||
+          quick_type == QUICK_SELECT_I::QS_TYPE_GROUP_MIN_MAX)
       {
-        int quick_type= select->quick->get_type();
-        if (quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE ||
-            quick_type == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT ||
-            quick_type == QUICK_SELECT_I::QS_TYPE_ROR_UNION ||
-            quick_type == QUICK_SELECT_I::QS_TYPE_GROUP_MIN_MAX)
-        {
-          tab->limit= 0;
-          goto use_filesort;               // Use filesort
-        }
+        tab->limit= 0;
+        goto use_filesort;               // Use filesort
       }
     }
   }
 
   /*
-    Update query plan with access pattern for doing 
-    ordered access according to what we have decided
-    above.
+    Update query plan with access pattern for doing ordered access
+    according to what we have decided above.
   */
   if (!no_changes) // We are allowed to update QEP
   {
@@ -15887,22 +17873,20 @@ check_reverse_order:
           table->enable_keyread();
         if (tab->pre_idx_push_select_cond)
         {
-          COND *tmp_cond= tab->pre_idx_push_select_cond;
-          if (orig_select_cond)
-          {
-            tmp_cond= and_conds(tmp_cond, orig_select_cond);
-            tmp_cond->quick_fix_field();
-          }
-          tab->set_cond(tmp_cond);
-          /* orig_select_cond was merged, no need to restore original one. */
-          orig_select_cond= 0;
+          tab->set_cond(tab->pre_idx_push_select_cond);
+          /*
+            orig_cond is a part of pre_idx_push_cond,
+            no need to restore it.
+          */
+          orig_cond= 0;
+          orig_cond_saved= false;
         }
         table->file->ha_index_or_rnd_end();
         if (tab->join->select_options & SELECT_DESCRIBE)
         {
           tab->ref.key= -1;
           tab->ref.key_parts= 0;
-          if (select_limit < table->file->stats.records) 
+          if (select_limit < table->file->stats.records)
             tab->limit= select_limit;
         }
       }
@@ -15921,6 +17905,14 @@ check_reverse_order:
         tab->read_first_record= join_init_read_record;
         if (tab->is_using_loose_index_scan())
           tab->join->tmp_table_param.precomputed_group_by= TRUE;
+
+        /*
+          Restore the original condition as changes done by pushdown
+          condition are not relevant anymore
+        */
+        if (tab->select && tab->select->pre_idx_push_select_cond)
+          tab->set_cond(tab->select->pre_idx_push_select_cond);
+
         /*
           TODO: update the number of records in join->best_positions[tablenr]
         */
@@ -15973,8 +17965,14 @@ skipped_filesort:
     delete save_quick;
     save_quick= NULL;
   }
-  if (orig_select_cond)
-    tab->set_cond(orig_select_cond);
+          /*
+            orig_cond is a part of pre_idx_push_cond,
+            no need to restore it.
+          */
+          orig_cond= 0;
+          orig_cond_saved= false;
+  if (orig_cond_saved && !changed_key)
+    tab->set_cond(orig_cond);
   DBUG_RETURN(1);
 
 use_filesort:
@@ -15984,8 +17982,8 @@ use_filesort:
     delete select->quick;
     select->quick= save_quick;
   }
-  if (orig_select_cond)
-    tab->set_cond(orig_select_cond);
+  if (orig_cond_saved)
+    tab->set_cond(orig_cond);
   DBUG_RETURN(0);
 }
 
@@ -16031,12 +18029,15 @@ create_sort_index(THD *thd, JOIN *join, ORDER *order,
   JOIN_TAB *tab;
   DBUG_ENTER("create_sort_index");
 
-  if (join->tables == join->const_tables)
+  if (join->table_count == join->const_tables)
     DBUG_RETURN(0);				// One row, no need to sort
   tab=    join->join_tab + join->const_tables;
   table=  tab->table;
   select= tab->select;
 
+  /* Currently ORDER BY ... LIMIT is not supported in subqueries. */
+  DBUG_ASSERT(join->group_list || !join->is_in_subquery());
+
   /*
     When there is SQL_BIG_RESULT do not sort using index for GROUP BY,
     and thus force sorting on disk unless a group min-max optimization
@@ -16096,6 +18097,8 @@ create_sort_index(THD *thd, JOIN *join, ORDER *order,
       get_schema_tables_result(join, PROCESSED_BY_CREATE_SORT_INDEX))
     goto err;
 
+  if (!tab->preread_init_done && tab->preread_init())
+    goto err;
   if (table->s->tmp_table)
     table->file->info(HA_STATUS_VARIABLE);	// Get record count
   table->sort.found_records=filesort(thd, table,join->sortorder, length,
@@ -16121,6 +18124,7 @@ create_sort_index(THD *thd, JOIN *join, ORDER *order,
     select->cleanup();				// filesort did select
     tab->select= 0;
     table->quick_keys.clear_all();  // as far as we cleanup select->quick
+    table->intersect_keys.clear_all();
     table->sort.io_cache= tablesort_result_cache;
   }
   tab->set_select_cond(NULL, __LINE__);
@@ -16612,7 +18616,7 @@ find_order_in_list(THD *thd, Item **ref_pointer_array, TABLE_LIST *tables,
     order->in_field_list= 1;
     order->counter= count;
     order->counter_used= 1;
-    return FALSE;
+   return FALSE;
   }
   /* Lookup the current GROUP/ORDER field in the SELECT clause. */
   select_item= find_item_in_list(order_item, fields, &counter,
@@ -17057,8 +19061,10 @@ test_if_subpart(ORDER *a,ORDER *b)
 */
 
 static TABLE *
-get_sort_by_table(ORDER *a,ORDER *b,TABLE_LIST *tables)
+get_sort_by_table(ORDER *a,ORDER *b, List<TABLE_LIST> &tables)
 {
+  TABLE_LIST *table;
+  List_iterator<TABLE_LIST> ti(tables);
   table_map map= (table_map) 0;
   DBUG_ENTER("get_sort_by_table");
 
@@ -17076,11 +19082,11 @@ get_sort_by_table(ORDER *a,ORDER *b,TABLE_LIST *tables)
   if (!map || (map & (RAND_TABLE_BIT | OUTER_REF_TABLE_BIT)))
     DBUG_RETURN(0);
 
-  for (; !(map & tables->table->map); tables= tables->next_leaf) ;
-  if (map != tables->table->map)
+  while ((table= ti++) && !(map & table->table->map)) ;
+  if (map != table->table->map)
     DBUG_RETURN(0);				// More than one table
-  DBUG_PRINT("exit",("sort by table: %d",tables->table->tablenr));
-  DBUG_RETURN(tables->table);
+  DBUG_PRINT("exit",("sort by table: %d",table->table->tablenr));
+  DBUG_RETURN(table->table);
 }
 
 
@@ -17397,9 +19403,7 @@ setup_copy_fields(THD *thd, TMP_TABLE_PARAM *param,
       }
     }
     else if ((real_pos->type() == Item::FUNC_ITEM ||
-	      real_pos->type() == Item::SUBSELECT_ITEM ||
-              (real_pos->get_cached_item() &&
-               real_pos->get_cached_item()->type() == Item::SUBSELECT_ITEM) ||
+	      real_pos->real_type() == Item::SUBSELECT_ITEM ||
 	      real_pos->type() == Item::CACHE_ITEM ||
 	      real_pos->type() == Item::COND_ITEM) &&
 	     !real_pos->with_sum_func)
@@ -17642,6 +19646,7 @@ change_to_use_tmp_fields(THD *thd, Item **ref_pointer_array,
 	  char buff[256];
 	  String str(buff,sizeof(buff),&my_charset_bin);
 	  str.length(0);
+          str.extra_allocation(1024);
 	  item->print(&str, QT_ORDINARY);
 	  item_field->name= sql_strmake(str.ptr(),str.length());
 	}
@@ -17888,6 +19893,13 @@ static bool add_ref_to_table_cond(THD *thd, JOIN_TAB *join_tab)
     if (join_tab->select->cond)
       error=(int) cond->add(join_tab->select->cond);
     join_tab->select->cond= cond;
+    if (join_tab->select->pre_idx_push_select_cond)
+    {
+      Item *new_cond= and_conds(join_tab->select->pre_idx_push_select_cond, cond);
+      if (!new_cond->fixed && new_cond->fix_fields(thd, &new_cond))
+        error= 1;
+      join_tab->select->pre_idx_push_select_cond= new_cond;
+    }
     join_tab->set_select_cond(cond, __LINE__);
   }
   else if ((join_tab->select= make_select(join_tab->table, 0, 0, cond, 0,
@@ -17995,6 +20007,7 @@ static bool change_group_ref(THD *thd, Item_func *expr, ORDER *group_list,
     if (arg_changed)
     {
       expr->maybe_null= 1;
+      expr->in_rollup= 1;
       *changed= TRUE;
     }
   }
@@ -18058,6 +20071,7 @@ bool JOIN::rollup_init()
       if (*group_tmp->item == item)
       {
         item->maybe_null= 1;
+        item->in_rollup= 1;
         found_in_group= 1;
         break;
       }
@@ -18288,6 +20302,7 @@ int JOIN::rollup_send_data(uint idx)
   uint i;
   for (i= send_group_parts ; i-- > idx ; )
   {
+    int res= 0;
     /* Get reference pointers to sum functions in place */
     memcpy((char*) ref_pointer_array,
 	   (char*) rollup.ref_pointer_arrays[i],
@@ -18295,9 +20310,10 @@ int JOIN::rollup_send_data(uint idx)
     if ((!having || having->val_int()))
     {
       if (send_records < unit->select_limit_cnt && do_send_rows &&
-	  result->send_data(rollup.fields[i]))
+	  (res= result->send_data(rollup.fields[i])) > 0)
 	return 1;
-      send_records++;
+      if (!res)
+        send_records++;
     }
   }
   /* Restore ref_pointer_array */
@@ -18345,7 +20361,7 @@ int JOIN::rollup_write_data(uint idx, TABLE *table_arg)
           item->save_in_result_field(1);
       }
       copy_sum_funcs(sum_funcs_end[i+1], sum_funcs_end[i]);
-      if ((write_error= table_arg->file->ha_write_row(table_arg->record[0])))
+      if ((write_error= table_arg->file->ha_write_tmp_row(table_arg->record[0])))
       {
 	if (create_internal_tmp_table_from_heap(thd, table_arg, 
                                                 tmp_table_param.start_recinfo,
@@ -18493,24 +20509,28 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
     if (result->send_data(item_list))
       join->error= 1;
   }
-  else
+  else if (!join->select_lex->master_unit()->derived ||
+           join->select_lex->master_unit()->derived->is_materialized_derived())
   {
     table_map used_tables=0;
 
-    uchar sjm_nests[MAX_TABLES];
-    uint sjm_nests_cur=0;
-    uint sjm_nests_end= 0;
-    uint end_table= join->tables;
     bool printing_materialize_nest= FALSE;
     uint select_id= join->select_lex->select_number;
 
-    for (uint i=0 ; i < end_table ; i++)
+    for (JOIN_TAB *tab= first_breadth_first_tab(join); tab;
+         tab= next_breadth_first_tab(join, tab))
     {
-      JOIN_TAB *tab=join->join_tab+i;
+      if (tab->bush_root_tab)
+      {
+        JOIN_TAB *first_sibling= tab->bush_root_tab->bush_children->start;
+        select_id= first_sibling->emb_sj_nest->sj_subq_pred->get_identifier();
+        printing_materialize_nest= TRUE;
+      }
+
       TABLE *table=tab->table;
       TABLE_LIST *table_list= tab->table->pos_in_table_list;
       char buff[512]; 
-      char buff1[512], buff2[512], buff3[512];
+      char buff1[512], buff2[512], buff3[512], buff4[512];
       char keylen_str_buf[64];
       my_bool key_read;
       String extra(buff, sizeof(buff),cs);
@@ -18518,10 +20538,17 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
       String tmp1(buff1,sizeof(buff1),cs);
       String tmp2(buff2,sizeof(buff2),cs);
       String tmp3(buff3,sizeof(buff3),cs);
+      String tmp4(buff4,sizeof(buff4),cs);
+      char hash_key_prefix[]= "#hash#";
+      KEY *key_info= 0;
+      uint key_len= 0;
+      bool is_hj= tab->type == JT_HASH || tab->type ==JT_HASH_NEXT;
+
       extra.length(0);
       tmp1.length(0);
       tmp2.length(0);
       tmp3.length(0);
+      tmp4.length(0);
       quick_type= -1;
 
       /* Don't show eliminated tables */
@@ -18539,93 +20566,17 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
                                                     join->select_lex->type;
       item_list.push_back(new Item_string(stype, strlen(stype), cs));
       
-      /* 
-        Special processing for SJ-Materialization nests: print the fake table
-        and delay printing of the SJM nest contents until later.
-      */
-      uint sj_strategy= join->best_positions[i].sj_strategy;
-      if (sj_is_materialize_strategy(sj_strategy) &&
-          !printing_materialize_nest)
-      {
-        /* table */
-        int len= my_snprintf(table_name_buffer, 
-                             sizeof(table_name_buffer)-1,
-                             "subselect%d", 
-                             tab->emb_sj_nest->sj_subq_pred->get_identifier());
-	item_list.push_back(new Item_string(table_name_buffer, len, cs));
-        /* partitions */
-        if (join->thd->lex->describe & DESCRIBE_PARTITIONS)
-          item_list.push_back(item_null);
-        /* type */
-        uint type= (sj_strategy == SJ_OPT_MATERIALIZE_SCAN)? JT_ALL : JT_EQ_REF;
-        item_list.push_back(new Item_string(join_type_str[type],
-                                            strlen(join_type_str[type]),
-                                            cs));
-        /* possible_keys */
-	item_list.push_back(new Item_string("unique_key", 
-                                            strlen("unique_key"), cs));
-        if (sj_strategy == SJ_OPT_MATERIALIZE_SCAN)
-        {
-          item_list.push_back(item_null); /* key */
-          item_list.push_back(item_null); /* key_len */
-          item_list.push_back(item_null); /* ref */
-        }
-        else
-        {
-          /* key */
-          item_list.push_back(new Item_string("unique_key", strlen("unique_key"), cs));
-          /* key_len */
-          uint klen= tab->emb_sj_nest->sj_mat_info->table->key_info[0].key_length;
-          uint buflen= longlong10_to_str(klen, keylen_str_buf, 10) - keylen_str_buf;
-          item_list.push_back(new Item_string(keylen_str_buf, buflen, cs));
-          /* ref */
-          item_list.push_back(new Item_string("func", strlen("func"), cs));
-        }
-        /* rows */
-        ha_rows rows= (sj_strategy == SJ_OPT_MATERIALIZE_SCAN)?
-                       tab->emb_sj_nest->sj_mat_info->rows : 1;
-        item_list.push_back(new Item_int((longlong)rows, 
-                                         MY_INT64_NUM_DECIMAL_DIGITS));
-        /* filtered */
-        if (join->thd->lex->describe & DESCRIBE_EXTENDED)
-          item_list.push_back(new Item_float(1.0, 2));
-        
-        /* Extra */
-	if (need_tmp_table)
-	{
-	  need_tmp_table=0;
-	  extra.append(STRING_WITH_LEN("; Using temporary"));
-	}
-	if (need_order)
-	{
-	  need_order=0;
-	  extra.append(STRING_WITH_LEN("; Using filesort"));
-	}
-        /* Skip initial "; "*/
-        const char *str= extra.ptr();
-        uint32 extra_len= extra.length();
-        if (extra_len)
-        {
-          str += 2;
-          extra_len -= 2;
-        }
-	item_list.push_back(new Item_string(str, extra_len, cs));
-
-        /* Register the nest for further processing: */
-        sjm_nests[sjm_nests_end++]= i;
-        i += join->best_positions[i].n_sj_tables-1;
-        goto loop_end;
-      }
-
-      if (tab->type == JT_ALL && tab->select && tab->select->quick)
+      if ((tab->type == JT_ALL || tab->type == JT_HASH) &&
+           tab->select && tab->select->quick)
       {
         quick_type= tab->select->quick->get_type();
         if ((quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE) ||
+            (quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_INTERSECT) ||
             (quick_type == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT) ||
             (quick_type == QUICK_SELECT_I::QS_TYPE_ROR_UNION))
-          tab->type = JT_INDEX_MERGE;
+          tab->type= tab->type == JT_ALL ? JT_INDEX_MERGE : JT_HASH_INDEX_MERGE;
         else
-	  tab->type = JT_RANGE;
+	  tab->type= tab->type == JT_ALL ? JT_RANGE : JT_HASH_RANGE;
       }
 
       /* table */
@@ -18637,6 +20588,16 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
 			     table->derived_select_number);
 	item_list.push_back(new Item_string(table_name_buffer, len, cs));
       }
+      else if (tab->bush_children)
+      {
+        JOIN_TAB *ctab= tab->bush_children->start;
+        /* table */
+        int len= my_snprintf(table_name_buffer, 
+                             sizeof(table_name_buffer)-1,
+                             "<subquery%d>", 
+                             ctab->emb_sj_nest->sj_subq_pred->get_identifier());
+	item_list.push_back(new Item_string(table_name_buffer, len, cs));
+      }
       else
       {
         TABLE_LIST *real_table= table->pos_in_table_list; 
@@ -18689,49 +20650,72 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
 	item_list.push_back(item_null);
 
       /* Build "key", "key_len", and "ref" values and add them to item_list */
-      if (tab->ref.key_parts)
+      if (tab->type == JT_NEXT)
+      {
+	key_info= table->key_info+tab->index;
+        key_len= key_info->key_length;
+      }
+      else if (tab->ref.key_parts)
+      {
+	key_info= tab->get_keyinfo_by_key_no(tab->ref.key);
+        key_len= tab->ref.key_length;
+      }
+      if (key_info)
       {
-	KEY *key_info=table->key_info+ tab->ref.key;
         register uint length;
-	item_list.push_back(new Item_string(key_info->name,
-					    strlen(key_info->name),
-					    system_charset_info));
-        length= (longlong10_to_str(tab->ref.key_length, keylen_str_buf, 10) - 
+        if (is_hj)
+          tmp2.append(hash_key_prefix, strlen(hash_key_prefix), cs);
+        tmp2.append(key_info->name,  strlen(key_info->name), cs);
+        length= (longlong10_to_str(key_len, keylen_str_buf, 10) - 
                  keylen_str_buf);
-        item_list.push_back(new Item_string(keylen_str_buf, length,
-                                            system_charset_info));
-	for (store_key **ref=tab->ref.key_copy ; *ref ; ref++)
+        tmp3.append(keylen_str_buf, length, cs);
+        if (tab->ref.key_parts)
 	{
-	  if (tmp2.length())
-	    tmp2.append(',');
-	  tmp2.append((*ref)->name(), strlen((*ref)->name()),
-		      system_charset_info);
-	}
-	item_list.push_back(new Item_string(tmp2.ptr(),tmp2.length(),cs));
+	  for (store_key **ref=tab->ref.key_copy ; *ref ; ref++)
+	  {
+	    if (tmp4.length())
+	      tmp4.append(',');
+	    tmp4.append((*ref)->name(), strlen((*ref)->name()), cs);
+          }
+        }
       }
-      else if (tab->type == JT_NEXT)
+      if (is_hj && tab->type != JT_HASH)
+      {
+        tmp2.append(':');
+        tmp3.append(':');
+      }
+      if (tab->type == JT_HASH_NEXT)
       {
-	KEY *key_info=table->key_info+ tab->index;
         register uint length;
-	item_list.push_back(new Item_string(key_info->name,
-					    strlen(key_info->name),cs));
-        length= (longlong10_to_str(key_info->key_length, keylen_str_buf, 10) - 
+	key_info= table->key_info+tab->index;
+        key_len= key_info->key_length;
+        tmp2.append(key_info->name,  strlen(key_info->name), cs);
+        length= (longlong10_to_str(key_len, keylen_str_buf, 10) - 
                  keylen_str_buf);
-        item_list.push_back(new Item_string(keylen_str_buf, 
-                                            length,
-                                            system_charset_info));
-	item_list.push_back(item_null);
+        tmp3.append(keylen_str_buf, length, cs);
       }
-      else if (tab->select && tab->select->quick)
-      {
+      if ((is_hj || tab->type==JT_RANGE || tab->type == JT_INDEX_MERGE) &&
+          tab->select && tab->select->quick)
         tab->select->quick->add_keys_and_lengths(&tmp2, &tmp3);
-	item_list.push_back(new Item_string(tmp2.ptr(),tmp2.length(),cs));
-	item_list.push_back(new Item_string(tmp3.ptr(),tmp3.length(),cs));
-	item_list.push_back(item_null);
+      if (key_info || (tab->select && tab->select->quick))
+      {
+        if (tmp2.length())
+          item_list.push_back(new Item_string(tmp2.ptr(),tmp2.length(),cs));
+        else
+          item_list.push_back(item_null);
+        if (tmp3.length())
+          item_list.push_back(new Item_string(tmp3.ptr(),tmp3.length(),cs));
+        else
+          item_list.push_back(item_null);
+        if (key_info && tab->type != JT_NEXT)
+          item_list.push_back(new Item_string(tmp4.ptr(),tmp4.length(),cs));
+        else
+          item_list.push_back(item_null);
       }
       else
       {
-        if (table_list->schema_table &&
+        if (table_list && /* SJM bushes don't have table_list */
+            table_list->schema_table &&
             table_list->schema_table->i_s_requested_object & OPTIMIZE_I_S_TABLE)
         {
           const char *tmp_buff;
@@ -18762,7 +20746,8 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
       }
       
       /* Add "rows" field to item_list. */
-      if (table_list->schema_table)
+      if (table_list /* SJM bushes don't have table_list */ &&
+          table_list->schema_table)
       {
         /* in_rows */
         if (join->thd->lex->describe & DESCRIBE_EXTENDED)
@@ -18775,18 +20760,28 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
         ha_rows examined_rows;
         if (tab->select && tab->select->quick)
           examined_rows= tab->select->quick->records;
-        else if (tab->type == JT_NEXT || tab->type == JT_ALL)
+        else if (tab->type == JT_NEXT || tab->type == JT_ALL || is_hj)
         {
           if (tab->limit)
             examined_rows= tab->limit;
           else
           {
-            tab->table->file->info(HA_STATUS_VARIABLE);
-            examined_rows= tab->table->file->stats.records;
+            if (tab->table->is_filled_at_execution())
+            {
+              examined_rows= tab->records;
+            }
+            else
+            {
+              /*
+                handler->info(HA_STATUS_VARIABLE) has been called in
+                make_join_statistics()
+              */
+              examined_rows= tab->table->file->stats.records;
+            }
           }
         }
         else
-          examined_rows=(ha_rows)join->best_positions[i].records_read; 
+          examined_rows=(ha_rows)tab->records_read; 
  
         item_list.push_back(new Item_int((longlong) (ulonglong) examined_rows, 
                                          MY_INT64_NUM_DECIMAL_DIGITS));
@@ -18794,21 +20789,10 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
         /* Add "filtered" field to item_list. */
         if (join->thd->lex->describe & DESCRIBE_EXTENDED)
         {
-          /*
-            psergey-todo: 
-              in the code above, we cast to integer when asssigning to
-              examined_rows. 
-              In the code below, we may divide original value but result of
-              conversion of the same value to integer, which may produce a
-              value that's greater than 100%, which looks very odd.
-              I'm not fixing this right away because that might trigger a wave
-              of small EXPLAIN EXTENDED output changes, which I don't have time
-              to deal with right now.
-          */
           float f= 0.0; 
           if (examined_rows)
-            f= (float) (100.0 * join->best_positions[i].records_read /
-                        examined_rows);
+            f= (float) (100.0 * tab->records_read / examined_rows);
+ 	  set_if_smaller(f, 100.0);
           item_list.push_back(new Item_float(f, 2));
         }
       }
@@ -18858,6 +20842,7 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
 
         if (quick_type == QUICK_SELECT_I::QS_TYPE_ROR_UNION || 
             quick_type == QUICK_SELECT_I::QS_TYPE_ROR_INTERSECT ||
+            quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_INTERSECT ||
             quick_type == QUICK_SELECT_I::QS_TYPE_INDEX_MERGE)
         {
           extra.append(STRING_WITH_LEN("; Using "));
@@ -18893,7 +20878,8 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
               extra.append(STRING_WITH_LEN("; Using where"));
           }
 	}
-        if (table_list->schema_table &&
+        if (table_list /* SJM bushes don't have table_list */ &&
+            table_list->schema_table &&
             table_list->schema_table->i_s_requested_object & OPTIMIZE_I_S_TABLE)
         {
           if (!table_list->table_open_method)
@@ -18926,12 +20912,30 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
 	if (table->reginfo.not_exists_optimize)
 	  extra.append(STRING_WITH_LEN("; Not exists"));
 
+        /*
         if (quick_type == QUICK_SELECT_I::QS_TYPE_RANGE &&
             !(((QUICK_RANGE_SELECT*)(tab->select->quick))->mrr_flags &
              HA_MRR_USE_DEFAULT_IMPL))
         {
 	  extra.append(STRING_WITH_LEN("; Using MRR"));
         }
+        */
+        if (quick_type == QUICK_SELECT_I::QS_TYPE_RANGE)
+        {
+          char mrr_str_buf[128];
+          mrr_str_buf[0]=0;
+          int len;
+          uint mrr_flags= 
+            ((QUICK_RANGE_SELECT*)(tab->select->quick))->mrr_flags;
+          len= table->file->multi_range_read_explain_info(mrr_flags,
+                                                          mrr_str_buf,
+                                                          sizeof(mrr_str_buf));
+          if (len > 0)
+          {
+            extra.append(STRING_WITH_LEN("; "));
+            extra.append(mrr_str_buf, len);
+          }
+        }
 
 	if (need_tmp_table)
 	{
@@ -18943,7 +20947,8 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
 	  need_order=0;
 	  extra.append(STRING_WITH_LEN("; Using filesort"));
 	}
-	if (distinct & test_all_bits(used_tables,thd->used_tables))
+	if (distinct & test_all_bits(used_tables,
+                                     join->select_list_used_tables))
 	  extra.append(STRING_WITH_LEN("; Distinct"));
         if (tab->loosescan_match_tab)
         {
@@ -18977,25 +20982,6 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
           }
         }
 
-        /*
-        if (sj_is_materialize_strategy(sj_strategy))
-        {
-          if (join->best_positions[i].n_sj_tables == 1)
-            extra.append(STRING_WITH_LEN("; Materialize"));
-          else
-          {
-            last_sjm_table= i + join->best_positions[i].n_sj_tables - 1;
-            extra.append(STRING_WITH_LEN("; Start materialize"));
-          }
-          if (sj_strategy == SJ_OPT_MATERIALIZE_SCAN)
-              extra.append(STRING_WITH_LEN("; Scan"));
-        }
-        else if (last_sjm_table == i)
-        {
-          extra.append(STRING_WITH_LEN("; End materialize"));
-        }
-        */
-
         for (uint part= 0; part < tab->ref.key_parts; part++)
         {
           if (tab->ref.cond_guards[part])
@@ -19005,8 +20991,11 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
           }
         }
 
-        if (i > 0 && tab[-1].next_select == sub_select_cache)
+        if (tab->cache)
+	{
           extra.append(STRING_WITH_LEN("; Using join buffer"));
+          tab->cache->print_explain_comment(&extra);
+        }
         
         /* Skip initial "; "*/
         const char *str= extra.ptr();
@@ -19018,14 +21007,6 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
         }
 	item_list.push_back(new Item_string(str, len, cs));
       }
-    loop_end:
-       if (i+1 == end_table && sjm_nests_cur != sjm_nests_end)
-       {
-         printing_materialize_nest= TRUE;
-         i= sjm_nests[sjm_nests_cur++] - 1;
-         end_table= (i+1) + join->best_positions[i+1].n_sj_tables;
-         select_id= join->join_tab[i+1].emb_sj_nest->sj_subq_pred->get_identifier();
-       }
       
       // For next iteration
       used_tables|=table->map;
@@ -19073,28 +21054,12 @@ bool mysql_explain_union(THD *thd, SELECT_LEX_UNIT *unit, select_result *result)
   bool res= 0;
   SELECT_LEX *first= unit->first_select();
 
-  for (SELECT_LEX *sl= first;
-       sl;
-       sl= sl->next_select())
-  {
-    // drop UNCACHEABLE_EXPLAIN, because it is for internal usage only
-    uint8 uncacheable= (sl->uncacheable & ~UNCACHEABLE_EXPLAIN);
-    sl->type= (((&thd->lex->select_lex)==sl)?
-	       (sl->first_inner_unit() || sl->next_select() ? 
-		"PRIMARY" : "SIMPLE"):
-	       ((sl == first)?
-		((sl->linkage == DERIVED_TABLE_TYPE) ?
-		 "DERIVED":
-		 ((uncacheable & UNCACHEABLE_DEPENDENT) ?
-		  "DEPENDENT SUBQUERY":
-		  (uncacheable?"UNCACHEABLE SUBQUERY":
-		   "SUBQUERY"))):
-		((uncacheable & UNCACHEABLE_DEPENDENT) ?
-		 "DEPENDENT UNION":
-		 uncacheable?"UNCACHEABLE UNION":
-		 "UNION")));
+  for (SELECT_LEX *sl= first; sl; sl= sl->next_select())
+  {
+    sl->set_explain_type();
     sl->options|= SELECT_DESCRIBE;
   }
+
   if (unit->is_union())
   {
     unit->fake_select_lex->select_number= UINT_MAX; // jost for initialization
@@ -19296,6 +21261,14 @@ void TABLE_LIST::print(THD *thd, table_map eliminated_tables, String *str,
     print_join(thd, eliminated_tables, str, &nested_join->join_list, query_type);
     str->append(')');
   }
+  else if (jtbm_subselect)
+  {
+    str->append(STRING_WITH_LEN(" <materialize> ("));
+    subselect_hash_sj_engine *hash_engine;
+    hash_engine= (subselect_hash_sj_engine*)jtbm_subselect->engine;
+    hash_engine->materialize_engine->print(str, query_type);
+    str->append(')');
+  }
   else
   {
     const char *cmp_name;                         // Name to compare with alias
@@ -19378,9 +21351,7 @@ void TABLE_LIST::print(THD *thd, table_map eliminated_tables, String *str,
 
 void st_select_lex::print(THD *thd, String *str, enum_query_type query_type)
 {
-  /* QQ: thd may not be set for sub queries, but this should be fixed */
-  if (!thd)
-    thd= current_thd;
+  DBUG_ASSERT(thd);
 
   str->append(STRING_WITH_LEN("select "));
 
@@ -19528,6 +21499,8 @@ bool JOIN::change_result(select_result *res)
 {
   DBUG_ENTER("JOIN::change_result");
   result= res;
+  if (tmp_join)
+    tmp_join->result= res;
   if (!procedure && (result->prepare(fields_list, select_lex->master_unit()) ||
                      result->prepare2()))
   {
@@ -19536,6 +21509,189 @@ bool JOIN::change_result(select_result *res)
   DBUG_RETURN(FALSE);
 }
 
+
+/**
+  @brief
+  Set allowed types of join caches that can be used for join operations
+
+  @details
+  The function sets a bitmap of allowed join buffers types in the field
+  allowed_join_cache_types of this JOIN structure:
+    bit 1 is set if the join buffers are allowed to be incremental
+    bit 2 is set if the join buffers are allowed to be hashed
+    bit 3 is set if the join buffers are allowed to be used for BKA
+  join algorithms.
+  The allowed types are read from system variables.
+  Besides, the function sets the maximum allowed join cache level that is
+  also read from a system variable.
+*/
+
+void JOIN::set_allowed_join_cache_types()
+{
+  allowed_join_cache_types= 0;
+  if (optimizer_flag(thd, OPTIMIZER_SWITCH_JOIN_CACHE_INCREMENTAL))
+    allowed_join_cache_types|= JOIN_CACHE_INCREMENTAL_BIT;
+  if (optimizer_flag(thd, OPTIMIZER_SWITCH_JOIN_CACHE_HASHED))
+    allowed_join_cache_types|= JOIN_CACHE_HASHED_BIT;
+  if (optimizer_flag(thd, OPTIMIZER_SWITCH_JOIN_CACHE_BKA))
+    allowed_join_cache_types|= JOIN_CACHE_BKA_BIT;
+  allowed_semijoin_with_cache=
+    optimizer_flag(thd, OPTIMIZER_SWITCH_SEMIJOIN_WITH_CACHE);
+  allowed_outer_join_with_cache=
+    optimizer_flag(thd, OPTIMIZER_SWITCH_OUTER_JOIN_WITH_CACHE);
+  max_allowed_join_cache_level= thd->variables.join_cache_level;
+}
+
+
+/**
+  Save a query execution plan so that the caller can revert to it if needed,
+  and reset the current query plan so that it can be reoptimized.
+
+  @param save_to  The object into which the current query plan state is saved
+*/
+
+void JOIN::save_query_plan(Join_plan_state *save_to)
+{
+  if (keyuse.elements)
+  {
+    DYNAMIC_ARRAY tmp_keyuse;
+    /* Swap the current and the backup keyuse internal arrays. */
+    tmp_keyuse= keyuse;
+    keyuse= save_to->keyuse; /* keyuse is reset to an empty array. */
+    save_to->keyuse= tmp_keyuse;
+
+    for (uint i= 0; i < table_count; i++)
+    {
+      save_to->join_tab_keyuse[i]= join_tab[i].keyuse;
+      join_tab[i].keyuse= NULL;
+      save_to->join_tab_checked_keys[i]= join_tab[i].checked_keys;
+      join_tab[i].checked_keys.clear_all();
+    }
+  }
+  memcpy((uchar*) save_to->best_positions, (uchar*) best_positions,
+         sizeof(POSITION) * (table_count + 1));
+  memset(best_positions, 0, sizeof(POSITION) * (table_count + 1));
+}
+
+
+/**
+  Restore a query execution plan previously saved by the caller.
+
+  @param restore_from  The object from which the query plan state is restored.
+*/
+
+void JOIN::restore_query_plan(Join_plan_state *restore_from)
+{
+  if (restore_from->keyuse.elements)
+  {
+    DYNAMIC_ARRAY tmp_keyuse;
+    tmp_keyuse= keyuse;
+    keyuse= restore_from->keyuse;
+    restore_from->keyuse= tmp_keyuse;
+
+    for (uint i= 0; i < table_count; i++)
+    {
+      join_tab[i].keyuse= restore_from->join_tab_keyuse[i];
+      join_tab[i].checked_keys= restore_from->join_tab_checked_keys[i];
+    }
+
+  }
+  memcpy((uchar*) best_positions, (uchar*) restore_from->best_positions,
+         sizeof(POSITION) * (table_count + 1));
+}
+
+
+/**
+  Reoptimize a query plan taking into account an additional conjunct to the
+  WHERE clause.
+
+  @param added_where  An extra conjunct to the WHERE clause to reoptimize with
+  @param join_tables  The set of tables to reoptimize
+  @param save_to      If != NULL, save here the state of the current query plan
+
+  @notes
+  Given a query plan that was already optimized taking into account some WHERE
+  clause 'C', reoptimize this plan with a new WHERE clause 'C AND added_where'.
+  The reoptimization works as follows:
+
+  1. Call update_ref_and_keys *only* for the new conditions 'added_where'
+     that are about to be injected into the query.
+  2. Expand if necessary the original KEYUSE array JOIN::keyuse to
+     accommodate the new REF accesses computed for the 'added_where' condition.
+  3. Add the new KEYUSEs into JOIN::keyuse.
+  4. Re-sort and re-filter the JOIN::keyuse array with the newly added
+     KEYUSE elements. 
+ 
+  @retval REOPT_NEW_PLAN  there is a new plan.
+  @retval REOPT_OLD_PLAN  no new improved plan was produced, use the old one.
+  @retval REOPT_ERROR     an irrecoverable error occurred during reoptimization.
+*/
+
+JOIN::enum_reopt_result
+JOIN::reoptimize(Item *added_where, table_map join_tables,
+                 Join_plan_state *save_to)
+{
+  DYNAMIC_ARRAY added_keyuse;
+  SARGABLE_PARAM *sargables= 0; /* Used only as a dummy parameter. */
+  uint org_keyuse_elements;
+
+  /* Re-run the REF optimizer to take into account the new conditions. */
+  if (update_ref_and_keys(thd, &added_keyuse, join_tab, table_count, added_where,
+                          ~outer_join, select_lex, &sargables))
+  {
+    delete_dynamic(&added_keyuse);
+    return REOPT_ERROR;
+  }
+
+  if (!added_keyuse.elements)
+  {
+    delete_dynamic(&added_keyuse);
+    return REOPT_OLD_PLAN;
+  }
+
+  if (save_to)
+    save_query_plan(save_to);
+
+  if (!keyuse.buffer &&
+      my_init_dynamic_array(&keyuse, sizeof(KEYUSE), 20, 64))
+  {
+    delete_dynamic(&added_keyuse);
+    return REOPT_ERROR;
+  }
+
+  org_keyuse_elements= save_to ? save_to->keyuse.elements : keyuse.elements;
+  allocate_dynamic(&keyuse, org_keyuse_elements + added_keyuse.elements);
+
+  /* If needed, add the access methods from the original query plan. */
+  if (save_to)
+  {
+    DBUG_ASSERT(!keyuse.elements);
+    memcpy(keyuse.buffer,
+           save_to->keyuse.buffer,
+           (size_t) save_to->keyuse.elements * keyuse.size_of_element);
+    keyuse.elements= save_to->keyuse.elements;
+  }
+
+  /* Add the new access methods to the keyuse array. */
+  memcpy(keyuse.buffer + keyuse.elements * keyuse.size_of_element,
+         added_keyuse.buffer,
+         (size_t) added_keyuse.elements * added_keyuse.size_of_element);
+  keyuse.elements+= added_keyuse.elements;
+  /* The contents of added_keyuse are copied, so it is no longer needed. */
+  delete_dynamic(&added_keyuse);
+
+  if (sort_and_filter_keyuse(thd, &keyuse, true))
+    return REOPT_ERROR;
+  optimize_keyuse(this, &keyuse);
+
+  /* Re-run the join optimizer to compute a new query plan. */
+  if (choose_plan(this, join_tables))
+    return REOPT_ERROR;
+
+  return REOPT_NEW_PLAN;
+}
+
+
 /**
   Cache constant expressions in WHERE, HAVING, ON conditions.
 */
@@ -19546,7 +21702,7 @@ void JOIN::cache_const_exprs()
   bool *analyzer_arg= &cache_flag;
 
   /* No need in cache if all tables are constant. */
-  if (const_tables == tables)
+  if (const_tables == table_count)
     return;
 
   if (conds)
@@ -19557,7 +21713,8 @@ void JOIN::cache_const_exprs()
     having->compile(&Item::cache_const_expr_analyzer, (uchar **)&analyzer_arg,
                     &Item::cache_const_expr_transformer, (uchar *)&cache_flag);
 
-  for (JOIN_TAB *tab= join_tab + const_tables; tab < join_tab + tables ; tab++)
+  for (JOIN_TAB *tab= first_depth_first_tab(this); tab;
+       tab= next_depth_first_tab(this, tab))
   {
     if (*tab->on_expr_ref)
     {
@@ -19603,7 +21760,7 @@ void JOIN::cache_const_exprs()
 static bool
 test_if_cheaper_ordering(const JOIN_TAB *tab, ORDER *order, TABLE *table,
                          key_map usable_keys,  int ref_key,
-                         ha_rows select_limit,
+                         ha_rows select_limit_arg,
                          int *new_key, int *new_key_direction,
                          ha_rows *new_select_limit, uint *new_used_key_parts,
                          uint *saved_best_key_parts)
@@ -19635,7 +21792,7 @@ test_if_cheaper_ordering(const JOIN_TAB *tab, ORDER *order, TABLE *table,
     resolved with a key;  This is because filesort() is usually faster than
     retrieving all rows through an index.
   */
-  if (select_limit >= table_records)
+  if (select_limit_arg >= table_records)
   {
     keys= *table->file->keys_to_use_for_scanning();
     keys.merge(table->covering_keys);
@@ -19660,7 +21817,7 @@ test_if_cheaper_ordering(const JOIN_TAB *tab, ORDER *order, TABLE *table,
   {
     uint tablenr= tab - join->join_tab;
     read_time= join->best_positions[tablenr].read_time;
-    for (uint i= tablenr+1; i < join->tables; i++)
+    for (uint i= tablenr+1; i < join->table_count; i++)
       fanout*= join->best_positions[i].records_read; // fanout is always >= 1
   }
   else
@@ -19669,6 +21826,7 @@ test_if_cheaper_ordering(const JOIN_TAB *tab, ORDER *order, TABLE *table,
   for (nr=0; nr < table->s->keys ; nr++)
   {
     int direction;
+    ha_rows select_limit= select_limit_arg;
     uint used_key_parts;
 
     if (keys.is_set(nr) &&
@@ -19681,10 +21839,9 @@ test_if_cheaper_ordering(const JOIN_TAB *tab, ORDER *order, TABLE *table,
       */
       DBUG_ASSERT (ref_key != (int) nr);
 
-      bool is_covering= table->covering_keys.is_set(nr) ||
-                        (nr == table->s->primary_key &&
-                        table->file->primary_key_is_clustered());
-      
+      bool is_covering= (table->covering_keys.is_set(nr) ||
+                         (table->file->index_flags(nr, 0, 1) &
+                          HA_CLUSTERED_INDEX));
       /* 
         Don't use an index scan with ORDER BY without limit.
         For GROUP BY without limit always use index scan
@@ -19762,7 +21919,8 @@ test_if_cheaper_ordering(const JOIN_TAB *tab, ORDER *order, TABLE *table,
               select_limit= table_records;
           else
             select_limit= (ha_rows) (select_limit*rec_per_key);
-        }
+        } /* group */
+
         /* 
           If tab=tk is not the last joined table tn then to get first
           L records from the result set we can expect to retrieve
@@ -19806,8 +21964,7 @@ test_if_cheaper_ordering(const JOIN_TAB *tab, ORDER *order, TABLE *table,
         */
         index_scan_time= select_limit/rec_per_key *
                          min(rec_per_key, table->file->scan_time());
-        if ((ref_key < 0 && is_covering) || 
-            (ref_key < 0 && (group || table->force_index)) ||
+        if ((ref_key < 0 && (group || table->force_index || is_covering)) ||
             index_scan_time < read_time)
         {
           ha_rows quick_records= table_records;
@@ -19819,7 +21976,8 @@ test_if_cheaper_ordering(const JOIN_TAB *tab, ORDER *order, TABLE *table,
           if (best_key < 0 ||
               (select_limit <= min(quick_records,best_records) ?
                keyinfo->key_parts < best_key_parts :
-               quick_records < best_records))
+               quick_records < best_records) ||
+              (!is_best_covering && is_covering))
           {
             best_key= nr;
             best_key_parts= keyinfo->key_parts;
diff --git a/sql/sql_select.h b/sql/sql_select.h
index 696cec99192..d456eab5ac5 100644
--- a/sql/sql_select.h
+++ b/sql/sql_select.h
@@ -1,7 +1,8 @@
 #ifndef SQL_SELECT_INCLUDED
 #define SQL_SELECT_INCLUDED
 
-/* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2010, Oracle and/or its affiliates.
+   Copyright (c) 2010, 2011, Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -45,6 +46,11 @@
 /* Values in optimize */
 #define KEY_OPTIMIZE_EXISTS		1
 #define KEY_OPTIMIZE_REF_OR_NULL	2
+#define KEY_OPTIMIZE_EQ	                4
+
+inline uint get_hash_join_key_no() { return MAX_KEY; }
+
+inline bool is_hash_join_key_no(uint key) { return key == MAX_KEY; }
 
 typedef struct keyuse_t {
   TABLE *table;
@@ -74,10 +80,14 @@ typedef struct keyuse_t {
      MAX_UINT  Otherwise
   */
   uint         sj_pred_no;
+
+  bool is_for_hash_join() { return is_hash_join_key_no(key); }
 } KEYUSE;
 
 class store_key;
 
+const int NO_REF_PART= uint(-1);
+
 typedef struct st_table_ref
 {
   bool		key_err;
@@ -108,8 +118,16 @@ typedef struct st_table_ref
   */
   key_part_map  null_rejecting;
   table_map	depend_map;		  ///< Table depends on these tables.
+
   /* null byte position in the key_buf. Used for REF_OR_NULL optimization */
   uchar          *null_ref_key;
+  /* 
+    ref_or_null optimization: number of the key part that alternates between
+    the lookup value and NULL (there's only one such part).
+    If we're not using ref_or_null, the value is NO_REF_PART
+  */
+  uint           null_ref_part;
+
   /*
     The number of times the record associated with this key was used
     in the join.
@@ -124,7 +142,7 @@ typedef struct st_table_ref
   bool          disable_cache;
 
   bool tmp_table_index_lookup_init(THD *thd, KEY *tmp_key, Item_iterator &it,
-                                   bool value);
+                                   bool value, uint skip= 0);
 } TABLE_REF;
 
 
@@ -133,7 +151,8 @@ typedef struct st_table_ref
 */
 enum join_type { JT_UNKNOWN,JT_SYSTEM,JT_CONST,JT_EQ_REF,JT_REF,JT_MAYBE_REF,
 		 JT_ALL, JT_RANGE, JT_NEXT, JT_FT, JT_REF_OR_NULL,
-		 JT_UNIQUE_SUBQUERY, JT_INDEX_SUBQUERY, JT_INDEX_MERGE};
+		 JT_UNIQUE_SUBQUERY, JT_INDEX_SUBQUERY, JT_INDEX_MERGE,
+                 JT_HASH, JT_HASH_RANGE, JT_HASH_NEXT, JT_HASH_INDEX_MERGE};
 
 class JOIN;
 
@@ -155,17 +174,23 @@ typedef enum_nested_loop_state
 (*Next_select_func)(JOIN *, struct st_join_table *, bool);
 Next_select_func setup_end_select_func(JOIN *join);
 int rr_sequential(READ_RECORD *info);
+int rr_sequential_and_unpack(READ_RECORD *info);
 
 
 class JOIN_CACHE;
 class SJ_TMP_TABLE;
+class JOIN_TAB_RANGE;
 
 typedef struct st_join_table {
   st_join_table() {}                          /* Remove gcc warning */
   TABLE		*table;
   KEYUSE	*keyuse;			/**< pointer to first used key */
+  KEY           *hj_key;       /**< descriptor of the used best hash join key
+				    not supported by any index                 */
   SQL_SELECT	*select;
-  COND          *select_cond;
+  COND		*select_cond;
+  COND          *on_precond;    /**< part of on condition to check before
+				     accessing the first inner table           */  
   QUICK_SELECT_I *quick;
   /* 
     The value of select_cond before we've attempted to do Index Condition
@@ -175,7 +200,13 @@ typedef struct st_join_table {
     NULL means no index condition pushdown was performed.
   */
   Item          *pre_idx_push_select_cond;
-  Item	       **on_expr_ref;   /**< pointer to the associated on expression   */
+  /*
+    Pointer to the associated ON expression. on_expr_ref!=NULL except for
+    degenerate joins. 
+    *on_expr_ref!=NULL for tables that are first inner tables within an outer
+    join.
+  */
+  Item	       **on_expr_ref;
   COND_EQUAL    *cond_equal;    /**< multiple equalities for the on expression */
   st_join_table *first_inner;   /**< first inner table for including outerjoin */
   bool           found;         /**< true after all matches or null complement */
@@ -183,6 +214,21 @@ typedef struct st_join_table {
   st_join_table *last_inner;    /**< last table table for embedding outer join */
   st_join_table *first_upper;  /**< first inner table for embedding outer join */
   st_join_table *first_unmatched; /**< used for optimization purposes only     */
+
+  /*
+    For join tabs that are inside an SJM bush: root of the bush
+  */
+  st_join_table *bush_root_tab;
+
+  /* TRUE <=> This join_tab is inside an SJM bush and is the last leaf tab here */
+  bool          last_leaf_in_bush;
+  
+  /*
+    ptr  - this is a bush, and ptr points to description of child join_tab
+           range
+    NULL - this join tab has no bush children
+  */
+  JOIN_TAB_RANGE *bush_children;
   
   /* Special content for EXPLAIN 'Extra' column or NULL if none */
   const char	*info;
@@ -220,12 +266,23 @@ typedef struct st_join_table {
     method (but not 'index' for some reason), i.e. this matches method which
     E(#records) is in found_records.
   */
-  ha_rows       read_time;
+  double        read_time;
+  
+  /* psergey-todo: make the below have type double, like POSITION::records_read? */
+  ha_rows       records_read;
   
+  /* Startup cost for execution */
+  double        startup_cost;
+    
+  double        partial_join_cardinality;
+
   table_map	dependent,key_dependent;
   uint		use_quick,index;
   uint		status;				///< Save status for cache
-  uint		used_fields,used_fieldlength,used_blobs;
+  uint		used_fields;
+  ulong         used_fieldlength;
+  ulong         max_used_fieldlength;
+  uint          used_blobs;
   uint          used_null_fields;
   uint          used_rowid_fields;
   uint          used_uneven_bit_fields;
@@ -239,7 +296,16 @@ typedef struct st_join_table {
   */ 
   ha_rows       limit; 
   TABLE_REF	ref;
+  /* TRUE <=> condition pushdown supports other tables presence */
+  bool          icp_other_tables_ok;
+  /* 
+    TRUE <=> condition pushed to the index has to be factored out of
+    the condition pushed to the table
+  */
+  bool          idx_cond_fact_out;
   bool          use_join_cache;
+  uint          used_join_cache_level;
+  ulong         join_buffer_size_limit;
   JOIN_CACHE	*cache;
   /*
     Index condition for BKA access join
@@ -299,10 +365,14 @@ typedef struct st_join_table {
   /*
     Semi-join strategy to be used for this join table. This is a copy of
     POSITION::sj_strategy field. This field is set up by the
-    fix_semijion_strategies_for_picked_join_order.
+    fix_semijoin_strategies_for_picked_join_order.
   */
   uint sj_strategy;
 
+  uint n_sj_tables;
+
+  bool preread_init_done;
+
   void cleanup();
   inline bool is_using_loose_index_scan()
   {
@@ -359,6 +429,19 @@ typedef struct st_join_table {
     return (first_inner && first_inner->last_inner == this) ||
            last_sj_inner_tab == this;
   }
+  /*
+    Check whether the table belongs to a nest of inner tables of an
+    outer join or to a nest of inner tables of a semi-join
+  */
+  bool is_nested_inner()
+  {
+    if (first_inner && 
+        (first_inner != first_inner->last_inner || first_inner->first_upper))
+      return TRUE;
+    if (first_sj_inner_tab && first_sj_inner_tab != last_sj_inner_tab)
+      return TRUE;
+    return FALSE;
+  }
   struct st_join_table *get_first_inner_table()
   {
     if (first_inner)
@@ -379,858 +462,40 @@ typedef struct st_join_table {
       select->cond= new_cond;
     return tmp_select_cond;
   }
-} JOIN_TAB;
-
-
-/* 
-  Categories of data fields of variable length written into join cache buffers.
-  The value of any of these fields is written into cache together with the
-  prepended length of the value.     
-*/
-#define CACHE_BLOB      1        /* blob field  */
-#define CACHE_STRIPPED  2        /* field stripped of trailing spaces */
-#define CACHE_VARSTR1   3        /* short string value (length takes 1 byte) */ 
-#define CACHE_VARSTR2   4        /* long string value (length takes 2 bytes) */
-
-/*
-  The CACHE_FIELD structure used to describe fields of records that
-  are written into a join cache buffer from record buffers and backward.
-*/
-typedef struct st_cache_field {
-  uchar *str;   /**< buffer from/to where the field is to be copied */ 
-  uint length;  /**< maximal number of bytes to be copied from/to str */
-  /* 
-    Field object for the moved field
-    (0 - for a flag field, see JOIN_CACHE::create_flag_fields).
-  */
-  Field *field;
-  uint type;    /**< category of the of the copied field (CACHE_BLOB et al.) */
-  /* 
-    The number of the record offset value for the field in the sequence
-    of offsets placed after the last field of the record. These
-    offset values are used to access fields referred to from other caches.
-    If the value is 0 then no offset for the field is saved in the
-    trailing sequence of offsets.
-  */ 
-  uint referenced_field_no; 
-  /* The remaining structure fields are used as containers for temp values */
-  uint blob_length; /**< length of the blob to be copied */
-  uint offset;      /**< field offset to be saved in cache buffer */
-} CACHE_FIELD;
-
-
-/*
-  JOIN_CACHE is the base class to support the implementations of both
-  Blocked-Based Nested Loops (BNL) Join Algorithm and Batched Key Access (BKA)
-  Join Algorithm. The first algorithm is supported by the derived class
-  JOIN_CACHE_BNL, while the second algorithm is supported by the derived
-  class JOIN_CACHE_BKA.
-  These two algorithms have a lot in common. Both algorithms first
-  accumulate the records of the left join operand in a join buffer and
-  then search for matching rows of the second operand for all accumulated
-  records.
-  For the first algorithm this strategy saves on logical I/O operations:
-  the entire set of records from the join buffer requires only one look-through
-  the records provided by the second operand. 
-  For the second algorithm the accumulation of records allows to optimize
-  fetching rows of the second operand from disk for some engines (MyISAM, 
-  InnoDB), or to minimize the number of round-trips between the Server and
-  the engine nodes (NDB Cluster).        
-*/ 
-
-class JOIN_CACHE :public Sql_alloc
-{
-
-private:
-
-  /* Size of the offset of a record from the cache */   
-  uint size_of_rec_ofs;    
-  /* Size of the length of a record in the cache */
-  uint size_of_rec_len;
-  /* Size of the offset of a field within a record in the cache */   
-  uint size_of_fld_ofs;
-
-protected:
-       
-  /* 3 functions below actually do not use the hidden parameter 'this' */ 
-
-  /* Calculate the number of bytes used to store an offset value */
-  uint offset_size(uint len)
-  { return (len < 256 ? 1 : len < 256*256 ? 2 : 4); }
-
-  /* Get the offset value that takes ofs_sz bytes at the position ptr */
-  ulong get_offset(uint ofs_sz, uchar *ptr)
-  {
-    switch (ofs_sz) {
-    case 1: return uint(*ptr);
-    case 2: return uint2korr(ptr);
-    case 4: return uint4korr(ptr);
-    }
-    return 0;
-  }
-
-  /* Set the offset value ofs that takes ofs_sz bytes at the position ptr */ 
-  void store_offset(uint ofs_sz, uchar *ptr, ulong ofs)
-  {
-    switch (ofs_sz) {
-    case 1: *ptr= (uchar) ofs; return;
-    case 2: int2store(ptr, (uint16) ofs); return;
-    case 4: int4store(ptr, (uint32) ofs); return;
-    }
-  }
-  
-  /* 
-    The total maximal length of the fields stored for a record in the cache.
-    For blob fields only the sizes of the blob lengths are taken into account. 
-  */
-  uint length;
-
-  /* 
-    Representation of the executed multi-way join through which all needed
-    context can be accessed.  
-  */   
-  JOIN *join;  
-
-  /* 
-    Cardinality of the range of join tables whose fields can be put into the
-    cache. (A table from the range not necessarily contributes to the cache.)
-  */
-  uint tables;
-
-  /* 
-    The total number of flag and data fields that can appear in a record
-    written into the cache. Fields with null values are always skipped 
-    to save space. 
-  */
-  uint fields;
-
-  /* 
-    The total number of flag fields in a record put into the cache. They are
-    used for table null bitmaps, table null row flags, and an optional match
-    flag. Flag fields go before other fields in a cache record with the match
-    flag field placed always at the very beginning of the record.
-  */
-  uint flag_fields;
-
-  /* The total number of blob fields that are written into the cache */ 
-  uint blobs;
-
-  /* 
-    The total number of fields referenced from field descriptors for other join
-    caches. These fields are used to construct key values to access matching
-    rows with index lookups. Currently the fields can be referenced only from
-    descriptors for bka caches. However they may belong to a cache of any type.
-  */   
-  uint referenced_fields;
-   
-  /* 
-    The current number of already created data field descriptors.
-    This number can be useful for implementations of the init methods.  
-  */
-  uint data_field_count; 
-
-  /* 
-    The current number of already created pointers to the data field
-    descriptors. This number can be useful for implementations of
-    the init methods.  
-  */
-  uint data_field_ptr_count; 
-  /* 
-    Array of the descriptors of fields containing 'fields' elements.
-    These are all fields that are stored for a record in the cache. 
-  */
-  CACHE_FIELD *field_descr;
-
-  /* 
-    Array of pointers to the blob descriptors that contains 'blobs' elements.
-  */
-  CACHE_FIELD **blob_ptr;
-
-  /* 
-    This flag indicates that records written into the join buffer contain
-    a match flag field. The flag must be set by the init method. 
-  */
-  bool with_match_flag; 
-  /*
-    This flag indicates that any record is prepended with the length of the
-    record which allows us to skip the record or part of it without reading.
-  */
-  bool with_length;
-
-  /* 
-    The maximal number of bytes used for a record representation in
-    the cache excluding the space for blob data. 
-    For future derived classes this representation may contains some
-    redundant info such as a key value associated with the record.     
-  */
-  uint pack_length;
-  /* 
-    The value of pack_length incremented by the total size of all 
-    pointers of a record in the cache to the blob data. 
-  */
-  uint pack_length_with_blob_ptrs;
-
-  /* Pointer to the beginning of the join buffer */
-  uchar *buff;         
-  /* 
-    Size of the entire memory allocated for the join buffer.
-    Part of this memory may be reserved for the auxiliary buffer.
-  */ 
-  ulong buff_size;
-  /* Size of the auxiliary buffer. */ 
-  ulong aux_buff_size;
-
-  /* The number of records put into the join buffer */ 
-  uint records;
-
-  /* 
-    Pointer to the current position in the join buffer.
-    This member is used both when writing to buffer and
-    when reading from it.
-  */
-  uchar *pos;
-  /* 
-    Pointer to the first free position in the join buffer,
-    right after the last record into it.
-  */
-  uchar *end_pos; 
-
-  /* 
-    Pointer to the beginning of first field of the current read/write record
-    from the join buffer. The value is adjusted by the get_record/put_record
-    functions.
-  */
-  uchar *curr_rec_pos;
-  /* 
-    Pointer to the beginning of first field of the last record
-    from the join buffer.
-  */
-  uchar *last_rec_pos;
-
-  /* 
-    Flag is set if the blob data for the last record in the join buffer
-    is in record buffers rather than in the join cache.
-  */
-  bool last_rec_blob_data_is_in_rec_buff;
-
-  /* 
-    Pointer to the position to the current record link. 
-    Record links are used only with linked caches. Record links allow to set
-    connections between parts of one join record that are stored in different
-    join buffers.
-    In the simplest case a record link is just a pointer to the beginning of
-    the record stored in the buffer.
-    In a more general case a link could be a reference to an array of pointers
-    to records in the buffer.   */
-  uchar *curr_rec_link;
-
-  void calc_record_fields();     
-  int alloc_fields(uint external_fields);
-  void create_flag_fields();
-  void create_remaining_fields(bool all_read_fields);
-  void set_constants();
-  int alloc_buffer();
-
-  uint get_size_of_rec_offset() { return size_of_rec_ofs; }
-  uint get_size_of_rec_length() { return size_of_rec_len; }
-  uint get_size_of_fld_offset() { return size_of_fld_ofs; }
-
-  uchar *get_rec_ref(uchar *ptr)
-  {
-    return buff+get_offset(size_of_rec_ofs, ptr-size_of_rec_ofs);
-  }
-  ulong get_rec_length(uchar *ptr)
-  { 
-    return (ulong) get_offset(size_of_rec_len, ptr);
-  }
-  ulong get_fld_offset(uchar *ptr)
-  { 
-    return (ulong) get_offset(size_of_fld_ofs, ptr);
-  }
-
-  void store_rec_ref(uchar *ptr, uchar* ref)
-  {
-    store_offset(size_of_rec_ofs, ptr-size_of_rec_ofs, (ulong) (ref-buff));
-  }
-
-  void store_rec_length(uchar *ptr, ulong len)
-  {
-    store_offset(size_of_rec_len, ptr, len);
-  }
-  void store_fld_offset(uchar *ptr, ulong ofs)
-  {
-    store_offset(size_of_fld_ofs, ptr, ofs);
-  }
-
-  /* Write record fields and their required offsets into the join buffer */ 
-  uint write_record_data(uchar *link, bool *is_full);
-
-  /* 
-    This method must determine for how much the auxiliary buffer should be
-    incremented when a new record is added to the join buffer.
-    If no auxiliary buffer is needed the function should return 0.
-  */
-  virtual uint aux_buffer_incr() { return 0; }
-
-  /* Shall calculate how much space is remaining in the join buffer */ 
-  virtual ulong rem_space() 
-  { 
-    return max(buff_size-(end_pos-buff)-aux_buff_size,0);
-  }
-
-  /* Shall skip record from the join buffer if its match flag is on */
-  virtual bool skip_record_if_match();
-
-  /*  Read all flag and data fields of a record from the join buffer */
-  uint read_all_record_fields();
-  
-  /* Read all flag fields of a record from the join buffer */
-  uint read_flag_fields();
-
-  /* Read a data record field from the join buffer */
-  uint read_record_field(CACHE_FIELD *copy, bool last_record);
-
-  /* Read a referenced field from the join buffer */
-  bool read_referenced_field(CACHE_FIELD *copy, uchar *rec_ptr, uint *len);
-
-  /* 
-    True if rec_ptr points to the record whose blob data stay in
-    record buffers
-  */
-  bool blob_data_is_in_rec_buff(uchar *rec_ptr)
-  {
-    return rec_ptr == last_rec_pos && last_rec_blob_data_is_in_rec_buff;
-  }
-
-  /* Find matches from the next table for records from the join buffer */   
-  virtual enum_nested_loop_state join_matching_records(bool skip_last)=0;
-
-  /* Add null complements for unmatched outer records from buffer */
-  virtual enum_nested_loop_state join_null_complements(bool skip_last);
-
-  /* Restore the fields of the last record from the join buffer */
-  virtual void restore_last_record();
-
-  /*Set match flag for a record in join buffer if it has not been set yet */
-  bool set_match_flag_if_none(JOIN_TAB *first_inner, uchar *rec_ptr);
-
-  enum_nested_loop_state generate_full_extensions(uchar *rec_ptr);
-
-  /* Check matching to a partial join record from the join buffer */
-  bool check_match(uchar *rec_ptr);
-
-public:
-
-  /* Table to be joined with the partial join records from the cache */ 
-  JOIN_TAB *join_tab;
-
-  /* Pointer to the previous join cache if there is any */
-  JOIN_CACHE *prev_cache;
-  /* Pointer to the next join cache if there is any */
-  JOIN_CACHE *next_cache;
-
-  /* Shall initialize the join cache structure */ 
-  virtual int init()=0;  
-
-  /* The function shall return TRUE only for BKA caches */
-  virtual bool is_key_access() { return FALSE; }
-
-  /* Shall reset the join buffer for reading/writing */
-  virtual void reset(bool for_writing);
-
-  /* 
-    This function shall add a record into the join buffer and return TRUE
-    if it has been decided that it should be the last record in the buffer.
-  */ 
-  virtual bool put_record();
-
-  /* 
-    This function shall read the next record into the join buffer and return
-    TRUE if there is no more next records.
-  */ 
-  virtual bool get_record();
-
-  /* 
-    This function shall read the record at the position rec_ptr
-    in the join buffer
-  */ 
-  virtual void get_record_by_pos(uchar *rec_ptr);
-
-  /* Shall return the value of the match flag for the positioned record */
-  virtual bool get_match_flag_by_pos(uchar *rec_ptr);
-
-  /* Shall return the position of the current record */
-  virtual uchar *get_curr_rec() { return curr_rec_pos; }
-
-  /* Shall set the current record link */
-  virtual void set_curr_rec_link(uchar *link) { curr_rec_link= link; }
-
-  /* Shall return the current record link */
-  virtual uchar *get_curr_rec_link()
-  { 
-    return (curr_rec_link ? curr_rec_link : get_curr_rec());
-  }
-     
-  /* Join records from the join buffer with records from the next join table */    
-  enum_nested_loop_state join_records(bool skip_last);
-
-  virtual ~JOIN_CACHE() {}
-  void reset_join(JOIN *j) { join= j; }
-  void free()
-  { 
-    my_free(buff);
-    buff= 0;
-  }   
-  
-  friend class JOIN_CACHE_BNL;
-  friend class JOIN_CACHE_BKA;
-  friend class JOIN_CACHE_BKA_UNIQUE;
-};
-
-
-class JOIN_CACHE_BNL :public JOIN_CACHE
-{
-
-protected:
-
-  /* Using BNL find matches from the next table for records from join buffer */
-  enum_nested_loop_state join_matching_records(bool skip_last);
-
-public:
-
-  /* 
-    This constructor creates an unlinked BNL join cache. The cache is to be
-    used to join table 'tab' to the result of joining the previous tables 
-    specified by the 'j' parameter.
-  */   
-  JOIN_CACHE_BNL(JOIN *j, JOIN_TAB *tab)
-  { 
-    join= j;
-    join_tab= tab;
-    prev_cache= next_cache= 0;
-  }
-
-  /* 
-    This constructor creates a linked BNL join cache. The cache is to be 
-    used to join table 'tab' to the result of joining the previous tables 
-    specified by the 'j' parameter. The parameter 'prev' specifies the previous
-    cache object to which this cache is linked.
-  */   
-  JOIN_CACHE_BNL(JOIN *j, JOIN_TAB *tab, JOIN_CACHE *prev)
-  { 
-    join= j;
-    join_tab= tab;
-    prev_cache= prev;
-    next_cache= 0;
-    if (prev)
-      prev->next_cache= this;
-  }
-
-  /* Initialize the BNL cache */       
-  int init();
-
-};
-
-class JOIN_CACHE_BKA :public JOIN_CACHE
-{
-protected:
-
-  /* Flag to to be passed to the MRR interface */ 
-  uint mrr_mode;
-
-  /* MRR buffer assotiated with this join cache */
-  HANDLER_BUFFER mrr_buff;
-
-  /* Shall initialize the MRR buffer */
-  virtual void init_mrr_buff()
+  void calc_used_field_length(bool max_fl);
+  ulong get_used_fieldlength()
   {
-    mrr_buff.buffer= end_pos;
-    mrr_buff.buffer_end= buff+buff_size;
-  }
-
-  /*
-    The number of the cache fields that are used in building keys to access
-    the table join_tab
-  */
-  uint local_key_arg_fields;
-  /* 
-    The total number of the fields in the previous caches that are used
-    in building keys t access the table join_tab
-  */
-  uint external_key_arg_fields;
-
-  /* 
-    This flag indicates that the key values will be read directly from the join
-    buffer. It will save us building key values in the key buffer.
-  */
-  bool use_emb_key;
-  /* The length of an embedded key value */ 
-  uint emb_key_length;
-
-  /* Check the possibility to read the access keys directly from join buffer */  
-  bool check_emb_key_usage();
-
-  /* Calculate the increment of the MM buffer for a record write */
-  uint aux_buffer_incr();
-
-  /* Using BKA find matches from the next table for records from join buffer */
-  enum_nested_loop_state join_matching_records(bool skip_last);
-
-  /* Prepare to search for records that match records from the join buffer */
-  enum_nested_loop_state init_join_matching_records(RANGE_SEQ_IF *seq_funcs,
-                                                    uint ranges);
-
-  /* Finish searching for records that match records from the join buffer */
-  enum_nested_loop_state end_join_matching_records(enum_nested_loop_state rc);
-
-public:
-  
-  /* 
-    This constructor creates an unlinked BKA join cache. The cache is to be
-    used to join table 'tab' to the result of joining the previous tables 
-    specified by the 'j' parameter.
-    The MRR mode initially is set to 'flags'.
-  */   
-  JOIN_CACHE_BKA(JOIN *j, JOIN_TAB *tab, uint flags)
-  { 
-    join= j;
-    join_tab= tab;
-    prev_cache= next_cache= 0;
-    mrr_mode= flags;
-  }
-
-  /* 
-    This constructor creates a linked BKA join cache. The cache is to be 
-    used to join table 'tab' to the result of joining the previous tables 
-    specified by the 'j' parameter. The parameter 'prev' specifies the cache
-    object to which this cache is linked.
-    The MRR mode initially is set to 'flags'.
-  */   
-  JOIN_CACHE_BKA(JOIN *j, JOIN_TAB *tab, uint flags,  JOIN_CACHE* prev)
-  { 
-    join= j;
-    join_tab= tab;
-    prev_cache= prev;
-    next_cache= 0;
-    if (prev)
-      prev->next_cache= this;
-    mrr_mode= flags;
+    if (!used_fieldlength)
+      calc_used_field_length(FALSE);
+    return used_fieldlength;
   }
-
-  /* Initialize the BKA cache */       
-  int init();
-
-  bool is_key_access() { return TRUE; }
-
-  /* Shall get the key built over the next record from the join buffer */
-  virtual uint get_next_key(uchar **key);
-
-  /* Check if the record combination matches the index condition */
-  bool skip_index_tuple(range_seq_t rseq, char *range_info);
-};
-
-/*
-  The class JOIN_CACHE_BKA_UNIQUE supports the variant of the BKA join algorithm
-  that submits only distinct keys to the MRR interface. The records in the join
-  buffer of a cache of this class that have the same access key are linked into
-  a chain attached to a key entry structure that either itself contains the key
-  value, or, in the case when the keys are embedded, refers to its occurance in
-  one of the records from the chain.
-  To build the chains with the same keys a hash table is employed. It is placed
-  at the very end of the join buffer. The array of hash entries is allocated
-  first at the very bottom of the join buffer, then go key entries. A hash entry
-  contains a header of the list of the key entries with the same hash value. 
-  Each key entry is a structure of the following type:
-    struct st_join_cache_key_entry {
-      union { 
-        uchar[] value;
-        cache_ref *value_ref; // offset from the beginning of the buffer
-      } hash_table_key;
-      key_ref next_key; // offset backward from the beginning of hash table
-      cache_ref *last_rec // offset from the beginning of the buffer
-    }
-  The references linking the records in a chain are always placed at the very
-  beginning of the record info stored in the join buffer. The records are 
-  linked in a circular list. A new record is always added to the end of this 
-  list. When a key is passed to the MRR interface it can be passed either with
-  an association link containing a reference to the header of the record chain
-  attached to the corresponding key entry in the hash table, or without any
-  association link. When the next record is returned by a call to the MRR 
-  function multi_range_read_next without any association (because if was not
-  passed  together with the key) then the key value is extracted from the
-  returned record and searched for it in the hash table. If there is any records
-  with such key the chain of them will be yielded as the result of this search.
-
-  The following picture represents a typical layout for the info stored in the
-  join buffer of a join cache object of the JOIN_CACHE_BKA_UNIQUE class.
-    
-  buff
-  V
-  +----------------------------------------------------------------------------+
-  |     |[*]record_1_1|                                                        |
-  |     ^ |                                                                    |
-  |     | +--------------------------------------------------+                 |
-  |     |                           |[*]record_2_1|          |                 |
-  |     |                           ^ |                      V                 |
-  |     |                           | +------------------+   |[*]record_1_2|   |
-  |     |                           +--------------------+-+   |               |
-  |+--+ +---------------------+                          | |   +-------------+ |
-  ||  |                       |                          V |                 | |
-  |||[*]record_3_1|         |[*]record_1_3|              |[*]record_2_2|     | |
-  ||^                       ^                            ^                   | |
-  ||+----------+            |                            |                   | |
-  ||^          |            |<---------------------------+-------------------+ |
-  |++          | | ... mrr  |   buffer ...           ... |     |               |
-  |            |            |                            |                     |
-  |      +-----+--------+   |                      +-----|-------+             |
-  |      V     |        |   |                      V     |       |             |
-  ||key_3|[/]|[*]|      |   |                |key_2|[/]|[*]|     |             |
-  |                   +-+---|-----------------------+            |             |
-  |                   V |   |                       |            |             |
-  |             |key_1|[*]|[*]|         |   | ... |[*]|   ...  |[*]|  ...  |   |
-  +----------------------------------------------------------------------------+
-                                        ^           ^            ^
-                                        |           i-th entry   j-th entry
-                                        hash table
-
-  i-th hash entry:
-    circular record chain for key_1:
-      record_1_1
-      record_1_2
-      record_1_3 (points to record_1_1)
-    circular record chain for key_3:
-      record_3_1 (points to itself)
-
-  j-th hash entry:
-    circular record chain for key_2:
-      record_2_1
-      record_2_2 (points to record_2_1)
-
-*/
-
-class JOIN_CACHE_BKA_UNIQUE :public JOIN_CACHE_BKA
-{
-
-private:
-
-  /* Size of the offset of a key entry in the hash table */
-  uint size_of_key_ofs;
-
-  /* 
-    Length of a key value.
-    It is assumed that all key values have the same length.
-  */
-  uint key_length;
-  /* 
-    Length of the key entry in the hash table.
-    A key entry either contains the key value, or it contains a reference
-    to the key value if use_emb_key flag is set for the cache.
-  */ 
-  uint key_entry_length;
- 
-  /* The beginning of the hash table in the join buffer */
-  uchar *hash_table;
-  /* Number of hash entries in the hash table */
-  uint hash_entries;
-
-  /* Number of key entries in the hash table (number of distinct keys) */
-  uint key_entries;
-
-  /* The position of the last key entry in the hash table */
-  uchar *last_key_entry;
-
-  /* The position of the currently retrieved key entry in the hash table */
-  uchar *curr_key_entry;
-
-  /* 
-    The offset of the record fields from the beginning of the record
-    representation. The record representation starts with a reference to
-    the next record in the key record chain followed by the length of
-    the trailing record data followed by a reference to the record segment
-     in the previous cache, if any, followed by the record fields.
-  */ 
-  uint rec_fields_offset;
-  /* The offset of the data fields from the beginning of the record fields */
-  uint data_fields_offset;
-  
-  uint get_hash_idx(uchar* key, uint key_len);
-
-  void cleanup_hash_table();
-  
-protected:
-
-  uint get_size_of_key_offset() { return size_of_key_ofs; }
-
-  /* 
-    Get the position of the next_key_ptr field pointed to by 
-    a linking reference stored at the position key_ref_ptr. 
-    This reference is actually the offset backward from the
-    beginning of hash table.
-  */  
-  uchar *get_next_key_ref(uchar *key_ref_ptr)
+  ulong get_max_used_fieldlength()
   {
-    return hash_table-get_offset(size_of_key_ofs, key_ref_ptr);
+    if (!max_used_fieldlength)
+      calc_used_field_length(TRUE);
+    return max_used_fieldlength;
   }
-
-  /* 
-    Store the linking reference to the next_key_ptr field at 
-    the position key_ref_ptr. The position of the next_key_ptr
-    field is pointed to by ref. The stored reference is actually
-    the offset backward from the beginning of the hash table.
-  */  
-  void store_next_key_ref(uchar *key_ref_ptr, uchar *ref)
-  {
-    store_offset(size_of_key_ofs, key_ref_ptr, (ulong) (hash_table-ref));
-  }     
-  
-  /* 
-    Check whether the reference to the next_key_ptr field at the position
-    key_ref_ptr contains  a nil value.
-  */
-  bool is_null_key_ref(uchar *key_ref_ptr)
-  {
-    ulong nil= 0;
-    return memcmp(key_ref_ptr, &nil, size_of_key_ofs ) == 0;
-  } 
-
-  /* 
-    Set the reference to the next_key_ptr field at the position
-    key_ref_ptr equal to nil.
-  */
-  void store_null_key_ref(uchar *key_ref_ptr)
+  double get_partial_join_cardinality() { return partial_join_cardinality; }
+  bool hash_join_is_possible();
+  int make_scan_filter();
+  bool is_ref_for_hash_join() { return is_hash_join_key_no(ref.key); }
+  KEY *get_keyinfo_by_key_no(uint key) 
   {
-    ulong nil= 0;
-    store_offset(size_of_key_ofs, key_ref_ptr, nil);
-  } 
-
-  uchar *get_next_rec_ref(uchar *ref_ptr)
-  {
-    return buff+get_offset(get_size_of_rec_offset(), ref_ptr);
+    return (is_hash_join_key_no(key) ? hj_key : table->key_info+key);
   }
+  double scan_time();
+  bool preread_init();
 
-  void store_next_rec_ref(uchar *ref_ptr, uchar *ref)
-  {
-    store_offset(get_size_of_rec_offset(), ref_ptr, (ulong) (ref-buff));
-  }     
- 
-  /*
-    Get the position of the embedded key value for the current
-    record pointed to by get_curr_rec().
-  */ 
-  uchar *get_curr_emb_key()
-  {
-    return get_curr_rec()+data_fields_offset;
-  }
-
-  /*
-    Get the position of the embedded key value pointed to by a reference
-    stored at ref_ptr. The stored reference is actually the offset from
-    the beginning of the join buffer.
-  */  
-  uchar *get_emb_key(uchar *ref_ptr)
-  {
-    return buff+get_offset(get_size_of_rec_offset(), ref_ptr);
-  }
-
-  /* 
-    Store the reference to an embedded key at the position key_ref_ptr.
-    The position of the embedded key is pointed to by ref. The stored
-    reference is actually the offset from the beginning of the join buffer.
-  */  
-  void store_emb_key_ref(uchar *ref_ptr, uchar *ref)
-  {
-    store_offset(get_size_of_rec_offset(), ref_ptr, (ulong) (ref-buff));
-  }
-  
-  /* 
-    Calculate how much space in the buffer would not be occupied by
-    records, key entries and additional memory for the MMR buffer.
-  */ 
-  ulong rem_space() 
-  { 
-    return max(last_key_entry-end_pos-aux_buff_size,0);
-  }
-
-  /* 
-    Initialize the MRR buffer allocating some space within the join buffer.
-    The entire space between the last record put into the join buffer and the
-    last key entry added to the hash table is used for the MRR buffer.
-  */
-  void init_mrr_buff()
-  {
-    mrr_buff.buffer= end_pos;
-    mrr_buff.buffer_end= last_key_entry;
-  }
-
-  /* Skip record from JOIN_CACHE_BKA_UNIQUE buffer if its match flag is on */
-  bool skip_record_if_match();
-
-  /* Using BKA_UNIQUE find matches for records from join buffer */
-  enum_nested_loop_state join_matching_records(bool skip_last);
-
-  /* Search for a key in the hash table of the join buffer */
-  bool key_search(uchar *key, uint key_len, uchar **key_ref_ptr);
-
-public:
-
-  /* 
-    This constructor creates an unlinked BKA_UNIQUE join cache. The cache is
-    to be used to join table 'tab' to the result of joining the previous tables 
-    specified by the 'j' parameter.
-    The MRR mode initially is set to 'flags'.
-  */   
-  JOIN_CACHE_BKA_UNIQUE(JOIN *j, JOIN_TAB *tab, uint flags)
-    :JOIN_CACHE_BKA(j, tab, flags) {}
-
-  /* 
-    This constructor creates a linked BKA_UNIQUE join cache. The cache is
-    to be used to join table 'tab' to the result of joining the previous tables 
-    specified by the 'j' parameter. The parameter 'prev' specifies the cache
-    object to which this cache is linked.
-    The MRR mode initially is set to 'flags'.
-  */   
-  JOIN_CACHE_BKA_UNIQUE(JOIN *j, JOIN_TAB *tab, uint flags,  JOIN_CACHE* prev)
-    :JOIN_CACHE_BKA(j, tab, flags, prev) {}
-
-  /* Initialize the BKA_UNIQUE cache */       
-  int init();
-
-  /* Reset the JOIN_CACHE_BKA_UNIQUE  buffer for reading/writing */
-  void reset(bool for_writing);
-
-  /* Add a record into the JOIN_CACHE_BKA_UNIQUE buffer */
-  bool put_record();
-
-  /* Read the next record from the JOIN_CACHE_BKA_UNIQUE buffer */
-  bool get_record();
+  bool is_sjm_nest() { return test(bush_children); }
+} JOIN_TAB;
 
-  /*
-    Shall check whether all records in a key chain have 
-    their match flags set on
-  */   
-  virtual bool check_all_match_flags_for_key(uchar *key_chain_ptr);
-
-  uint get_next_key(uchar **key); 
-  
-  /* Get the head of the record chain attached to the current key entry */ 
-  uchar *get_curr_key_chain()
-  {
-    return get_next_rec_ref(curr_key_entry+key_entry_length-
-                            get_size_of_rec_offset());
-  }
-  
-  /* Check if the record combination matches the index condition */
-  bool skip_index_tuple(range_seq_t rseq, char *range_info);
-};
 
+#include "sql_join_cache.h"
 
 enum_nested_loop_state sub_select_cache(JOIN *join, JOIN_TAB *join_tab, bool
                                         end_of_records);
 enum_nested_loop_state sub_select(JOIN *join,JOIN_TAB *join_tab, bool
                                   end_of_records);
-enum_nested_loop_state sub_select_sjm(JOIN *join, JOIN_TAB *join_tab, 
-                                      bool end_of_records);
-
 enum_nested_loop_state
 end_send_group(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
 	       bool end_of_records);
@@ -1332,6 +597,8 @@ typedef struct st_position
    */
   table_map firstmatch_need_tables;
 
+  bool in_firstmatch_prefix() { return (first_firstmatch_table != MAX_TABLES); }
+  void invalidate_firstmatch_prefix() { first_firstmatch_table= MAX_TABLES; }
 
 /* Duplicate Weedout strategy */
   /* The first table that the strategy will need to handle */
@@ -1351,6 +618,8 @@ typedef struct st_position
     semi-join's ON expression so we can correctly account for fanout.
   */
   table_map sjm_scan_need_tables;
+
+  table_map prefix_dups_producing_tables;
 } POSITION;
 
 
@@ -1376,27 +645,103 @@ inline bool sj_is_materialize_strategy(uint strategy)
   return strategy >= SJ_OPT_MATERIALIZE;
 }
 
+class JOIN_TAB_RANGE: public Sql_alloc
+{
+public:
+  JOIN_TAB *start;
+  JOIN_TAB *end;
+};
+
 
 class JOIN :public Sql_alloc
 {
+private:
   JOIN(const JOIN &rhs);                        /**< not implemented */
   JOIN& operator=(const JOIN &rhs);             /**< not implemented */
+
+protected:
+
+  /**
+    The subset of the state of a JOIN that represents an optimized query
+    execution plan. Allows saving/restoring different JOIN plans for the same
+    query.
+  */
+  class Join_plan_state {
+  public:
+    DYNAMIC_ARRAY keyuse; /* Copy of the JOIN::keyuse array. */
+    POSITION best_positions[MAX_TABLES+1]; /* Copy of JOIN::best_positions */
+    /* Copies of the JOIN_TAB::keyuse pointers for each JOIN_TAB. */
+    KEYUSE *join_tab_keyuse[MAX_TABLES];
+    /* Copies of JOIN_TAB::checked_keys for each JOIN_TAB. */
+    key_map join_tab_checked_keys[MAX_TABLES];
+  public:
+    Join_plan_state()
+    {   
+      keyuse.elements= 0;
+      keyuse.buffer= NULL;
+    }
+    Join_plan_state(JOIN *join);
+    ~Join_plan_state()
+    {
+      delete_dynamic(&keyuse);
+    }
+  };
+
+  /* Results of reoptimizing a JOIN via JOIN::reoptimize(). */
+  enum enum_reopt_result {
+    REOPT_NEW_PLAN, /* there is a new reoptimized plan */
+    REOPT_OLD_PLAN, /* no new improved plan can be found, use the old one */
+    REOPT_ERROR,    /* an irrecoverable error occurred during reoptimization */
+    REOPT_NONE      /* not yet reoptimized */
+  };
+
+  /* Support for plan reoptimization with rewritten conditions. */
+  enum_reopt_result reoptimize(Item *added_where, table_map join_tables,
+                               Join_plan_state *save_to);
+  void save_query_plan(Join_plan_state *save_to);
+  void restore_query_plan(Join_plan_state *restore_from);
+  /* Choose a subquery plan for a table-less subquery. */
+  bool choose_tableless_subquery_plan();
+
 public:
-  JOIN_TAB *join_tab,**best_ref;
+  JOIN_TAB *join_tab, **best_ref;
   JOIN_TAB **map2table;    ///< mapping between table indexes and JOIN_TABs
   JOIN_TAB *join_tab_save; ///< saved join_tab for subquery reexecution
-  TABLE    **all_tables;
+
+  List<JOIN_TAB_RANGE> join_tab_ranges;
+  
+  /*
+    Base tables participating in the join. After join optimization is done, the
+    tables are stored in the join order (but the only really important part is 
+    that const tables are first).
+  */
+  TABLE    **table;
   /**
     The table which has an index that allows to produce the requried ordering.
     A special value of 0x1 means that the ordering will be produced by
     passing 1st non-const table to filesort(). NULL means no such table exists.
   */
   TABLE    *sort_by_table;
-  uint	   tables;        /**< Number of tables in the join */
+  /* 
+    Number of tables in the join. 
+    (In MySQL, it is named 'tables' and is also the number of elements in 
+     join->join_tab array. In MariaDB, the latter is not true, so we've renamed
+     the variable)
+  */
+  uint	   table_count;
   uint     outer_tables;  /**< Number of tables that are not inside semijoin */
   uint     const_tables;
+  /* 
+    Number of tables in the top join_tab array. Normally this matches
+    (join_tab_ranges.head()->end - join_tab_ranges.head()->start). 
+    
+    We keep it here so that it is saved/restored with JOIN::restore_tmp.
+  */
+  uint     top_join_tab_count;
   uint	   send_group_parts;
   bool	   group;          /**< If query contains GROUP BY clause */
+  bool     need_distinct;
+
   /**
     Indicates that grouping will be performed on the result set during
     query execution. This field belongs to query execution.
@@ -1416,9 +761,12 @@ public:
   /* Tables removed by table elimination. Set to 0 before the elimination. */
   table_map eliminated_tables;
   /*
-     Bitmap of all inner tables from outer joins
+     Bitmap of all inner tables from outer joins (set at start of
+     make_join_statistics)
   */
   table_map outer_join;
+  /* Bitmap of tables used in the select list items */
+  table_map select_list_used_tables;
   ha_rows  send_records,found_records,examined_rows,row_limit, select_limit;
   /**
     Used to fetch no more than given amount of rows per one
@@ -1465,13 +813,19 @@ public:
 
   /* We also maintain a stack of join optimization states in * join->positions[] */
 /******* Join optimization state members end *******/
-  Next_select_func first_select;
   /*
     The cost of best complete join plan found so far during optimization,
     after optimization phase - cost of picked join order (not taking into
     account the changes made by test_if_skip_sort_order()).
   */
   double   best_read;
+  /*
+    Estimated result rows (fanout) of the join operation. If this is a subquery
+    that is reexecuted multiple times, this value includes the estimated # of
+    reexecutions. This value is equal to the multiplication of all
+    join->positions[i].records_read of a JOIN.
+  */
+  double   record_count;
   List<Item> *fields;
   List<Cached_item> group_fields, group_fields_cache;
   TABLE    *tmp_table;
@@ -1486,6 +840,15 @@ public:
   Item      *tmp_having; ///< To store having when processed temporary table
   Item      *having_history; ///< Store having for explain
   ulonglong  select_options;
+  /* 
+    Bitmap of allowed types of the join caches that
+    can be used for join operations
+  */
+  uint allowed_join_cache_types;
+  bool allowed_semijoin_with_cache;
+  bool allowed_outer_join_with_cache;
+  /* Maximum level of the join caches that can be used for join operations */ 
+  uint max_allowed_join_cache_level;
   select_result *result;
   TMP_TABLE_PARAM tmp_table_param;
   MYSQL_LOCK *lock;
@@ -1571,10 +934,24 @@ public:
   ORDER *order, *group_list, *proc_param; //hold parameters of mysql_select
   COND *conds;                            // ---"---
   Item *conds_history;                    // store WHERE for explain
+  COND *outer_ref_cond;       ///<part of conds containing only outer references
   TABLE_LIST *tables_list;           ///<hold 'tables' parameter of mysql_select
   List<TABLE_LIST> *join_list;       ///< list of joined tables in reverse order
   COND_EQUAL *cond_equal;
   COND_EQUAL *having_equal;
+  /*
+    Constant condition computed during optimization, but evaluated during
+    join execution. Typically expensive conditions that should not be
+    evaluated at optimization time.
+  */
+  Item *exec_const_cond;
+  /*
+    Constant ORDER and/or GROUP expressions that contain subqueries. Such
+    expressions need to be evaluated to verify that the subquery indeed
+    returns a single row. The evaluation of such expressions is delayed
+    until query execution.
+  */
+  List<Item> exec_const_order_group_cond;
   SQL_SELECT *select;                ///<created in optimisation phase
   JOIN_TAB *return_tab;              ///<used only for outer joins
   Item **ref_pointer_array; ///<used pointer reference for this select
@@ -1585,9 +962,15 @@ public:
   
   bool union_part; ///< this subselect is part of union 
   bool optimized; ///< flag to avoid double optimization in EXPLAIN
+  bool initialized; ///< flag to avoid double init_execution calls
 
-  Array<Item_in_subselect> sj_subselects;
-
+  /*
+    Additional WHERE and HAVING predicates to be considered for IN=>EXISTS
+    subquery transformation of a JOIN object.
+  */
+  Item *in_to_exists_where;
+  Item *in_to_exists_having;
+  
   /* Temporary tables used to weed-out semi-join duplicates */
   List<TABLE> sj_tmp_tables;
   /* SJM nests that are executed with SJ-Materialization strategy */
@@ -1610,7 +993,7 @@ public:
 
   JOIN(THD *thd_arg, List<Item> &fields_arg, ulonglong select_options_arg,
        select_result *result_arg)
-    :fields_list(fields_arg), sj_subselects(thd_arg->mem_root, 4)
+    :fields_list(fields_arg)
   {
     init(thd_arg, fields_arg, select_options_arg, result_arg);
   }
@@ -1619,8 +1002,9 @@ public:
        select_result *result_arg)
   {
     join_tab= join_tab_save= 0;
-    all_tables= 0;
-    tables= 0;
+    table= 0;
+    table_count= 0;
+    top_join_tab_count= 0;
     const_tables= 0;
     eliminated_tables= 0;
     join_list= 0;
@@ -1650,6 +1034,7 @@ public:
     no_order= 0;
     simple_order= 0;
     simple_group= 0;
+    need_distinct= 0;
     skip_sort_order= 0;
     need_tmp= 0;
     hidden_group_fields= 0; /*safety*/
@@ -1660,8 +1045,10 @@ public:
     ref_pointer_array_size= 0;
     zero_result_cause= 0;
     optimized= 0;
+    initialized= 0;
     cond_equal= 0;
     having_equal= 0;
+    exec_const_cond= 0;
     group_optimized_away= 0;
     no_rows_in_result_called= 0;
 
@@ -1674,21 +1061,25 @@ public:
     rollup.state= ROLLUP::STATE_NONE;
 
     no_const_tables= FALSE;
-    first_select= sub_select;
+    outer_ref_cond= 0;
+    in_to_exists_where= NULL;
+    in_to_exists_having= NULL;
   }
 
   int prepare(Item ***rref_pointer_array, TABLE_LIST *tables, uint wind_num,
 	      COND *conds, uint og_num, ORDER *order, ORDER *group,
 	      Item *having, ORDER *proc_param, SELECT_LEX *select,
 	      SELECT_LEX_UNIT *unit);
+  bool prepare_stage2();
   int optimize();
   int reinit();
+  int init_execution();
   void exec();
   int destroy();
   void restore_tmp();
   bool alloc_func_list();
   bool flatten_subqueries();
-  bool setup_subquery_materialization();
+  bool optimize_unflattened_subqueries();
   bool make_sum_func_list(List<Item> &all_fields, List<Item> &send_fields,
 			  bool before_group_by, bool recompute= FALSE);
 
@@ -1724,8 +1115,8 @@ public:
   bool init_save_join_tab();
   bool send_row_on_empty_set()
   {
-    return (do_send_rows && tmp_table_param.sum_func_count != 0 &&
-	    !group_list && having_value != Item::COND_FALSE);
+    return (do_send_rows && implicit_grouping && !group_optimized_away &&
+            having_value != Item::COND_FALSE);
   }
   bool change_result(select_result *result);
   bool is_top_level_join() const
@@ -1736,8 +1127,10 @@ public:
   void cache_const_exprs();
   inline table_map all_tables_map()
   {
-    return (table_map(1) << tables) - 1;
+    return (table_map(1) << table_count) - 1;
   }
+  void drop_unused_derived_keys();
+  inline void eval_select_list_used_tables();
   /* 
     Return the table for which an index scan can be used to satisfy 
     the sort order needed by the ORDER BY/(implicit) GROUP BY clause 
@@ -1749,6 +1142,30 @@ public:
               NULL : join_tab+const_tables;
   }
   bool setup_subquery_caches();
+  bool shrink_join_buffers(JOIN_TAB *jt, 
+                           ulonglong curr_space,
+                           ulonglong needed_space);
+  void set_allowed_join_cache_types();
+  bool is_allowed_hash_join_access()
+  { 
+    return test(allowed_join_cache_types & JOIN_CACHE_HASHED_BIT) &&
+           max_allowed_join_cache_level > JOIN_CACHE_HASHED_BIT;
+  }
+  bool choose_subquery_plan(table_map join_tables);
+  void get_partial_cost_and_fanout(uint end_tab_idx,
+                                   table_map filter_map,
+                                   double *read_time_arg, 
+                                   double *record_count_arg);
+  void get_prefix_cost_and_fanout(uint n_tables, 
+                                  double *read_time_arg,
+                                  double *record_count_arg);
+  /* defined in opt_subselect.cc */
+  bool transform_max_min_subquery();
+  /* True if this JOIN is a subquery under an IN predicate. */
+  bool is_in_subquery()
+  {
+    return (unit->item && unit->item->is_in_predicate());
+  }
 private:
   /**
     TRUE if the query contains an aggregate function but has no GROUP
@@ -1759,6 +1176,15 @@ private:
   void cleanup_item_list(List<Item> &items) const;
 };
 
+enum enum_with_bush_roots { WITH_BUSH_ROOTS, WITHOUT_BUSH_ROOTS};
+enum enum_with_const_tables { WITH_CONST_TABLES, WITHOUT_CONST_TABLES};
+
+JOIN_TAB *first_linear_tab(JOIN *join, enum enum_with_const_tables const_tbls);
+JOIN_TAB *next_linear_tab(JOIN* join, JOIN_TAB* tab, 
+                          enum enum_with_bush_roots include_bush_roots);
+
+JOIN_TAB *first_top_level_tab(JOIN *join, enum enum_with_const_tables with_const);
+JOIN_TAB *next_top_level_tab(JOIN *join, JOIN_TAB *tab);
 
 typedef struct st_select_check {
   uint const_ref,reg_ref;
@@ -1786,7 +1212,7 @@ bool is_indexed_agg_distinct(JOIN *join, List<Item_field> *out_args);
 /* functions from opt_sum.cc */
 bool simple_pred(Item_func *func_item, Item **args, bool *inv_order);
 int opt_sum_query(THD* thd,
-                  TABLE_LIST *tables, List<Item> &all_fields, COND *conds);
+                  List<TABLE_LIST> &tables, List<Item> &all_fields, COND *conds);
 
 /* from sql_delete.cc, used by opt_range.cc */
 extern "C" int refpos_order_cmp(void* arg, const void *a,const void *b);
@@ -1798,6 +1224,7 @@ class store_key :public Sql_alloc
 public:
   bool null_key; /* TRUE <=> the value of the key has a null part */
   enum store_key_result { STORE_KEY_OK, STORE_KEY_FATAL, STORE_KEY_CONV };
+  enum Type { FIELD_STORE_KEY, ITEM_STORE_KEY, CONST_ITEM_STORE_KEY };
   store_key(THD *thd, Field *field_arg, uchar *ptr, uchar *null, uint length)
     :null_key(0), null_ptr(null), err(0)
   {
@@ -1818,6 +1245,7 @@ public:
                                         ptr, null, 1);
   }
   virtual ~store_key() {}			/** Not actually needed */
+  virtual enum Type type() const=0;
   virtual const char *name() const=0;
 
   /**
@@ -1869,15 +1297,32 @@ class store_key_field: public store_key
     {
       copy_field.set(to_field,from_field,0);
     }
-  }
+  }  
+
+  enum Type type() const { return FIELD_STORE_KEY; }
   const char *name() const { return field_name; }
 
+  void change_source_field(Item_field *fld_item)
+  {
+    copy_field.set(to_field, fld_item->field, 0);
+    field_name= fld_item->full_name();
+  }
+
  protected: 
   enum store_key_result copy_inner()
   {
     TABLE *table= copy_field.to_field->table;
     my_bitmap_map *old_map= dbug_tmp_use_all_columns(table,
                                                      table->write_set);
+
+    /* 
+      It looks like the next statement is needed only for the simplified
+      hash function over key values that is currently used in BNLH join.
+      When the implementation of that function is replaced by a proper
+      full version, this statement should probably be removed.
+    */  
+    bzero(copy_field.to_ptr,copy_field.to_length);
+
     copy_field.do_copy(&copy_field);
     dbug_tmp_restore_column_map(table->write_set, old_map);
     null_key= to_field->is_null();
@@ -1902,6 +1347,8 @@ public:
 	       null_ptr_arg ? null_ptr_arg : item_arg->maybe_null ?
 	       &err : (uchar*) 0, length), item(item_arg), use_value(val)
   {}
+
+  enum Type type() const { return ITEM_STORE_KEY; }
   const char *name() const { return "func"; }
 
  protected:  
@@ -1911,6 +1358,15 @@ public:
     my_bitmap_map *old_map= dbug_tmp_use_all_columns(table,
                                                      table->write_set);
     int res= FALSE;
+
+    /* 
+      It looks like the next statement is needed only for the simplified
+      hash function over key values that is currently used in BNLH join.
+      When the implementation of that function is replaced by a proper
+      full version, this statement should probably be removed.
+    */  
+    to_field->reset();
+
     if (use_value)
       item->save_val(to_field);
     else
@@ -1941,6 +1397,8 @@ public:
 		    &err : (uchar*) 0, length, item_arg, FALSE), inited(0)
   {
   }
+
+  enum Type type() const { return CONST_ITEM_STORE_KEY; }
   const char *name() const { return "const"; }
 
 protected:  
@@ -1950,6 +1408,9 @@ protected:
     if (!inited)
     {
       inited=1;
+      TABLE *table= to_field->table;
+      my_bitmap_map *old_map= dbug_tmp_use_all_columns(table,
+                                                       table->write_set);
       if ((res= item->save_in_field(to_field, 1)))
       {       
         if (!err)
@@ -1961,6 +1422,7 @@ protected:
         */
       if (!err && to_field->table->in_use->is_error())
         err= 1; /* STORE_KEY_FATAL */
+      dbug_tmp_restore_column_map(table->write_set, old_map);
     }
     null_key= to_field->is_null() || item->null_value;
     return (err > 2 ? STORE_KEY_FATAL : (store_key_result) err);
@@ -2003,6 +1465,10 @@ Field *create_tmp_field(THD *thd, TABLE *table,Item *item, Item::Type type,
 			bool table_cant_handle_bit_fields,
                         bool make_copy_field,
                         uint convert_blob_length);
+bool create_internal_tmp_table(TABLE *table, KEY *keyinfo, 
+                               ENGINE_COLUMNDEF *start_recinfo,
+                               ENGINE_COLUMNDEF **recinfo, 
+                               ulonglong options, my_bool big_tables);
 
 /*
   General routine to change field->ptr of a NULL-terminated array of Field
@@ -2015,21 +1481,18 @@ Field *create_tmp_field(THD *thd, TABLE *table,Item *item, Item::Type type,
 TABLE *create_virtual_tmp_table(THD *thd, List<Create_field> &field_list);
 
 int test_if_item_cache_changed(List<Cached_item> &list);
-void calc_used_field_length(THD *thd, JOIN_TAB *join_tab);
 int join_init_read_record(JOIN_TAB *tab);
+int join_read_record_no_init(JOIN_TAB *tab);
 void set_position(JOIN *join,uint idx,JOIN_TAB *table,KEYUSE *key);
 inline Item * and_items(Item* cond, Item *item)
 {
   return (cond? (new Item_cond_and(cond, item)) : item);
 }
-bool choose_plan(JOIN *join,table_map join_tables);
-void get_partial_join_cost(JOIN *join, uint n_tables, double *read_time_arg,
-                           double *record_count_arg);
+bool choose_plan(JOIN *join, table_map join_tables);
 void optimize_wo_join_buffering(JOIN *join, uint first_tab, uint last_tab, 
                                 table_map last_remaining_tables, 
                                 bool first_alt, uint no_jbuf_before,
-                                double *reopt_rec_count, double *reopt_cost,
-                                double *sj_inner_fanout);
+                                double *outer_rec_count, double *reopt_cost);
 Item_equal *find_item_equal(COND_EQUAL *cond_equal, Field *field,
                             bool *inherited_fl);
 bool test_if_ref(COND *root_cond, 
@@ -2051,7 +1514,7 @@ bool const_expression_in_where(COND *cond, Item *comp_item,
 void eliminate_tables(JOIN *join);
 
 /* Index Condition Pushdown entry point function */
-void push_index_cond(JOIN_TAB *tab, uint keyno, bool other_tbls_ok);
+void push_index_cond(JOIN_TAB *tab, uint keyno);
 
 /****************************************************************************
   Temporary table support for SQL Runtime
@@ -2065,7 +1528,7 @@ void push_index_cond(JOIN_TAB *tab, uint keyno, bool other_tbls_ok);
 TABLE *create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields,
 			ORDER *group, bool distinct, bool save_sum_fields,
 			ulonglong select_options, ha_rows rows_limit,
-			const char* alias);
+			const char* alias, bool do_not_open=FALSE);
 void free_tmp_table(THD *thd, TABLE *entry);
 bool create_internal_tmp_table_from_heap(THD *thd, TABLE *table,
                                          ENGINE_COLUMNDEF *start_recinfo,
@@ -2077,5 +1540,7 @@ bool create_internal_tmp_table(TABLE *table, KEY *keyinfo,
                                ulonglong options, my_bool big_tables);
 bool open_tmp_table(TABLE *table);
 void setup_tmp_table_column_bitmaps(TABLE *table, uchar *bitmaps);
+double prev_record_reads(POSITION *positions, uint idx, table_map found_ref);
+void fix_list_after_tbl_changes(SELECT_LEX *new_parent, List<TABLE_LIST> *tlist);
 
 #endif /* SQL_SELECT_INCLUDED */
diff --git a/sql/sql_show.cc b/sql/sql_show.cc
index 39e3652a1af..78503faa68c 100644
--- a/sql/sql_show.cc
+++ b/sql/sql_show.cc
@@ -1,4 +1,5 @@
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2011, Oracle and/or its affiliates.
+   2009-2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -43,6 +44,7 @@
 #include "sp_pcontext.h"
 #include "set_var.h"
 #include "sql_trigger.h"
+#include "sql_derived.h"
 #include "sql_connect.h"
 #include "authors.h"
 #include "contributors.h"
@@ -719,7 +721,7 @@ mysqld_show_create(THD *thd, TABLE_LIST *table_list)
     bool open_error=
       open_tables(thd, &table_list, &counter,
                   MYSQL_OPEN_FORCE_SHARED_HIGH_PRIO_MDL) ||
-                  mysql_handle_derived(thd->lex, &mysql_derived_prepare);
+                  mysql_handle_derived(thd->lex, DT_PREPARE);
     thd->pop_internal_handler();
     if (open_error && (thd->killed || thd->is_error()))
       goto exit;
@@ -775,7 +777,7 @@ mysqld_show_create(THD *thd, TABLE_LIST *table_list)
       protocol->store(table_list->schema_table->table_name,
                       system_charset_info);
     else
-      protocol->store(table_list->table->alias, system_charset_info);
+      protocol->store(table_list->table->alias.c_ptr(), system_charset_info);
   }
 
   if (table_list->view)
@@ -901,7 +903,8 @@ mysqld_list_fields(THD *thd, TABLE_LIST *table_list, const char *wild)
   DBUG_PRINT("enter",("table: %s",table_list->table_name));
 
   if (open_normal_and_derived_tables(thd, table_list,
-                                     MYSQL_OPEN_FORCE_SHARED_HIGH_PRIO_MDL))
+                                     MYSQL_OPEN_FORCE_SHARED_HIGH_PRIO_MDL,
+                                     DT_PREPARE | DT_CREATE))
     DBUG_VOID_RETURN;
   table= table_list->table;
 
@@ -1243,7 +1246,7 @@ int store_create_info(THD *thd, TABLE_LIST *table_list, String *packet,
   else
   {
     if (lower_case_table_names == 2)
-      alias= table->alias;
+      alias= table->alias.c_ptr();
     else
     {
       alias= share->table_name.str;
@@ -1684,7 +1687,7 @@ view_store_options(THD *thd, TABLE_LIST *table, String *buff)
 static void append_algorithm(TABLE_LIST *table, String *buff)
 {
   buff->append(STRING_WITH_LEN("ALGORITHM="));
-  switch ((int8)table->algorithm) {
+  switch ((int16)table->algorithm) {
   case VIEW_ALGORITHM_UNDEFINED:
     buff->append(STRING_WITH_LEN("UNDEFINED "));
     break;
@@ -1808,6 +1811,7 @@ public:
   uint   command;
   const char *user,*host,*db,*proc_info,*state_info;
   CSET_STRING query_string;
+  double progress;
 };
 
 #ifdef HAVE_EXPLICIT_TEMPLATE_INSTANTIATION
@@ -1860,6 +1864,11 @@ void mysqld_list_processes(THD *thd,const char *user, bool verbose)
   field->maybe_null=1;
   field_list.push_back(field=new Item_empty_string("Info",max_query_length));
   field->maybe_null=1;
+  if (!thd->variables.old_mode)
+  {
+    field_list.push_back(field= new Item_float("Progress", 0.0, 3, 7));
+    field->maybe_null= 0;
+  }
   if (protocol->send_result_set_metadata(&field_list,
                             Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
     DBUG_VOID_RETURN;
@@ -1913,8 +1922,24 @@ void mysqld_list_processes(THD *thd,const char *user, bool verbose)
           thd_info->query_string=
             CSET_STRING(q, q ? length : 0, tmp->query_charset());
         }
-        mysql_mutex_unlock(&tmp->LOCK_thd_data);
+
+        /*
+          Progress report. We need to do this under a lock to ensure that all
+          values are read from the same stage.
+        */
+        if (tmp->progress.max_counter)
+        {
+          uint max_stage= max(tmp->progress.max_stage, 1);
+          thd_info->progress= (((tmp->progress.stage / (double) max_stage) +
+                                ((tmp->progress.counter /
+                                  (double) tmp->progress.max_counter) /
+                                 (double) max_stage)) *
+                               100.0);
+        }
+        else
+          thd_info->progress= 0.0;
         thd_info->start_time= tmp->start_time;
+        mysql_mutex_unlock(&tmp->LOCK_thd_data);
         thread_infos.append(thd_info);
       }
     }
@@ -1923,6 +1948,9 @@ void mysqld_list_processes(THD *thd,const char *user, bool verbose)
 
   thread_info *thd_info;
   time_t now= my_time(0);
+  char buff[20];                                // For progress
+  String store_buffer(buff, sizeof(buff), system_charset_info);
+
   while ((thd_info=thread_infos.get()))
   {
     protocol->prepare_for_resend();
@@ -1941,6 +1969,8 @@ void mysqld_list_processes(THD *thd,const char *user, bool verbose)
     protocol->store(thd_info->state_info, system_charset_info);
     protocol->store(thd_info->query_string.str(),
                     thd_info->query_string.charset());
+    if (!thd->variables.old_mode)
+      protocol->store(thd_info->progress, 3, &store_buffer);
     if (protocol->write())
       break; /* purecov: inspected */
   }
@@ -1953,7 +1983,7 @@ int fill_schema_processlist(THD* thd, TABLE_LIST* tables, COND* cond)
   TABLE *table= tables->table;
   CHARSET_INFO *cs= system_charset_info;
   char *user;
-  ulonglong unow= my_micro_time();
+  my_hrtime_t unow= my_hrtime();
   DBUG_ENTER("fill_process_list");
 
   user= thd->security_ctx->master_access & PROCESS_ACL ?
@@ -1971,6 +2001,7 @@ int fill_schema_processlist(THD* thd, TABLE_LIST* tables, COND* cond)
       Security_context *tmp_sctx= tmp->security_ctx;
       struct st_my_thread_var *mysys_var;
       const char *val;
+      ulonglong max_counter;
 
       if ((!tmp->vio_ok() && !tmp->system_thread) ||
           (user && (!tmp_sctx->user || strcmp(tmp_sctx->user, user))))
@@ -2012,8 +2043,10 @@ int fill_schema_processlist(THD* thd, TABLE_LIST* tables, COND* cond)
         table->field[4]->store(command_name[tmp->command].str,
                                command_name[tmp->command].length, cs);
       /* MYSQL_TIME */
-      const ulonglong utime= tmp->start_utime ? unow - tmp->start_utime : 0;
-      table->field[5]->store(utime / 1000000, TRUE);
+      const ulonglong utime= (tmp->start_time ?
+                              (unow.val - tmp->start_time * HRTIME_RESOLUTION -
+                               tmp->start_time_sec_part) : 0);
+      table->field[5]->store(utime / HRTIME_RESOLUTION, TRUE);
       /* STATE */
       if ((val= thread_state_info(tmp)))
       {
@@ -2025,6 +2058,9 @@ int fill_schema_processlist(THD* thd, TABLE_LIST* tables, COND* cond)
         mysql_mutex_unlock(&mysys_var->mutex);
       mysql_mutex_unlock(&tmp->LOCK_thd_data);
 
+      /* TIME_MS */
+      table->field[8]->store((double)(utime / (HRTIME_RESOLUTION / 1000.0)));
+
       /* INFO */
       /* Lock THD mutex that protects its data when looking at it. */
       mysql_mutex_lock(&tmp->LOCK_thd_data);
@@ -2035,10 +2071,19 @@ int fill_schema_processlist(THD* thd, TABLE_LIST* tables, COND* cond)
                                    tmp->query_length()), cs);
         table->field[7]->set_notnull();
       }
-      mysql_mutex_unlock(&tmp->LOCK_thd_data);
 
-      /* TIME_MS */
-      table->field[8]->store((double)(utime / 1000.0));
+      /*
+        Progress report. We need to do this under a lock to ensure that all
+        values are read from the same stage.
+      */
+      if ((max_counter= tmp->progress.max_counter))
+      {
+        table->field[9]->store((longlong) tmp->progress.stage + 1, 1);
+        table->field[10]->store((longlong) tmp->progress.max_stage, 1);
+        table->field[11]->store((double) tmp->progress.counter /
+                                (double) max_counter*100.0);
+      }
+      mysql_mutex_unlock(&tmp->LOCK_thd_data);
 
       if (schema_table_store_record(thd, table))
       {
@@ -2765,7 +2810,7 @@ typedef struct st_lookup_field_values
 bool schema_table_store_record(THD *thd, TABLE *table)
 {
   int error;
-  if ((error= table->file->ha_write_row(table->record[0])))
+  if ((error= table->file->ha_write_tmp_row(table->record[0])))
   {
     TMP_TABLE_PARAM *param= table->pos_in_table_list->schema_table_param;
     if (create_internal_tmp_table_from_heap(thd, table, param->start_recinfo, 
@@ -2957,7 +3002,7 @@ bool uses_only_table_name_fields(Item *item, TABLE_LIST *table)
   else if (item->type() == Item::REF_ITEM)
     return uses_only_table_name_fields(item->real_item(), table);
 
-  if (item->type() == Item::SUBSELECT_ITEM && !item->const_item())
+  if (item->real_type() == Item::SUBSELECT_ITEM && !item->const_item())
     return 0;
 
   return 1;
@@ -3472,7 +3517,8 @@ fill_schema_table_by_open(THD *thd, bool is_show_fields_or_keys,
                                          (MYSQL_OPEN_IGNORE_FLUSH |
                                           MYSQL_OPEN_FORCE_SHARED_HIGH_PRIO_MDL |
                                           (can_deadlock ?
-                                           MYSQL_OPEN_FAIL_ON_MDL_CONFLICT : 0)));
+                                           MYSQL_OPEN_FAIL_ON_MDL_CONFLICT : 0)),
+                                         DT_PREPARE | DT_CREATE);
   /*
     Restore old value of sql_command back as it is being looked at in
     process_table() function.
@@ -3863,7 +3909,6 @@ static int fill_schema_table_from_frm(THD *thd, TABLE_LIST *tables,
     res= schema_table->process_table(thd, &table_list, table,
                                      res, db_name, table_name);
     free_root(&tbl.mem_root, MYF(0));
-    my_free((void *) tbl.alias);
   }
 
 end_share:
@@ -4462,21 +4507,21 @@ static int get_schema_tables_record(THD *thd, TABLE_LIST *tables,
       {
         thd->variables.time_zone->gmt_sec_to_TIME(&time,
                                                   (my_time_t) file->stats.create_time);
-        table->field[14]->store_time(&time, MYSQL_TIMESTAMP_DATETIME);
+        table->field[14]->store_time(&time);
         table->field[14]->set_notnull();
       }
       if (file->stats.update_time)
       {
         thd->variables.time_zone->gmt_sec_to_TIME(&time,
                                                   (my_time_t) file->stats.update_time);
-        table->field[15]->store_time(&time, MYSQL_TIMESTAMP_DATETIME);
+        table->field[15]->store_time(&time);
         table->field[15]->set_notnull();
       }
       if (file->stats.check_time)
       {
         thd->variables.time_zone->gmt_sec_to_TIME(&time,
                                                   (my_time_t) file->stats.check_time);
-        table->field[16]->store_time(&time, MYSQL_TIMESTAMP_DATETIME);
+        table->field[16]->store_time(&time);
         table->field[16]->set_notnull();
       }
       if (file->ha_table_flags() & (HA_HAS_OLD_CHECKSUM | HA_HAS_NEW_CHECKSUM))
@@ -4522,8 +4567,8 @@ err:
   @return         void
 */
 
-void store_column_type(TABLE *table, Field *field, CHARSET_INFO *cs,
-                       uint offset)
+static void store_column_type(TABLE *table, Field *field, CHARSET_INFO *cs,
+                              uint offset)
 {
   bool is_blob;
   int decimals, field_length;
@@ -4533,8 +4578,8 @@ void store_column_type(TABLE *table, Field *field, CHARSET_INFO *cs,
 
   field->sql_type(column_type);
   /* DTD_IDENTIFIER column */
-  table->field[offset + 7]->store(column_type.ptr(), column_type.length(), cs);
-  table->field[offset + 7]->set_notnull();
+  table->field[offset + 8]->store(column_type.ptr(), column_type.length(), cs);
+  table->field[offset + 8]->set_notnull();
   /*
     DATA_TYPE column:
     MySQL column type has the following format:
@@ -4547,7 +4592,7 @@ void store_column_type(TABLE *table, Field *field, CHARSET_INFO *cs,
       if there is no dimention part then check the presence of
       [unsigned] [zerofill] attributes and cut them of if exist.
     */
-    tmp_buff= strchr(column_type.ptr(), ' ');
+    tmp_buff= strchr(column_type.c_ptr_safe(), ' ');
   table->field[offset]->store(column_type.ptr(),
                               (tmp_buff ? tmp_buff - column_type.ptr() :
                                column_type.length()), cs);
@@ -4576,6 +4621,7 @@ void store_column_type(TABLE *table, Field *field, CHARSET_INFO *cs,
     They are set to -1 if they should not be set (we should return NULL)
   */
 
+  field_length= -1;
   decimals= field->decimals();
   switch (field->type()) {
   case MYSQL_TYPE_NEWDECIMAL:
@@ -4604,8 +4650,14 @@ void store_column_type(TABLE *table, Field *field, CHARSET_INFO *cs,
     if (decimals == NOT_FIXED_DEC)
       decimals= -1;                           // return NULL
     break;
+  case MYSQL_TYPE_TIME:
+  case MYSQL_TYPE_TIMESTAMP:
+  case MYSQL_TYPE_DATETIME:
+    /* DATETIME_PRECISION column */
+    table->field[offset + 5]->store((longlong) field->decimals(), TRUE);
+    table->field[offset + 5]->set_notnull();
+    break;
   default:
-    field_length= decimals= -1;
     break;
   }
 
@@ -4614,23 +4666,24 @@ void store_column_type(TABLE *table, Field *field, CHARSET_INFO *cs,
   {
     table->field[offset + 3]->store((longlong) field_length, TRUE);
     table->field[offset + 3]->set_notnull();
-  }
-  /* NUMERIC_SCALE column */
-  if (decimals >= 0)
-  {
-    table->field[offset + 4]->store((longlong) decimals, TRUE);
-    table->field[offset + 4]->set_notnull();
+
+    /* NUMERIC_SCALE column */
+    if (decimals >= 0)
+    {
+      table->field[offset + 4]->store((longlong) decimals, TRUE);
+      table->field[offset + 4]->set_notnull();
+    }
   }
   if (field->has_charset())
   {
     /* CHARACTER_SET_NAME column*/
     tmp_buff= field->charset()->csname;
-    table->field[offset + 5]->store(tmp_buff, strlen(tmp_buff), cs);
-    table->field[offset + 5]->set_notnull();
-    /* COLLATION_NAME column */
-    tmp_buff= field->charset()->name;
     table->field[offset + 6]->store(tmp_buff, strlen(tmp_buff), cs);
     table->field[offset + 6]->set_notnull();
+    /* COLLATION_NAME column */
+    tmp_buff= field->charset()->name;
+    table->field[offset + 7]->store(tmp_buff, strlen(tmp_buff), cs);
+    table->field[offset + 7]->set_notnull();
   }
 }
 
@@ -4706,7 +4759,7 @@ static int get_schema_column_record(THD *thd, TABLE_LIST *tables,
         end=strmov(end,grant_types.type_names[bitnr]);
       }
     }
-    table->field[17]->store(tmp+1,end == tmp ? 0 : (uint) (end-tmp-1), cs);
+    table->field[18]->store(tmp+1,end == tmp ? 0 : (uint) (end-tmp-1), cs);
 
 #endif
     table->field[0]->store(STRING_WITH_LEN("def"), cs);
@@ -4715,8 +4768,6 @@ static int get_schema_column_record(THD *thd, TABLE_LIST *tables,
     table->field[3]->store(field->field_name, strlen(field->field_name),
                            cs);
     table->field[4]->store((longlong) count, TRUE);
-    field->sql_type(type);
-    table->field[14]->store(type.ptr(), type.length(), cs);
 
     if (get_field_default_value(thd, timestamp_field, field, &type, 0))
     {
@@ -4730,19 +4781,23 @@ static int get_schema_column_record(THD *thd, TABLE_LIST *tables,
     pos=(uchar*) ((field->flags & PRI_KEY_FLAG) ? "PRI" :
                  (field->flags & UNIQUE_KEY_FLAG) ? "UNI" :
                  (field->flags & MULTIPLE_KEY_FLAG) ? "MUL":"");
-    table->field[15]->store((const char*) pos,
+    table->field[16]->store((const char*) pos,
                             strlen((const char*) pos), cs);
 
     if (field->unireg_check == Field::NEXT_NUMBER)
-      table->field[16]->store(STRING_WITH_LEN("auto_increment"), cs);
+      table->field[17]->store(STRING_WITH_LEN("auto_increment"), cs);
     if (timestamp_field == field &&
         field->unireg_check != Field::TIMESTAMP_DN_FIELD)
-      table->field[16]->store(STRING_WITH_LEN("on update CURRENT_TIMESTAMP"),
+      table->field[17]->store(STRING_WITH_LEN("on update CURRENT_TIMESTAMP"),
                               cs);
     if (field->vcol_info)
-      table->field[16]->store(STRING_WITH_LEN("VIRTUAL"), cs);
-
-    table->field[18]->store(field->comment.str, field->comment.length, cs);
+    {
+      if (field->stored_in_db)
+        table->field[17]->store(STRING_WITH_LEN("PERSISTENT"), cs);
+      else
+        table->field[17]->store(STRING_WITH_LEN("VIRTUAL"), cs);
+    }
+    table->field[19]->store(field->comment.str, field->comment.length, cs);
     if (schema_table_store_record(thd, table))
       DBUG_RETURN(1);
   }
@@ -5030,7 +5085,7 @@ bool store_schema_params(THD *thd, TABLE *table, TABLE *proc_table,
       table->field[3]->store((longlong) 0, TRUE);
       get_field(thd->mem_root, proc_table->field[MYSQL_PROC_MYSQL_TYPE],
                 &tmp_string);
-      table->field[14]->store(tmp_string.ptr(), tmp_string.length(), cs);
+      table->field[15]->store(tmp_string.ptr(), tmp_string.length(), cs);
       field_def= &sp->m_return_field_def;
       field= make_field(&share, (uchar*) 0, field_def->length,
                         (uchar*) "", 0, field_def->pack_flag,
@@ -5083,7 +5138,7 @@ bool store_schema_params(THD *thd, TABLE *table, TABLE *proc_table,
       table->field[5]->set_notnull();
       get_field(thd->mem_root, proc_table->field[MYSQL_PROC_MYSQL_TYPE],
                 &tmp_string);
-      table->field[14]->store(tmp_string.ptr(), tmp_string.length(), cs);
+      table->field[15]->store(tmp_string.ptr(), tmp_string.length(), cs);
 
       field= make_field(&share, (uchar*) 0, field_def->length,
                         (uchar*) "", 0, field_def->pack_flag,
@@ -5200,40 +5255,40 @@ bool store_schema_proc(THD *thd, TABLE *table, TABLE *proc_table,
 
       if (full_access)
       {
-        copy_field_as_string(table->field[14],
+        copy_field_as_string(table->field[15],
                              proc_table->field[MYSQL_PROC_FIELD_BODY_UTF8]);
-        table->field[14]->set_notnull();
+        table->field[15]->set_notnull();
       }
-      table->field[13]->store(STRING_WITH_LEN("SQL"), cs);
-      table->field[17]->store(STRING_WITH_LEN("SQL"), cs);
-      copy_field_as_string(table->field[18],
+      table->field[14]->store(STRING_WITH_LEN("SQL"), cs);
+      table->field[18]->store(STRING_WITH_LEN("SQL"), cs);
+      copy_field_as_string(table->field[19],
                            proc_table->field[MYSQL_PROC_FIELD_DETERMINISTIC]);
-      table->field[19]->store(sp_data_access_name[enum_idx].str, 
+      table->field[20]->store(sp_data_access_name[enum_idx].str, 
                               sp_data_access_name[enum_idx].length , cs);
-      copy_field_as_string(table->field[21],
+      copy_field_as_string(table->field[22],
                            proc_table->field[MYSQL_PROC_FIELD_SECURITY_TYPE]);
 
       bzero((char *)&time, sizeof(time));
       ((Field_timestamp *) proc_table->field[MYSQL_PROC_FIELD_CREATED])->
         get_time(&time);
-      table->field[22]->store_time(&time, MYSQL_TIMESTAMP_DATETIME);
+      table->field[23]->store_time(&time);
       bzero((char *)&time, sizeof(time));
       ((Field_timestamp *) proc_table->field[MYSQL_PROC_FIELD_MODIFIED])->
         get_time(&time);
-      table->field[23]->store_time(&time, MYSQL_TIMESTAMP_DATETIME);
-      copy_field_as_string(table->field[24],
-                           proc_table->field[MYSQL_PROC_FIELD_SQL_MODE]);
+      table->field[24]->store_time(&time);
       copy_field_as_string(table->field[25],
+                           proc_table->field[MYSQL_PROC_FIELD_SQL_MODE]);
+      copy_field_as_string(table->field[26],
                            proc_table->field[MYSQL_PROC_FIELD_COMMENT]);
 
-      table->field[26]->store(definer.ptr(), definer.length(), cs);
-      copy_field_as_string(table->field[27],
+      table->field[27]->store(definer.ptr(), definer.length(), cs);
+      copy_field_as_string(table->field[28],
                            proc_table->
                            field[MYSQL_PROC_FIELD_CHARACTER_SET_CLIENT]);
-      copy_field_as_string(table->field[28],
+      copy_field_as_string(table->field[29],
                            proc_table->
                            field[MYSQL_PROC_FIELD_COLLATION_CONNECTION]);
-      copy_field_as_string(table->field[29],
+      copy_field_as_string(table->field[30],
 			   proc_table->field[MYSQL_PROC_FIELD_DB_COLLATION]);
 
       return schema_table_store_record(thd, table);
@@ -5935,21 +5990,21 @@ static void store_schema_partitions_record(THD *thd, TABLE *schema_table,
   {
     thd->variables.time_zone->gmt_sec_to_TIME(&time,
                                               (my_time_t)stat_info.create_time);
-    table->field[18]->store_time(&time, MYSQL_TIMESTAMP_DATETIME);
+    table->field[18]->store_time(&time);
     table->field[18]->set_notnull();
   }
   if (stat_info.update_time)
   {
     thd->variables.time_zone->gmt_sec_to_TIME(&time,
                                               (my_time_t)stat_info.update_time);
-    table->field[19]->store_time(&time, MYSQL_TIMESTAMP_DATETIME);
+    table->field[19]->store_time(&time);
     table->field[19]->set_notnull();
   }
   if (stat_info.check_time)
   {
     thd->variables.time_zone->gmt_sec_to_TIME(&time,
                                               (my_time_t)stat_info.check_time);
-    table->field[20]->store_time(&time, MYSQL_TIMESTAMP_DATETIME);
+    table->field[20]->store_time(&time);
     table->field[20]->set_notnull();
   }
   if (file->ha_table_flags() & (HA_HAS_OLD_CHECKSUM | HA_HAS_NEW_CHECKSUM))
@@ -6302,7 +6357,7 @@ copy_event_to_schema_table(THD *thd, TABLE *sch_table, TABLE *event_table)
 
   if (et.load_from_row(thd, event_table))
   {
-    my_error(ER_CANNOT_LOAD_FROM_TABLE, MYF(0), event_table->alias);
+    my_error(ER_CANNOT_LOAD_FROM_TABLE, MYF(0), event_table->alias.c_ptr());
     DBUG_RETURN(1);
   }
 
@@ -6364,15 +6419,13 @@ copy_event_to_schema_table(THD *thd, TABLE *sch_table, TABLE *event_table)
     /* starts & ends . STARTS is always set - see sql_yacc.yy */
     et.time_zone->gmt_sec_to_TIME(&time, et.starts);
     sch_table->field[ISE_STARTS]->set_notnull();
-    sch_table->field[ISE_STARTS]->
-                                store_time(&time, MYSQL_TIMESTAMP_DATETIME);
+    sch_table->field[ISE_STARTS]->store_time(&time);
 
     if (!et.ends_null)
     {
       et.time_zone->gmt_sec_to_TIME(&time, et.ends);
       sch_table->field[ISE_ENDS]->set_notnull();
-      sch_table->field[ISE_ENDS]->
-                                store_time(&time, MYSQL_TIMESTAMP_DATETIME);
+      sch_table->field[ISE_ENDS]->store_time(&time);
     }
   }
   else
@@ -6382,8 +6435,7 @@ copy_event_to_schema_table(THD *thd, TABLE *sch_table, TABLE *event_table)
 
     et.time_zone->gmt_sec_to_TIME(&time, et.execute_at);
     sch_table->field[ISE_EXECUTE_AT]->set_notnull();
-    sch_table->field[ISE_EXECUTE_AT]->
-                          store_time(&time, MYSQL_TIMESTAMP_DATETIME);
+    sch_table->field[ISE_EXECUTE_AT]->store_time(&time);
   }
 
   /* status */
@@ -6413,21 +6465,19 @@ copy_event_to_schema_table(THD *thd, TABLE *sch_table, TABLE *event_table)
     sch_table->field[ISE_ON_COMPLETION]->
                                 store(STRING_WITH_LEN("PRESERVE"), scs);
 
-  number_to_datetime(et.created, &time, 0, &not_used);
+  number_to_datetime(et.created, 0, &time, 0, &not_used);
   DBUG_ASSERT(not_used==0);
-  sch_table->field[ISE_CREATED]->store_time(&time, MYSQL_TIMESTAMP_DATETIME);
+  sch_table->field[ISE_CREATED]->store_time(&time);
 
-  number_to_datetime(et.modified, &time, 0, &not_used);
+  number_to_datetime(et.modified, 0, &time, 0, &not_used);
   DBUG_ASSERT(not_used==0);
-  sch_table->field[ISE_LAST_ALTERED]->
-                                store_time(&time, MYSQL_TIMESTAMP_DATETIME);
+  sch_table->field[ISE_LAST_ALTERED]->store_time(&time);
 
   if (et.last_executed)
   {
     et.time_zone->gmt_sec_to_TIME(&time, et.last_executed);
     sch_table->field[ISE_LAST_EXECUTED]->set_notnull();
-    sch_table->field[ISE_LAST_EXECUTED]->
-                       store_time(&time, MYSQL_TIMESTAMP_DATETIME);
+    sch_table->field[ISE_LAST_EXECUTED]->store_time(&time);
   }
 
   sch_table->field[ISE_EVENT_COMMENT]->
@@ -6834,14 +6884,23 @@ TABLE *create_schema_table(THD *thd, TABLE_LIST *table_list)
       item->unsigned_flag= (fields_info->field_flags & MY_I_S_UNSIGNED);
       break;
     case MYSQL_TYPE_DATE:
+      if (!(item=new Item_return_date_time(fields_info->field_name,
+                                           MAX_DATE_WIDTH,
+                                           fields_info->field_type)))
+        DBUG_RETURN(0);
+      break;
     case MYSQL_TYPE_TIME:
+      if (!(item=new Item_return_date_time(fields_info->field_name,
+                                           MAX_TIME_FULL_WIDTH,
+                                           fields_info->field_type)))
+        DBUG_RETURN(0);
+      break;
     case MYSQL_TYPE_TIMESTAMP:
     case MYSQL_TYPE_DATETIME:
       if (!(item=new Item_return_date_time(fields_info->field_name,
+                                           MAX_DATETIME_WIDTH,
                                            fields_info->field_type)))
-      {
         DBUG_RETURN(0);
-      }
       break;
     case MYSQL_TYPE_FLOAT:
     case MYSQL_TYPE_DOUBLE:
@@ -7027,7 +7086,7 @@ int make_table_names_old_format(THD *thd, ST_SCHEMA_TABLE *schema_table)
 
 int make_columns_old_format(THD *thd, ST_SCHEMA_TABLE *schema_table)
 {
-  int fields_arr[]= {3, 14, 13, 6, 15, 5, 16, 17, 18, -1};
+  int fields_arr[]= {3, 15, 14, 6, 16, 5, 17, 18, 19, -1};
   int *field_num= fields_arr;
   ST_FIELD_INFO *field_info;
   Name_resolution_context *context= &thd->lex->select_lex.context;
@@ -7035,9 +7094,9 @@ int make_columns_old_format(THD *thd, ST_SCHEMA_TABLE *schema_table)
   for (; *field_num >= 0; field_num++)
   {
     field_info= &schema_table->fields_info[*field_num];
-    if (!thd->lex->verbose && (*field_num == 13 ||
-                               *field_num == 17 ||
-                               *field_num == 18))
+    if (!thd->lex->verbose && (*field_num == 14 ||
+                               *field_num == 18 ||
+                               *field_num == 19))
       continue;
     Item_field *field= new Item_field(context,
                                       NullS, NullS, field_info->field_name);
@@ -7081,7 +7140,7 @@ int make_character_sets_old_format(THD *thd, ST_SCHEMA_TABLE *schema_table)
 
 int make_proc_old_format(THD *thd, ST_SCHEMA_TABLE *schema_table)
 {
-  int fields_arr[]= {2, 3, 4, 26, 23, 22, 21, 25, 27, 28, 29, -1};
+  int fields_arr[]= {2, 3, 4, 27, 24, 23, 22, 26, 28, 29, 30, -1};
   int *field_num= fields_arr;
   ST_FIELD_INFO *field_info;
   Name_resolution_context *context= &thd->lex->select_lex.context;
@@ -7328,13 +7387,14 @@ static bool do_fill_table(THD *thd,
 bool get_schema_tables_result(JOIN *join,
                               enum enum_schema_table_state executed_place)
 {
-  JOIN_TAB *tmp_join_tab= join->join_tab+join->tables;
   THD *thd= join->thd;
   LEX *lex= thd->lex;
   bool result= 0;
   DBUG_ENTER("get_schema_tables_result");
 
-  for (JOIN_TAB *tab= join->join_tab; tab < tmp_join_tab; tab++)
+  for (JOIN_TAB *tab= first_linear_tab(join, WITH_CONST_TABLES); 
+       tab; 
+       tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
   {
     if (!tab->table || !tab->table->pos_in_table_list)
       break;
@@ -7592,6 +7652,8 @@ ST_FIELD_INFO columns_fields_info[]=
    0, (MY_I_S_MAYBE_NULL | MY_I_S_UNSIGNED), 0, OPEN_FRM_ONLY},
   {"NUMERIC_SCALE", MY_INT64_NUM_DECIMAL_DIGITS , MYSQL_TYPE_LONGLONG,
    0, (MY_I_S_MAYBE_NULL | MY_I_S_UNSIGNED), 0, OPEN_FRM_ONLY},
+  {"DATETIME_PRECISION", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
+   0, (MY_I_S_MAYBE_NULL | MY_I_S_UNSIGNED), 0, OPEN_FRM_ONLY},
   {"CHARACTER_SET_NAME", MY_CS_NAME_SIZE, MYSQL_TYPE_STRING, 0, 1, 0,
    OPEN_FRM_ONLY},
   {"COLLATION_NAME", MY_CS_NAME_SIZE, MYSQL_TYPE_STRING, 0, 1, "Collation",
@@ -7708,6 +7770,8 @@ ST_FIELD_INFO proc_fields_info[]=
   {"CHARACTER_OCTET_LENGTH", 21 , MYSQL_TYPE_LONG, 0, 1, 0, SKIP_OPEN_TABLE},
   {"NUMERIC_PRECISION", 21 , MYSQL_TYPE_LONG, 0, 1, 0, SKIP_OPEN_TABLE},
   {"NUMERIC_SCALE", 21 , MYSQL_TYPE_LONG, 0, 1, 0, SKIP_OPEN_TABLE},
+  {"DATETIME_PRECISION", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
+   0, (MY_I_S_MAYBE_NULL | MY_I_S_UNSIGNED), 0, OPEN_FRM_ONLY},
   {"CHARACTER_SET_NAME", 64, MYSQL_TYPE_STRING, 0, 1, 0, SKIP_OPEN_TABLE},
   {"COLLATION_NAME", 64, MYSQL_TYPE_STRING, 0, 1, 0, SKIP_OPEN_TABLE},
   {"DTD_IDENTIFIER", 65535, MYSQL_TYPE_STRING, 0, 1, 0, SKIP_OPEN_TABLE},
@@ -7995,6 +8059,10 @@ ST_FIELD_INFO processlist_fields_info[]=
    SKIP_OPEN_TABLE},
   {"TIME_MS", 100 * (MY_INT64_NUM_DECIMAL_DIGITS + 1) + 3, MYSQL_TYPE_DECIMAL,
    0, 0, "Time_ms", SKIP_OPEN_TABLE},
+  {"STAGE", 2, MYSQL_TYPE_TINY,  0, 0, "Stage", SKIP_OPEN_TABLE},
+  {"MAX_STAGE", 2, MYSQL_TYPE_TINY,  0, 0, "Max_stage", SKIP_OPEN_TABLE},
+  {"PROGRESS", 703, MYSQL_TYPE_DECIMAL,  0, 0, "Progress",
+   SKIP_OPEN_TABLE},
   {0, 0, MYSQL_TYPE_STRING, 0, 0, 0, SKIP_OPEN_TABLE}
 };
 
@@ -8122,6 +8190,8 @@ ST_FIELD_INFO parameters_fields_info[]=
   {"CHARACTER_OCTET_LENGTH", 21 , MYSQL_TYPE_LONG, 0, 1, 0, OPEN_FULL_TABLE},
   {"NUMERIC_PRECISION", 21 , MYSQL_TYPE_LONG, 0, 1, 0, OPEN_FULL_TABLE},
   {"NUMERIC_SCALE", 21 , MYSQL_TYPE_LONG, 0, 1, 0, OPEN_FULL_TABLE},
+  {"DATETIME_PRECISION", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG,
+   0, (MY_I_S_MAYBE_NULL | MY_I_S_UNSIGNED), 0, OPEN_FRM_ONLY},
   {"CHARACTER_SET_NAME", 64, MYSQL_TYPE_STRING, 0, 1, 0, OPEN_FULL_TABLE},
   {"COLLATION_NAME", 64, MYSQL_TYPE_STRING, 0, 1, 0, OPEN_FULL_TABLE},
   {"DTD_IDENTIFIER", 65535, MYSQL_TYPE_STRING, 0, 0, 0, OPEN_FULL_TABLE},
diff --git a/sql/sql_sort.h b/sql/sql_sort.h
index 98a45f14a36..4cb2f30d9de 100644
--- a/sql/sql_sort.h
+++ b/sql/sql_sort.h
@@ -72,6 +72,7 @@ typedef struct st_sort_param {
   uint addon_length;        /* Length of added packed fields */
   uint res_length;          /* Length of records in final sorted file/buffer */
   uint keys;				/* Max keys / buffer */
+  uint min_dupl_count;
   ha_rows max_rows,examined_rows;
   TABLE *sort_form;			/* For quicker make_sortkey */
   SORT_FIELD *local_sortorder;
@@ -95,6 +96,10 @@ int merge_buffers(SORTPARAM *param,IO_CACHE *from_file,
 		  IO_CACHE *to_file, uchar *sort_buffer,
 		  BUFFPEK *lastbuff,BUFFPEK *Fb,
 		  BUFFPEK *Tb,int flag);
+int merge_index(SORTPARAM *param, uchar *sort_buffer,
+		BUFFPEK *buffpek, uint maxbuffer,
+		IO_CACHE *tempfile, IO_CACHE *outfile);
+
 void reuse_freed_buff(QUEUE *queue, BUFFPEK *reuse, uint key_length);
 
 #endif /* SQL_SORT_INCLUDED */
diff --git a/sql/sql_string.cc b/sql/sql_string.cc
index f6025390cb2..e615bf7d4df 100644
--- a/sql/sql_string.cc
+++ b/sql/sql_string.cc
@@ -386,7 +386,7 @@ bool String::append(const String &s)
 {
   if (s.length())
   {
-    if (realloc(str_length+s.length()))
+    if (realloc_with_extra_if_needed(str_length+s.length()))
       return TRUE;
     memcpy(Ptr+str_length,s.ptr(),s.length());
     str_length+=s.length();
@@ -411,7 +411,7 @@ bool String::append(const char *s,uint32 arg_length)
   {
     uint32 add_length=arg_length * str_charset->mbmaxlen;
     uint dummy_errors;
-    if (realloc(str_length+ add_length))
+    if (realloc_with_extra_if_needed(str_length+ add_length))
       return TRUE;
     str_length+= copy_and_convert(Ptr+str_length, add_length, str_charset,
 				  s, arg_length, &my_charset_latin1,
@@ -422,7 +422,7 @@ bool String::append(const char *s,uint32 arg_length)
   /*
     For an ASCII compatinble string we can just append.
   */
-  if (realloc(str_length+arg_length))
+  if (realloc_with_extra_if_needed(str_length+arg_length))
     return TRUE;
   memcpy(Ptr+str_length,s,arg_length);
   str_length+=arg_length;
@@ -477,14 +477,14 @@ bool String::append(const char *s,uint32 arg_length, CHARSET_INFO *cs)
 
     add_length= arg_length / cs->mbminlen * str_charset->mbmaxlen;
     uint dummy_errors;
-    if (realloc(str_length + add_length)) 
+    if (realloc_with_extra_if_needed(str_length + add_length)) 
       return TRUE;
     str_length+= copy_and_convert(Ptr+str_length, add_length, str_charset,
 				  s, arg_length, cs, &dummy_errors);
   }
   else
   {
-    if (realloc(str_length + arg_length)) 
+    if (realloc_with_extra_if_needed(str_length + arg_length)) 
       return TRUE;
     memcpy(Ptr + str_length, s, arg_length);
     str_length+= arg_length;
@@ -494,7 +494,7 @@ bool String::append(const char *s,uint32 arg_length, CHARSET_INFO *cs)
 
 bool String::append(IO_CACHE* file, uint32 arg_length)
 {
-  if (realloc(str_length+arg_length))
+  if (realloc_with_extra_if_needed(str_length+arg_length))
     return TRUE;
   if (my_b_read(file, (uchar*) Ptr + str_length, arg_length))
   {
@@ -510,7 +510,7 @@ bool String::append_with_prefill(const char *s,uint32 arg_length,
 {
   int t_length= arg_length > full_length ? arg_length : full_length;
 
-  if (realloc(str_length + t_length))
+  if (realloc_with_extra_if_needed(str_length + t_length))
     return TRUE;
   t_length= full_length - arg_length;
   if (t_length > 0)
@@ -527,11 +527,11 @@ uint32 String::numchars()
   return str_charset->cset->numchars(str_charset, Ptr, Ptr+str_length);
 }
 
-int String::charpos(int i,uint32 offset)
+int String::charpos(longlong i,uint32 offset)
 {
   if (i <= 0)
-    return i;
-  return str_charset->cset->charpos(str_charset,Ptr+offset,Ptr+str_length,i);
+    return (int)i;
+  return (int)str_charset->cset->charpos(str_charset,Ptr+offset,Ptr+str_length,(size_t)i);
 }
 
 int String::strstr(const String &s,uint32 offset)
@@ -619,7 +619,7 @@ bool String::replace(uint32 offset,uint32 arg_length,
     {
       if (diff)
       {
-	if (realloc(str_length+(uint32) diff))
+	if (realloc_with_extra_if_needed(str_length+(uint32) diff))
 	  return TRUE;
 	bmove_upp((uchar*) Ptr+str_length+diff, (uchar*) Ptr+str_length,
 		  str_length-offset-arg_length);
diff --git a/sql/sql_string.h b/sql/sql_string.h
index e5e7bfb1e0c..71ad0960924 100644
--- a/sql/sql_string.h
+++ b/sql/sql_string.h
@@ -57,23 +57,24 @@ uint convert_to_printable(char *to, size_t to_len,
 class String
 {
   char *Ptr;
-  uint32 str_length,Alloced_length;
+  uint32 str_length,Alloced_length, extra_alloc;
   bool alloced;
   CHARSET_INFO *str_charset;
 public:
   String()
   { 
-    Ptr=0; str_length=Alloced_length=0; alloced=0; 
+    Ptr=0; str_length=Alloced_length=extra_alloc=0; alloced=0; 
     str_charset= &my_charset_bin; 
   }
   String(uint32 length_arg)
   { 
-    alloced=0; Alloced_length=0; (void) real_alloc(length_arg); 
+    alloced=0; Alloced_length= extra_alloc= 0; (void) real_alloc(length_arg); 
     str_charset= &my_charset_bin;
   }
   String(const char *str, CHARSET_INFO *cs)
   { 
-    Ptr=(char*) str; str_length=(uint) strlen(str); Alloced_length=0; alloced=0;
+    Ptr=(char*) str; str_length= (uint32) strlen(str);
+    Alloced_length= extra_alloc= 0; alloced=0;
     str_charset=cs;
   }
   /*
@@ -83,18 +84,18 @@ public:
   */
   String(const char *str,uint32 len, CHARSET_INFO *cs)
   { 
-    Ptr=(char*) str; str_length=len; Alloced_length=0; alloced=0;
+    Ptr=(char*) str; str_length=len; Alloced_length= extra_alloc=0; alloced=0;
     str_charset=cs;
   }
   String(char *str,uint32 len, CHARSET_INFO *cs)
   { 
-    Ptr=(char*) str; Alloced_length=str_length=len; alloced=0;
+    Ptr=(char*) str; Alloced_length=str_length=len; extra_alloc= 0; alloced=0;
     str_charset=cs;
   }
   String(const String &str)
   { 
     Ptr=str.Ptr ; str_length=str.str_length ;
-    Alloced_length=str.Alloced_length; alloced=0; 
+    Alloced_length=str.Alloced_length; extra_alloc= 0; alloced=0; 
     str_charset=str.str_charset;
   }
   static void *operator new(size_t size, MEM_ROOT *mem_root) throw ()
@@ -114,8 +115,10 @@ public:
   inline CHARSET_INFO *charset() const { return str_charset; }
   inline uint32 length() const { return str_length;}
   inline uint32 alloced_length() const { return Alloced_length;}
+  inline uint32 extra_allocation() const { return extra_alloc;}
   inline char& operator [] (uint32 i) const { return Ptr[i]; }
   inline void length(uint32 len) { str_length=len ; }
+  inline void extra_allocation(uint32 len) { extra_alloc= len; }
   inline bool is_empty() const { return (str_length == 0); }
   inline void mark_as_const() { Alloced_length= 0;}
   inline const char *ptr() const { return Ptr; }
@@ -152,11 +155,9 @@ public:
   {
     DBUG_ASSERT(&str != this);
     free();
-    Ptr=(char*) str.ptr()+offset; str_length=arg_length; alloced=0;
+    Ptr=(char*) str.ptr()+offset; str_length=arg_length;
     if (str.Alloced_length)
       Alloced_length=str.Alloced_length-offset;
-    else
-      Alloced_length=0;
     str_charset=str.str_charset;
   }
 
@@ -172,13 +173,13 @@ public:
   inline void set(char *str,uint32 arg_length, CHARSET_INFO *cs)
   {
     free();
-    Ptr=(char*) str; str_length=Alloced_length=arg_length ; alloced=0;
+    Ptr=(char*) str; str_length=Alloced_length=arg_length;
     str_charset=cs;
   }
   inline void set(const char *str,uint32 arg_length, CHARSET_INFO *cs)
   {
     free();
-    Ptr=(char*) str; str_length=arg_length; Alloced_length=0 ; alloced=0;
+    Ptr=(char*) str; str_length=arg_length;
     str_charset=cs;
   }
   bool set_ascii(const char *str, uint32 arg_length);
@@ -197,6 +198,18 @@ public:
   { return set_int((longlong)num, true, cs); }
   bool set_real(double num,uint decimals, CHARSET_INFO *cs);
 
+  /* Take over a buffer owned by another object; String frees it from now on */
+  void reassociate(char *ptr, uint32 length, uint32 alloced_length,
+                   CHARSET_INFO *cs)
+  { 
+    free();
+    Ptr= ptr;
+    str_length= length;
+    Alloced_length= alloced_length;
+    str_charset= cs;
+    alloced= ptr != 0;
+  }
+
   /*
     PMG 2004.11.12
     This is a method that works the same as perl's "chop". It simply
@@ -229,11 +242,11 @@ public:
     if (alloced)
     {
       alloced=0;
-      Alloced_length=0;
       my_free(Ptr);
-      Ptr=0;
-      str_length=0;				/* Safety */
     }
+    Alloced_length= extra_alloc= 0;
+    Ptr=0;
+    str_length=0;				/* Safety */
   }
   inline bool alloc(uint32 arg_length)
   {
@@ -243,9 +256,21 @@ public:
   }
   bool real_alloc(uint32 arg_length);			// Empties old string
   bool realloc(uint32 arg_length);
-  inline void shrink(uint32 arg_length)		// Shrink buffer
+  bool realloc_with_extra(uint32 arg_length)
+  {
+    if (extra_alloc < 4096)
+      extra_alloc= extra_alloc*2+128;
+    return realloc(arg_length + extra_alloc);
+  }
+  bool realloc_with_extra_if_needed(uint32 arg_length)
   {
     if (arg_length < Alloced_length)
+      return 0;
+    return realloc_with_extra(arg_length);
+  }
+  inline void shrink(uint32 arg_length)		// Shrink buffer
+  {
+    if (ALIGN_SIZE(arg_length+1) < Alloced_length)
     {
       char *new_ptr;
       if (!(new_ptr=(char*) my_realloc(Ptr,arg_length,MYF(0))))
@@ -272,7 +297,6 @@ public:
       DBUG_ASSERT(!s.uses_buffer_owned_by(this));
       free();
       Ptr=s.Ptr ; str_length=s.str_length ; Alloced_length=s.Alloced_length;
-      alloced=0;
     }
     return *this;
   }
@@ -288,6 +312,14 @@ public:
   bool set_or_copy_aligned(const char *s, uint32 arg_length, CHARSET_INFO *cs);
   bool copy(const char*s,uint32 arg_length, CHARSET_INFO *csfrom,
 	    CHARSET_INFO *csto, uint *errors);
+  void move(String &s)
+  {
+    free();
+    Ptr=s.Ptr ; str_length=s.str_length ; Alloced_length=s.Alloced_length;
+    extra_alloc= s.extra_alloc;
+    alloced= s.alloced;
+    s.alloced= 0;
+  }
   bool append(const String &s);
   bool append(const char *s);
   bool append(LEX_STRING *ls)
@@ -312,7 +344,7 @@ public:
     }
     else
     {
-      if (realloc(str_length+1))
+      if (realloc_with_extra(str_length + 1))
 	return 1;
       Ptr[str_length++]=chr;
     }
@@ -323,8 +355,9 @@ public:
   friend int sortcmp(const String *a,const String *b, CHARSET_INFO *cs);
   friend int stringcmp(const String *a,const String *b);
   friend String *copy_if_not_alloced(String *a,String *b,uint32 arg_length);
+  friend class Field;
   uint32 numchars();
-  int charpos(int i,uint32 offset=0);
+  int charpos(longlong i,uint32 offset=0);
 
   int reserve(uint32 space_needed)
   {
@@ -369,7 +402,7 @@ public:
 
   void qs_append(const char *str)
   {
-    qs_append(str, strlen(str));
+    qs_append(str, (uint32)strlen(str));
   }
   void qs_append(const char *str, uint32 len);
   void qs_append(double d);
@@ -438,8 +471,9 @@ public:
   }
 };
 
-static inline bool check_if_only_end_space(CHARSET_INFO *cs, char *str, 
-                                           char *end)
+static inline bool check_if_only_end_space(CHARSET_INFO *cs,
+                                           const char *str, 
+                                           const char *end)
 {
   return str+ cs->cset->scan(cs, str, end, MY_SEQ_SPACES) == end;
 }
diff --git a/sql/sql_table.cc b/sql/sql_table.cc
index c23a956fad6..2ead7a167e5 100644
--- a/sql/sql_table.cc
+++ b/sql/sql_table.cc
@@ -1,4 +1,5 @@
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2011, Oracle and/or its affiliates.
+   Copyright (c) 2010, 2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -60,17 +61,16 @@ const char *primary_key_name="PRIMARY";
 
 static bool check_if_keyname_exists(const char *name,KEY *start, KEY *end);
 static char *make_unique_key_name(const char *field_name,KEY *start,KEY *end);
-static int copy_data_between_tables(TABLE *from,TABLE *to,
-                                    List<Create_field> &create, bool ignore,
-				    uint order_num, ORDER *order,
-				    ha_rows *copied,ha_rows *deleted,
-                                    enum enum_enable_or_disable keys_onoff,
-                                    bool error_if_not_empty);
+static int copy_data_between_tables(THD *thd, TABLE *,TABLE *,
+                                    List<Create_field> &, bool,
+				    uint, ORDER *, ha_rows *,ha_rows *,
+                                    enum enum_enable_or_disable, bool);
 
 static bool prepare_blob_field(THD *thd, Create_field *sql_field);
 static bool check_engine(THD *, const char *, HA_CREATE_INFO *);
 static int mysql_prepare_create_table(THD *, HA_CREATE_INFO *, Alter_info *,
-                              bool, uint *, handler *, KEY **, uint *, int);
+                                      bool, uint *, handler *, KEY **, uint *,
+                                      int);
 
 /**
   @brief Helper function for explain_filename
@@ -2271,7 +2271,7 @@ err:
   {
     if (!foreign_key_error)
       my_printf_error(ER_BAD_TABLE_ERROR, ER(ER_BAD_TABLE_ERROR), MYF(0),
-                      wrong_tables.c_ptr());
+                      wrong_tables.c_ptr_safe());
     else
       my_message(ER_ROW_IS_REFERENCED, ER(ER_ROW_IS_REFERENCED), MYF(0));
     error= 1;
@@ -3915,6 +3915,68 @@ static bool check_if_created_table_can_be_opened(THD *thd,
 }
 
 
+/**
+  Check whether an frm file already exists for the given table
+
+  @param old_path        path to the old frm file, or NULL if there is none
+  @param path            path to the frm file in new encoding
+  @param db              database name
+  @param table_name      table name
+  @param alias           table name for error message (for new encoding)
+  @param issue_error     should we issue error messages
+
+  @retval FALSE there is no frm file
+  @retval TRUE  there is frm file
+*/
+
+bool check_table_file_presence(char *old_path,
+                               char *path,
+                               const char *db,
+                               const char *table_name,
+                               const char *alias,
+                               bool issue_error)
+{
+  if (!access(path,F_OK))
+  {
+    if (issue_error)
+      my_error(ER_TABLE_EXISTS_ERROR,MYF(0),alias);
+    return TRUE;
+  }
+  {
+    /*
+      Check if a file for this table in 5.0 file name encoding exists,
+
+      except when that file is old_path itself (i.e. it is the same table).
+    */
+    char tbl50[FN_REFLEN];
+#ifdef _WIN32
+    if (check_if_legal_tablename(table_name) != 0)
+    {
+      /*
+       Check for reserved device names for which access() returns 0
+       (CON, AUX etc).
+      */
+      return FALSE;
+    }
+#endif
+    strxmov(tbl50, mysql_data_home, "/", db, "/", table_name, NullS);
+    fn_format(tbl50, tbl50, "", reg_ext, MY_UNPACK_FILENAME);
+    if (!access(tbl50, F_OK) &&
+        (old_path == NULL ||
+         strcmp(old_path, tbl50) != 0))
+    {
+      if (issue_error)
+      {
+        strxmov(tbl50, MYSQL50_TABLE_NAME_PREFIX, table_name, NullS);
+        my_error(ER_TABLE_EXISTS_ERROR, MYF(0), tbl50);
+      }
+      return TRUE;
+    }
+  }
+  return FALSE;
+}
+
+
 /*
   Create a table
 
@@ -4179,33 +4241,31 @@ bool mysql_create_table_no_lock(THD *thd,
       find_temporary_table(thd, db, table_name))
   {
     if (create_info->options & HA_LEX_CREATE_IF_NOT_EXISTS)
-    {
-      push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_NOTE,
-                          ER_TABLE_EXISTS_ERROR, ER(ER_TABLE_EXISTS_ERROR),
-                          alias);
-      error= 0;
-      goto err;
-    }
+      goto warn;
     my_error(ER_TABLE_EXISTS_ERROR, MYF(0), alias);
     goto err;
   }
 
   /* Give warnings for not supported table options */
-  if (create_info->transactional && !file->ht->commit)
-    push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-                        ER_ILLEGAL_HA_CREATE_OPTION,
-                        ER(ER_ILLEGAL_HA_CREATE_OPTION),
-                        file->engine_name()->str,
-                        "TRANSACTIONAL=1");
-
+#if defined(WITH_ARIA_STORAGE_ENGINE)
+  extern handlerton *maria_hton;
+  if (file->ht != maria_hton)
+#endif
+    if (create_info->transactional)
+      push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+                          ER_ILLEGAL_HA_CREATE_OPTION,
+                          ER(ER_ILLEGAL_HA_CREATE_OPTION),
+                          file->engine_name()->str,
+                          "TRANSACTIONAL=1");
 
   if (!internal_tmp_table && !(create_info->options & HA_LEX_CREATE_TMP_TABLE))
   {
-    if (!access(path,F_OK))
+    if (check_table_file_presence(NULL, path, db, table_name, table_name,
+                                  !(create_info->options &
+                                    HA_LEX_CREATE_IF_NOT_EXISTS)))
     {
       if (create_info->options & HA_LEX_CREATE_IF_NOT_EXISTS)
         goto warn;
-      my_error(ER_TABLE_EXISTS_ERROR,MYF(0),table_name);
       goto err;
     }
     /*
@@ -4882,8 +4942,8 @@ is_index_maintenance_unique (TABLE *table, Alter_info *alter_info)
     that need to be dropped and/or (re-)created.
 
   RETURN VALUES
-    TRUE   error
-    FALSE  success
+    TRUE   The tables are not compatible; We have to do a full alter table
+    FALSE  The tables are compatible; We only have to modify the .frm
 */
 
 bool
@@ -4996,10 +5056,10 @@ mysql_compare_tables(TABLE *table,
     DBUG_RETURN(0);
   }
 
-  if ((create_info->fileds_option_struct=
-       (void**)thd->calloc(sizeof(void*) * table->s->fields)) == NULL ||
-      (create_info->indexes_option_struct=
-       (void**)thd->calloc(sizeof(void*) * table->s->keys)) == NULL)
+  if ((create_info->fields_option_struct= (ha_field_option_struct**)
+         thd->calloc(sizeof(void*) * table->s->fields)) == NULL ||
+      (create_info->indexes_option_struct= (ha_index_option_struct**)
+         thd->calloc(sizeof(void*) * table->s->keys)) == NULL)
     DBUG_RETURN(1);
 
   /*
@@ -5020,7 +5080,10 @@ mysql_compare_tables(TABLE *table,
        tmp_new_field= tmp_new_field_it++)
   {
     DBUG_ASSERT(i < table->s->fields);
-    create_info->fileds_option_struct[i]= tmp_new_field->option_struct;
+    create_info->fields_option_struct[i]= tmp_new_field->option_struct;
+
+    /* reset common markers of how field changed */
+    field->flags&= ~(FIELD_IS_RENAMED | FIELD_IN_ADD_INDEX);
 
     /* Make sure we have at least the default charset in use. */
     if (!new_field->charset)
@@ -5056,7 +5119,6 @@ mysql_compare_tables(TABLE *table,
         create_info->table_options|= HA_OPTION_PACK_RECORD;
 
     /* Check if field was renamed */
-    field->flags&= ~FIELD_IS_RENAMED;
     if (my_strcasecmp(system_charset_info,
 		      field->field_name,
 		      tmp_new_field->field_name))
@@ -5069,8 +5131,6 @@ mysql_compare_tables(TABLE *table,
                           new_field->field_name));
       DBUG_RETURN(0);
     }
-    // Clear indexed marker
-    field->flags&= ~FIELD_IN_ADD_INDEX;
     changes|= tmp;
   }
 
@@ -5401,7 +5461,7 @@ mysql_prepare_alter_table(THD *thd, TABLE *table,
   for (f_ptr=table->field ; (field= *f_ptr) ; f_ptr++)
   {
     Alter_drop *drop;
-    if (field->type() == MYSQL_TYPE_STRING)
+    if (field->type() == MYSQL_TYPE_VARCHAR)
       create_info->varchar= TRUE;
     /* Check if field should be dropped */
     drop_it.rewind();
@@ -5438,9 +5498,7 @@ mysql_prepare_alter_table(THD *thd, TABLE *table,
       def->field=field;
       if (field->stored_in_db != def->stored_in_db)
       {
-        my_error(ER_UNSUPPORTED_ACTION_ON_VIRTUAL_COLUMN,
-                 MYF(0),
-                 "Changing the STORED status");
+        my_error(ER_UNSUPPORTED_ACTION_ON_VIRTUAL_COLUMN, MYF(0));
         goto err;
       }
       if (!def->after)
@@ -5743,6 +5801,7 @@ err:
       order_num        How many ORDER BY fields has been specified.
       order            List of fields to ORDER BY.
       ignore           Whether we have ALTER IGNORE TABLE
+      require_online   Give an error if the operation can't be done online
 
   DESCRIPTION
     This is a veery long function and is everything but the kitchen sink :)
@@ -5773,13 +5832,15 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name,
                        HA_CREATE_INFO *create_info,
                        TABLE_LIST *table_list,
                        Alter_info *alter_info,
-                       uint order_num, ORDER *order, bool ignore)
+                       uint order_num, ORDER *order, bool ignore,
+                       bool require_online)
 {
   TABLE *table, *new_table= 0;
   MDL_ticket *mdl_ticket;
   MDL_request target_mdl_request;
   int error= 0;
   char tmp_name[80],old_name[32],new_name_buff[FN_REFLEN + 1];
+  char old_name_buff[FN_REFLEN + 1];
   char new_alias_buff[FN_REFLEN], *table_name, *db, *new_alias, *alias;
   char index_file[FN_REFLEN], data_file[FN_REFLEN];
   char path[FN_REFLEN + 1];
@@ -5969,10 +6030,12 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name,
         */
         build_table_filename(new_name_buff, sizeof(new_name_buff) - 1,
                              new_db, new_name_buff, reg_ext, 0);
-        if (!access(new_name_buff, F_OK))
+        build_table_filename(old_name_buff, sizeof(old_name_buff) - 1,
+                             db, table_name, reg_ext, 0);
+        if (check_table_file_presence(old_name_buff, new_name_buff, new_db,
+                                      new_name, new_alias, TRUE))
 	{
 	  /* Table will be closed in do_command() */
-	  my_error(ER_TABLE_EXISTS_ERROR, MYF(0), new_alias);
           goto err;
 	}
       }
@@ -6073,8 +6136,8 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name,
     {
       error= 0;
       push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_NOTE,
-                          ER_ILLEGAL_HA, ER(ER_ILLEGAL_HA),
-                          table->alias);
+			  ER_ILLEGAL_HA, ER(ER_ILLEGAL_HA),
+			  table->alias.c_ptr());
     }
 
     if (!error && (new_name != table_name || new_db != db))
@@ -6124,8 +6187,8 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name,
     {
       error= 0;
       push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_NOTE,
-                          ER_ILLEGAL_HA, ER(ER_ILLEGAL_HA),
-                          table->alias);
+			  ER_ILLEGAL_HA, ER(ER_ILLEGAL_HA),
+			  table->alias.c_ptr());
     }
 
     if (!error)
@@ -6345,6 +6408,16 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name,
           /* Non-primary unique key. */
           needed_inplace_with_read_flags|= HA_INPLACE_ADD_UNIQUE_INDEX_NO_WRITE;
           needed_inplace_flags|= HA_INPLACE_ADD_UNIQUE_INDEX_NO_READ_WRITE;
+          if (ignore)
+          {
+            /*
+              If ignore is used, we have to remove all duplicate rows,
+              which requires a full table copy.
+            */
+            need_copy_table= ALTER_TABLE_DATA_CHANGED;
+            pk_changed= 2;                      // Don't change need_copy_table
+            break;
+          }
         }
       }
       else
@@ -6515,10 +6588,23 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name,
     */
   }
 
+  /* Check if we can do the ALTER TABLE as online */
+  if (require_online)
+  {
+    if (index_add_count || index_drop_count ||
+        (new_table &&
+         !(new_table->file->ha_table_flags() & HA_NO_COPY_ON_ALTER)))
+    {
+      my_error(ER_CANT_DO_ONLINE, MYF(0), "ALTER");
+      goto err_new_table_cleanup;
+    }
+  }
+
   /* Copy the data if necessary. */
   thd->count_cuted_fields= CHECK_FIELD_WARN;	// calc cuted fields
   thd->cuted_fields=0L;
   copied=deleted=0;
+
   /*
     We do not copy data for MERGE tables. Only the children have data.
     MERGE tables have HA_NO_COPY_ON_ALTER set.
@@ -6528,12 +6614,11 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name,
     /* We don't want update TIMESTAMP fields during ALTER TABLE. */
     new_table->timestamp_field_type= TIMESTAMP_NO_AUTO_SET;
     new_table->next_number_field=new_table->found_next_number_field;
-    thd_proc_info(thd, "copy to tmp table");
     DBUG_EXECUTE_IF("abort_copy_table", {
         my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
         goto err_new_table_cleanup;
       });
-    error= copy_data_between_tables(table, new_table,
+    error= copy_data_between_tables(thd, table, new_table,
                                     alter_info->create_list, ignore,
                                     order_num, order, &copied, &deleted,
                                     alter_info->keys_onoff,
@@ -6673,6 +6758,8 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name,
   }
   /*end of if (! new_table) for add/drop index*/
 
+  DBUG_ASSERT(error == 0);
+
   if (table->s->tmp_table != NO_TMP_TABLE)
   {
     /*
@@ -7028,7 +7115,7 @@ bool mysql_trans_commit_alter_copy_data(THD *thd)
 
 
 static int
-copy_data_between_tables(TABLE *from,TABLE *to,
+copy_data_between_tables(THD *thd, TABLE *from,TABLE *to,
 			 List<Create_field> &create,
                          bool ignore,
 			 uint order_num, ORDER *order,
@@ -7040,7 +7127,6 @@ copy_data_between_tables(TABLE *from,TABLE *to,
   int error= 1, errpos= 0;
   Copy_field *copy= NULL, *copy_end;
   ha_rows found_count= 0, delete_count= 0;
-  THD *thd= current_thd;
   uint length= 0;
   SORT_FIELD *sortorder;
   READ_RECORD info;
@@ -7050,11 +7136,14 @@ copy_data_between_tables(TABLE *from,TABLE *to,
   ha_rows examined_rows;
   bool auto_increment_field_copied= 0;
   ulong save_sql_mode= thd->variables.sql_mode;
-  ulonglong prev_insert_id;
+  ulonglong prev_insert_id, time_to_report_progress;
   List_iterator<Create_field> it(create);
   Create_field *def;
   DBUG_ENTER("copy_data_between_tables");
 
+  /* 2 or 3 stages: sorting (if ORDER BY), copying data, updating indexes */
+  thd_progress_init(thd, 2 + test(order));
+
   if (mysql_trans_prepare_alter_copy_data(thd))
     goto err;
   errpos=1;
@@ -7103,7 +7192,8 @@ copy_data_between_tables(TABLE *from,TABLE *to,
 
   if (order)
   {
-    if (to->s->primary_key != MAX_KEY && to->file->primary_key_is_clustered())
+    if (to->s->primary_key != MAX_KEY &&
+        to->file->ha_table_flags() & HA_TABLE_SCAN_ON_INDEX)
     {
       char warn_buff[MYSQL_ERRMSG_SIZE];
       my_snprintf(warn_buff, sizeof(warn_buff), 
@@ -7121,6 +7211,7 @@ copy_data_between_tables(TABLE *from,TABLE *to,
       tables.alias= tables.table_name= from->s->table_name.str;
       tables.db= from->s->db.str;
 
+      thd_proc_info(thd, "Sorting");
       if (thd->lex->select_lex.setup_ref_array(thd, order_num) ||
           setup_order(thd, thd->lex->select_lex.ref_pointer_array,
                       &tables, fields, all_fields, order) ||
@@ -7131,8 +7222,10 @@ copy_data_between_tables(TABLE *from,TABLE *to,
           HA_POS_ERROR)
         goto err;
     }
-  };
+    thd_progress_next_stage(thd);
+  }
 
+  thd_proc_info(thd, "copy to tmp table");
   /* Tell handler that we have values for all columns in the to table */
   to->use_all_columns();
   to->mark_virtual_columns_for_write(TRUE);
@@ -7143,6 +7236,10 @@ copy_data_between_tables(TABLE *from,TABLE *to,
     to->file->extra(HA_EXTRA_IGNORE_DUP_KEY);
   thd->warning_info->reset_current_row_for_warning();
   restore_record(to, s->default_values);        // Create empty record
+
+  thd->progress.max_counter= from->file->records();
+  time_to_report_progress= MY_HOW_OFTEN_TO_WRITE/10;
+
   while (!(error=info.read_record(&info)))
   {
     if (thd->killed)
@@ -7152,6 +7249,13 @@ copy_data_between_tables(TABLE *from,TABLE *to,
       break;
     }
     update_virtual_fields(thd, from);
+    if (++thd->progress.counter >= time_to_report_progress)
+    {
+      time_to_report_progress+= MY_HOW_OFTEN_TO_WRITE/10;
+      thd_progress_report(thd, thd->progress.counter,
+                          thd->progress.max_counter);
+    }
+
     /* Return error if source table isn't empty. */
     if (error_if_not_empty)
     {
@@ -7216,6 +7320,9 @@ err:
   free_io_cache(from);
   delete [] copy;
 
+  thd_proc_info(thd, "Enabling keys");
+  thd_progress_next_stage(thd);
+
   if (error > 0)
     to->file->extra(HA_EXTRA_PREPARE_FOR_DROP);
   if (errpos >= 3 && to->file->ha_end_bulk_insert() && error <= 0)
@@ -7278,7 +7385,7 @@ bool mysql_recreate_table(THD *thd, TABLE_LIST *table_list)
   alter_info.flags= (ALTER_CHANGE_COLUMN | ALTER_RECREATE);
   DBUG_RETURN(mysql_alter_table(thd, NullS, NullS, &create_info,
                                 table_list, &alter_info, 0,
-                                (ORDER *) 0, 0));
+                                (ORDER *) 0, 0, 0));
 }
 
 
@@ -7324,8 +7431,8 @@ bool mysql_checksum_table(THD *thd, TABLE_LIST *tables,
     {
       /* Call ->checksum() if the table checksum matches 'old_mode' settings */
       if (!(check_opt->flags & T_EXTEND) &&
-          (((t->file->ha_table_flags() & HA_HAS_OLD_CHECKSUM) && old_mode) ||
-           ((t->file->ha_table_flags() & HA_HAS_NEW_CHECKSUM) && !old_mode)))
+          (((t->file->ha_table_flags() & HA_HAS_OLD_CHECKSUM) && thd->variables.old_mode) ||
+           ((t->file->ha_table_flags() & HA_HAS_NEW_CHECKSUM) && !thd->variables.old_mode)))
 	protocol->store((ulonglong)t->file->checksum());
       else if (check_opt->flags & T_QUICK)
 	protocol->store_null();
@@ -7375,7 +7482,7 @@ bool mysql_checksum_table(THD *thd, TABLE_LIST *tables,
 	    {
 	      Field *f= t->field[i];
 
-              if (! old_mode && f->is_real_null(0))
+              if (! thd->variables.old_mode && f->is_real_null(0))
                 continue;
              /*
                BLOB and VARCHAR have pointers in their field, we must convert
diff --git a/sql/sql_table.h b/sql/sql_table.h
index 9d2f246dbf5..333fc7431c0 100644
--- a/sql/sql_table.h
+++ b/sql/sql_table.h
@@ -131,6 +131,9 @@ uint build_table_filename(char *buff, size_t bufflen, const char *db,
                           const char *table, const char *ext, uint flags);
 uint build_table_shadow_filename(char *buff, size_t bufflen,
                                  ALTER_PARTITION_PARAM_TYPE *lpt);
+bool check_table_file_presence(char *old_path, char *path, const char *db,
+                               const char *table_name, const char *alias,
+                               bool issue_error);
 bool mysql_create_table(THD *thd, TABLE_LIST *create_table,
                         HA_CREATE_INFO *create_info,
                         Alter_info *alter_info);
@@ -149,7 +152,8 @@ bool mysql_alter_table(THD *thd, char *new_db, char *new_name,
                        HA_CREATE_INFO *create_info,
                        TABLE_LIST *table_list,
                        Alter_info *alter_info,
-                       uint order_num, ORDER *order, bool ignore);
+                       uint order_num, ORDER *order, bool ignore,
+                       bool require_online);
 bool mysql_compare_tables(TABLE *table,
                           Alter_info *alter_info,
                           HA_CREATE_INFO *create_info,
diff --git a/sql/sql_test.cc b/sql/sql_test.cc
index d567624ba44..9cf16e2bc6a 100644
--- a/sql/sql_test.cc
+++ b/sql/sql_test.cc
@@ -59,19 +59,20 @@ static const char *lock_descriptions[] =
 void
 print_where(COND *cond,const char *info, enum_query_type query_type)
 {
-  char buff[256];
+  char buff[1024];
   String str(buff,(uint32) sizeof(buff), system_charset_info);
   str.length(0);
+  str.extra_allocation(1024);
   if (cond)
     cond->print(&str, query_type);
-  str.append('\0');
 
   DBUG_LOCK_FILE;
   (void) fprintf(DBUG_FILE,"\nWHERE:(%s) %p ", info, cond);
-  (void) fputs(str.ptr(),DBUG_FILE);
+  (void) fputs(str.c_ptr_safe(),DBUG_FILE);
   (void) fputc('\n',DBUG_FILE);
   DBUG_UNLOCK_FILE;
 }
+
 	/* This is for debugging purposes */
 
 
@@ -168,10 +169,9 @@ void TEST_filesort(SORT_FIELD *sortorder,uint s_length)
       out.append(str);
     }
   }
-  out.append('\0');				// Purify doesn't like c_ptr()
   DBUG_LOCK_FILE;
   (void) fputs("\nInfo about FILESORT\n",DBUG_FILE);
-  fprintf(DBUG_FILE,"Sortorder: %s\n",out.ptr());
+  fprintf(DBUG_FILE,"Sortorder: %s\n",out.c_ptr_safe());
   DBUG_UNLOCK_FILE;
   DBUG_VOID_RETURN;
 }
@@ -180,58 +180,66 @@ void TEST_filesort(SORT_FIELD *sortorder,uint s_length)
 void
 TEST_join(JOIN *join)
 {
-  uint i,ref;
+  uint ref;
+  int i;
+  List_iterator<JOIN_TAB_RANGE> it(join->join_tab_ranges);
+  JOIN_TAB_RANGE *jt_range;
   DBUG_ENTER("TEST_join");
 
-  /*
-    Assemble results of all the calls to full_name() first,
-    in order not to garble the tabular output below.
-  */
-  String ref_key_parts[MAX_TABLES];
-  for (i= 0; i < join->tables; i++)
-  {
-    JOIN_TAB *tab= join->join_tab + i;
-    for (ref= 0; ref < tab->ref.key_parts; ref++)
-    {
-      ref_key_parts[i].append(tab->ref.items[ref]->full_name());
-      ref_key_parts[i].append("  ");
-    }
-  }
-
   DBUG_LOCK_FILE;
   (void) fputs("\nInfo about JOIN\n",DBUG_FILE);
-  for (i=0 ; i < join->tables ; i++)
+  while ((jt_range= it++))
   {
-    JOIN_TAB *tab=join->join_tab+i;
-    TABLE *form=tab->table;
-    char key_map_buff[128];
-    fprintf(DBUG_FILE,"%-16.16s  type: %-7s  q_keys: %s  refs: %d  key: %d  len: %d\n",
-	    form->alias,
-	    join_type_str[tab->type],
-	    tab->keys.print(key_map_buff),
-	    tab->ref.key_parts,
-	    tab->ref.key,
-	    tab->ref.key_length);
-    if (tab->select)
+    /*
+      Assemble results of all the calls to full_name() first,
+      in order not to garble the tabular output below.
+    */
+    String ref_key_parts[MAX_TABLES];
+    int tables_in_range= jt_range->end - jt_range->start;
+    for (i= 0; i < tables_in_range; i++)
     {
-      char buf[MAX_KEY/8+1];
-      if (tab->use_quick == 2)
-	fprintf(DBUG_FILE,
-		"                  quick select checked for each record (keys: %s)\n",
-		tab->select->quick_keys.print(buf));
-      else if (tab->select->quick)
+      JOIN_TAB *tab= jt_range->start + i;
+      for (ref= 0; ref < tab->ref.key_parts; ref++)
       {
-	fprintf(DBUG_FILE, "                  quick select used:\n");
-        tab->select->quick->dbug_dump(18, FALSE);
+        ref_key_parts[i].append(tab->ref.items[ref]->full_name());
+        ref_key_parts[i].append("  ");
       }
-      else
-	(void) fputs("                  select used\n",DBUG_FILE);
     }
-    if (tab->ref.key_parts)
+
+    for (i= 0; i < tables_in_range; i++)
     {
-      fprintf(DBUG_FILE,
-              "                  refs:  %s\n", ref_key_parts[i].ptr());
+      JOIN_TAB *tab= jt_range->start + i;
+      TABLE *form=tab->table;
+      char key_map_buff[128];
+      fprintf(DBUG_FILE,"%-16.16s  type: %-7s  q_keys: %s  refs: %d  key: %d  len: %d\n",
+	    form->alias.c_ptr(),
+              join_type_str[tab->type],
+              tab->keys.print(key_map_buff),
+              tab->ref.key_parts,
+              tab->ref.key,
+              tab->ref.key_length);
+      if (tab->select)
+      {
+        char buf[MAX_KEY/8+1];
+        if (tab->use_quick == 2)
+          fprintf(DBUG_FILE,
+                  "                  quick select checked for each record (keys: %s)\n",
+                  tab->select->quick_keys.print(buf));
+        else if (tab->select->quick)
+        {
+          fprintf(DBUG_FILE, "                  quick select used:\n");
+          tab->select->quick->dbug_dump(18, FALSE);
+        }
+        else
+          (void)fputs("                  select used\n",DBUG_FILE);
+      }
+      if (tab->ref.key_parts)
+      {
+        fprintf(DBUG_FILE,
+              "                  refs:  %s\n", ref_key_parts[i].c_ptr_safe());
+      }
     }
+    (void)fputs("\n",DBUG_FILE);
   }
   DBUG_UNLOCK_FILE;
   DBUG_VOID_RETURN;
@@ -245,21 +253,25 @@ void print_keyuse(KEYUSE *keyuse)
   char buff[256];
   char buf2[64]; 
   const char *fieldname;
+  JOIN_TAB *join_tab= keyuse->table->reginfo.join_tab;
+  KEY *key_info= join_tab->get_keyinfo_by_key_no(keyuse->key);
   String str(buff,(uint32) sizeof(buff), system_charset_info);
   str.length(0);
   keyuse->val->print(&str, QT_ORDINARY);
   str.append('\0');
-  if (keyuse->keypart == FT_KEYPART)
+  if (keyuse->is_for_hash_join())
+    fieldname= keyuse->table->field[keyuse->keypart]->field_name;
+  else if (keyuse->keypart == FT_KEYPART)
     fieldname= "FT_KEYPART";
   else
-    fieldname= keyuse->table->key_info[keyuse->key].key_part[keyuse->keypart].field->field_name;
+    fieldname= key_info->key_part[keyuse->keypart].field->field_name;
   ll2str(keyuse->used_tables, buf2, 16, 0); 
   DBUG_LOCK_FILE;
-  fprintf(DBUG_FILE, "KEYUSE: %s.%s=%s  optimize= %d used_tables=%s "
-          "ref_table_rows= %lu keypart_map= %0lx\n",
-          keyuse->table->alias, fieldname, str.ptr(),
-          keyuse->optimize, buf2, (ulong)keyuse->ref_table_rows, 
-          keyuse->keypart_map);
+  fprintf(DBUG_FILE, "KEYUSE: %s.%s=%s  optimize: %u  used_tables: %s "
+          "ref_table_rows: %lu  keypart_map: %0lx\n",
+          keyuse->table->alias.c_ptr(), fieldname, str.ptr(),
+          (uint) keyuse->optimize, buf2, (ulong) keyuse->ref_table_rows, 
+          (ulong) keyuse->keypart_map);
   DBUG_UNLOCK_FILE;
   //key_part_map keypart_map; --?? there can be several? 
 }
@@ -385,7 +397,7 @@ void print_sjm(SJ_MATERIALIZATION_INFO *sjm)
   for (uint i= 0;i < sjm->tables; i++)
   {
     fprintf(DBUG_FILE, "    %s%s\n", 
-            sjm->positions[i].table->table->alias,
+            sjm->positions[i].table->table->alias.c_ptr(),
             (i == sjm->tables -1)? "": ",");
   }
   fprintf(DBUG_FILE, "  }\n");
diff --git a/sql/sql_time.cc b/sql/sql_time.cc
index de3bd35b46c..ce50fdb345b 100644
--- a/sql/sql_time.cc
+++ b/sql/sql_time.cc
@@ -1,4 +1,5 @@
 /* Copyright (C) 2000-2006 MySQL AB
+   Copyright (c) 2009-2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -252,8 +253,9 @@ to_ascii(CHARSET_INFO *cs,
 
 
 /* Character set-aware version of str_to_time() */
-bool str_to_time(CHARSET_INFO *cs, const char *str,uint length,
-                 MYSQL_TIME *l_time, int *warning)
+timestamp_type
+str_to_time(CHARSET_INFO *cs, const char *str,uint length,
+                 MYSQL_TIME *l_time, ulong fuzzydate, int *warning)
 {
   char cnv[32];
   if ((cs->state & MY_CS_NONASCII) != 0)
@@ -261,7 +263,7 @@ bool str_to_time(CHARSET_INFO *cs, const char *str,uint length,
     length= to_ascii(cs, str, length, cnv, sizeof(cnv));
     str= cnv;
   }
-  return str_to_time(str, length, l_time, warning);
+  return str_to_time(str, length, l_time, fuzzydate, warning);
 }
 
 
@@ -291,76 +293,132 @@ timestamp_type str_to_datetime(CHARSET_INFO *cs,
 timestamp_type
 str_to_datetime_with_warn(CHARSET_INFO *cs,
                           const char *str, uint length, MYSQL_TIME *l_time,
-                          uint flags)
+                          ulong flags)
 {
   int was_cut;
   THD *thd= current_thd;
   timestamp_type ts_type;
   
   ts_type= str_to_datetime(cs, str, length, l_time,
-                           (flags | (thd->variables.sql_mode &
-                                     (MODE_INVALID_DATES |
-                                      MODE_NO_ZERO_DATE))),
+                           (flags | (sql_mode_for_dates(thd))),
                            &was_cut);
   if (was_cut || ts_type <= MYSQL_TIMESTAMP_ERROR)
-    make_truncated_value_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-                                 str, length, ts_type,  NullS);
+    make_truncated_value_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+                                 str, length, flags & TIME_TIME_ONLY ?
+                                 MYSQL_TIMESTAMP_TIME : ts_type, NullS);
   return ts_type;
 }
 
 
-/*
-  Convert a datetime from broken-down MYSQL_TIME representation to corresponding 
-  TIMESTAMP value.
+/**
+  Converts a pair of numbers (integer part, microseconds) to MYSQL_TIME.
 
-  SYNOPSIS
-    TIME_to_timestamp()
-      thd             - current thread
-      t               - datetime in broken-down representation, 
-      in_dst_time_gap - pointer to bool which is set to true if t represents
-                        value which doesn't exists (falls into the spring 
-                        time-gap) or to false otherwise.
-   
-  RETURN
-     Number seconds in UTC since start of Unix Epoch corresponding to t.
-     0 - t contains datetime value which is out of TIMESTAMP range.
-     
+  @param neg           sign of the time value
+  @param nr            integer part of the number to convert
+  @param sec_part      microsecond part of the number
+  @param ltime         converted value will be written here
+  @param fuzzydate     conversion flags (TIME_FUZZY_DATE, etc)
+  @param str           original number, as an ErrConv. For the warning
+  @param field_name    field name or NULL if not a field. For the warning
+  
+  @returns 0 for success, 1 for a failure
 */
-my_time_t TIME_to_timestamp(THD *thd, const MYSQL_TIME *t, my_bool *in_dst_time_gap)
+static bool number_to_time_with_warn(bool neg, ulonglong nr, ulong sec_part,
+                                     MYSQL_TIME *ltime, ulong fuzzydate,
+                                     const ErrConv *str,
+                                     const char *field_name)
 {
-  my_time_t timestamp;
+  int was_cut;
+  longlong res;
+  enum_field_types f_type;
 
-  *in_dst_time_gap= 0;
-  thd->time_zone_used= 1;
+  if (fuzzydate & TIME_TIME_ONLY)
+  {
+    f_type= MYSQL_TYPE_TIME;
+    res= number_to_time(neg, nr, sec_part, ltime, &was_cut);
+  }
+  else
+  {
+    f_type= MYSQL_TYPE_DATETIME;
+    res= neg ? -1 : number_to_datetime(nr, sec_part, ltime, fuzzydate, &was_cut);
+  }
 
-  timestamp= thd->variables.time_zone->TIME_to_gmt_sec(t, in_dst_time_gap);
-  if (timestamp)
+  if (res < 0 || (was_cut && !(fuzzydate & TIME_FUZZY_DATE)))
   {
-    return timestamp;
+    make_truncated_value_warning(current_thd,
+                                 MYSQL_ERROR::WARN_LEVEL_WARN, str,
+                                 res < 0 ? MYSQL_TIMESTAMP_ERROR
+                                         : mysql_type_to_time_type(f_type),
+                                 field_name);
   }
+  return res < 0;
+}
+
 
-  /* If we are here we have range error. */
-  return(0);
+bool double_to_datetime_with_warn(double value, MYSQL_TIME *ltime,
+                                  ulong fuzzydate, const char *field_name)
+{
+  const ErrConvDouble str(value);
+  bool neg= value < 0;
+
+  if (neg)
+    value= -value;
+
+  if (value > LONGLONG_MAX)
+    value= static_cast<double>(LONGLONG_MAX);
+
+  longlong nr= static_cast<ulonglong>(floor(value));
+  uint sec_part= static_cast<ulong>((value - floor(value))*TIME_SECOND_PART_FACTOR);
+  return number_to_time_with_warn(neg, nr, sec_part, ltime, fuzzydate, &str,
+                                  field_name);
+}
+
+
+bool decimal_to_datetime_with_warn(const my_decimal *value, MYSQL_TIME *ltime,
+                                   ulong fuzzydate, const char *field_name)
+{
+  const ErrConvDecimal str(value);
+  ulonglong nr;
+  ulong sec_part;
+  bool neg= my_decimal2seconds(value, &nr, &sec_part);
+  return number_to_time_with_warn(neg, nr, sec_part, ltime, fuzzydate, &str,
+                                  field_name);
+}
+
+
+bool int_to_datetime_with_warn(longlong value, MYSQL_TIME *ltime,
+                               ulong fuzzydate, const char *field_name)
+{
+  const ErrConvInteger str(value);
+  bool neg= value < 0;
+  return number_to_time_with_warn(neg, neg ? -value : value, 0, ltime,
+                                  fuzzydate, &str, field_name);
 }
 
 
 /*
-  Convert a time string to a MYSQL_TIME struct and produce a warning
-  if string was cut during conversion.
+  Convert a datetime from broken-down MYSQL_TIME representation to
+  corresponding TIMESTAMP value.
 
-  NOTE
-    See str_to_time() for more info.
+  SYNOPSIS
+    TIME_to_timestamp()
+      thd             - current thread
+      t               - datetime in broken-down representation, 
+      error_code      - 0, if the conversion was successful;
+                        ER_WARN_DATA_OUT_OF_RANGE, if t contains a datetime
+                           value which is out of TIMESTAMP range;
+                        ER_WARN_INVALID_TIMESTAMP, if t represents a value which
+                           doesn't exist (falls into the spring time-gap).
+   
+  RETURN
+     Number of seconds in UTC since start of Unix Epoch corresponding to t.
+     0 - in case of ER_WARN_DATA_OUT_OF_RANGE
 */
-bool
-str_to_time_with_warn(CHARSET_INFO *cs,
-                      const char *str, uint length, MYSQL_TIME *l_time)
+
+my_time_t TIME_to_timestamp(THD *thd, const MYSQL_TIME *t, uint *error_code)
 {
-  int warning;
-  bool ret_val= str_to_time(str, length, l_time, &warning);
-  if (ret_val || warning)
-    make_truncated_value_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
-                                 str, length, MYSQL_TIMESTAMP_TIME, NullS);
-  return ret_val;
+  thd->time_zone_used= 1;
+  return thd->variables.time_zone->TIME_to_gmt_sec(t, error_code);
 }
 
 
@@ -725,11 +783,6 @@ KNOWN_DATE_TIME_FORMAT known_date_time_formats[6]=
 };
 
 
-/*
-   Return format string according format name.
-   If name is unknown, result is NULL
-*/
-
 const char *get_date_time_format_str(KNOWN_DATE_TIME_FORMAT *format,
 				     timestamp_type type)
 {
@@ -746,60 +799,15 @@ const char *get_date_time_format_str(KNOWN_DATE_TIME_FORMAT *format,
   }
 }
 
-/****************************************************************************
-  Functions to create default time/date/datetime strings
- 
-  NOTE:
-    For the moment the DATE_TIME_FORMAT argument is ignored becasue
-    MySQL doesn't support comparing of date/time/datetime strings that
-    are not in arbutary order as dates are compared as strings in some
-    context)
-    This functions don't check that given MYSQL_TIME structure members are
-    in valid range. If they are not, return value won't reflect any 
-    valid date either. Additionally, make_time doesn't take into
-    account time->day member: it's assumed that days have been converted
-    to hours already.
-****************************************************************************/
-
-void make_time(const DATE_TIME_FORMAT *format __attribute__((unused)),
-               const MYSQL_TIME *l_time, String *str)
-{
-  uint length= (uint) my_time_to_str(l_time, (char*) str->ptr());
-  str->length(length);
-  str->set_charset(&my_charset_numeric);
-}
-
-
-void make_date(const DATE_TIME_FORMAT *format __attribute__((unused)),
-               const MYSQL_TIME *l_time, String *str)
-{
-  uint length= (uint) my_date_to_str(l_time, (char*) str->ptr());
-  str->length(length);
-  str->set_charset(&my_charset_numeric);
-}
-
-
-void make_datetime(const DATE_TIME_FORMAT *format __attribute__((unused)),
-                   const MYSQL_TIME *l_time, String *str)
-{
-  uint length= (uint) my_datetime_to_str(l_time, (char*) str->ptr());
-  str->length(length);
-  str->set_charset(&my_charset_numeric);
-}
-
-
-void make_truncated_value_warning(THD *thd, MYSQL_ERROR::enum_warning_level level,
-                                  const char *str_val,
-				  uint str_length, timestamp_type time_type,
+void make_truncated_value_warning(THD *thd,
+                                  MYSQL_ERROR::enum_warning_level level,
+                                  const ErrConv *sval,
+				  timestamp_type time_type,
                                   const char *field_name)
 {
   char warn_buff[MYSQL_ERRMSG_SIZE];
   const char *type_str;
   CHARSET_INFO *cs= &my_charset_latin1;
-  char buff[128];
-  String str(buff,(uint32) sizeof(buff), system_charset_info);
-  str.copy(str_val, str_length, system_charset_info);
-  str[str_length]= 0;               // Ensure we have end 0 for snprintf
 
   switch (time_type) {
     case MYSQL_TIMESTAMP_DATE: 
@@ -816,32 +824,37 @@ void make_truncated_value_warning(THD *thd, MYSQL_ERROR::enum_warning_level leve
   if (field_name)
     cs->cset->snprintf(cs, warn_buff, sizeof(warn_buff),
                        ER(ER_TRUNCATED_WRONG_VALUE_FOR_FIELD),
-                       type_str, str.c_ptr(), field_name,
+                       type_str, sval->ptr(), field_name,
                        (ulong) thd->warning_info->current_row_for_warning());
   else
   {
     if (time_type > MYSQL_TIMESTAMP_ERROR)
       cs->cset->snprintf(cs, warn_buff, sizeof(warn_buff),
                          ER(ER_TRUNCATED_WRONG_VALUE),
-                         type_str, str.c_ptr());
+                         type_str, sval->ptr());
     else
       cs->cset->snprintf(cs, warn_buff, sizeof(warn_buff),
-                         ER(ER_WRONG_VALUE), type_str, str.c_ptr());
+                         ER(ER_WRONG_VALUE), type_str, sval->ptr());
   }
   push_warning(thd, level,
                ER_TRUNCATED_WRONG_VALUE, warn_buff);
 }
 
+
 /* Daynumber from year 0 to 9999-12-31 */
 #define MAX_DAY_NUMBER 3652424L
-
-bool date_add_interval(MYSQL_TIME *ltime, interval_type int_type, INTERVAL interval)
+#define COMBINE(X)                                                      \
+               (((((X)->day * 24LL + (X)->hour) * 60LL +                \
+                   (X)->minute) * 60LL + (X)->second)*1000000LL +       \
+                   (X)->second_part)
+#define GET_PART(X, N) X % N ## LL; X/= N ## LL
+
+bool date_add_interval(MYSQL_TIME *ltime, interval_type int_type,
+                       INTERVAL interval)
 {
   long period, sign;
 
-  ltime->neg= 0;
-
-  sign= (interval.neg ? -1 : 1);
+  sign= (interval.neg == ltime->neg ? 1 : -1);
 
   switch (int_type) {
   case INTERVAL_SECOND:
@@ -858,35 +871,43 @@ bool date_add_interval(MYSQL_TIME *ltime, interval_type int_type, INTERVAL inter
   case INTERVAL_DAY_SECOND:
   case INTERVAL_DAY_MINUTE:
   case INTERVAL_DAY_HOUR:
+  case INTERVAL_DAY:
   {
-    longlong sec, days, daynr, microseconds, extra_sec;
-    ltime->time_type= MYSQL_TIMESTAMP_DATETIME; // Return full date
-    microseconds= ltime->second_part + sign*interval.second_part;
-    extra_sec= microseconds/1000000L;
-    microseconds= microseconds%1000000L;
-
-    sec=((ltime->day-1)*3600*24L+ltime->hour*3600+ltime->minute*60+
-	 ltime->second +
-	 sign* (longlong) (interval.day*3600*24L +
-                           interval.hour*LL(3600)+interval.minute*LL(60)+
-                           interval.second))+ extra_sec;
-    if (microseconds < 0)
+    longlong usec, daynr;
+    my_bool neg= 0;
+    enum enum_mysql_timestamp_type time_type= ltime->time_type;
+
+    if (time_type != MYSQL_TIMESTAMP_TIME)
+      ltime->day+= calc_daynr(ltime->year, ltime->month, 1) - 1;
+
+    usec= COMBINE(ltime) + sign*COMBINE(&interval);
+
+    if (usec < 0)
     {
-      microseconds+= LL(1000000);
-      sec--;
+      neg= 1;
+      usec= -usec;
     }
-    days= sec/(3600*LL(24));
-    sec-= days*3600*LL(24);
-    if (sec < 0)
+
+    ltime->second_part= GET_PART(usec, 1000000);
+    ltime->second= GET_PART(usec, 60);
+    ltime->minute= GET_PART(usec, 60);
+    ltime->neg^= neg;
+
+    if (time_type == MYSQL_TIMESTAMP_TIME)
     {
-      days--;
-      sec+= 3600*LL(24);
+      if (usec > TIME_MAX_HOUR)
+        goto invalid_date;
+      ltime->hour= static_cast<uint>(usec);
+      ltime->day= 0;
+      return 0;
     }
-    ltime->second_part= (uint) microseconds;
-    ltime->second= (uint) (sec % 60);
-    ltime->minute= (uint) (sec/60 % 60);
-    ltime->hour=   (uint) (sec/3600);
-    daynr= calc_daynr(ltime->year,ltime->month,1) + days;
+
+    if (int_type != INTERVAL_DAY)
+      ltime->time_type= MYSQL_TIMESTAMP_DATETIME; // Return full date
+
+    ltime->hour= GET_PART(usec, 24);
+    daynr= usec;
+
     /* Day number from year 0 to 9999-12-31 */
     if ((ulonglong) daynr > MAX_DAY_NUMBER)
       goto invalid_date;
@@ -894,7 +915,6 @@ bool date_add_interval(MYSQL_TIME *ltime, interval_type int_type, INTERVAL inter
                         &ltime->day);
     break;
   }
-  case INTERVAL_DAY:
   case INTERVAL_WEEK:
     period= (calc_daynr(ltime->year,ltime->month,ltime->day) +
              sign * (long) interval.day);
@@ -932,13 +952,15 @@ bool date_add_interval(MYSQL_TIME *ltime, interval_type int_type, INTERVAL inter
     goto null_date;
   }
 
-  return 0;					// Ok
+  if (ltime->time_type != MYSQL_TIMESTAMP_TIME)
+    return 0;                                   // Ok
 
 invalid_date:
   push_warning_printf(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
                       ER_DATETIME_FUNCTION_OVERFLOW,
                       ER(ER_DATETIME_FUNCTION_OVERFLOW),
-                      "datetime");
+                      ltime->time_type == MYSQL_TIMESTAMP_TIME ?
+                      "time" : "datetime");
 null_date:
   return 1;
 }
@@ -1038,19 +1060,14 @@ calc_time_diff(MYSQL_TIME *l_time1, MYSQL_TIME *l_time2, int l_sign, longlong *s
 
 int my_time_compare(MYSQL_TIME *a, MYSQL_TIME *b)
 {
-  ulonglong a_t= TIME_to_ulonglong_datetime(a);
-  ulonglong b_t= TIME_to_ulonglong_datetime(b);
+  ulonglong a_t= pack_time(a);
+  ulonglong b_t= pack_time(b);
 
   if (a_t < b_t)
     return -1;
   if (a_t > b_t)
     return 1;
 
-  if (a->second_part < b->second_part)
-    return -1;
-  if (a->second_part > b->second_part)
-    return 1;
-
   return 0;
 }
 
diff --git a/sql/sql_time.h b/sql/sql_time.h
index 47e1a2b4843..937d10f5b74 100644
--- a/sql/sql_time.h
+++ b/sql/sql_time.h
@@ -34,17 +34,36 @@ typedef struct st_known_date_time_format KNOWN_DATE_TIME_FORMAT;
 ulong convert_period_to_month(ulong period);
 ulong convert_month_to_period(ulong month);
 void get_date_from_daynr(long daynr,uint *year, uint *month, uint *day);
-my_time_t TIME_to_timestamp(THD *thd, const MYSQL_TIME *t, my_bool *not_exist);
+my_time_t TIME_to_timestamp(THD *thd, const MYSQL_TIME *t, uint *error_code);
 bool str_to_time_with_warn(CHARSET_INFO *cs, const char *str, uint length,
-                           MYSQL_TIME *l_time);
+                           MYSQL_TIME *l_time, ulong fuzzydate);
 timestamp_type str_to_datetime_with_warn(CHARSET_INFO *cs, const char *str,
                                          uint length, MYSQL_TIME *l_time,
-                                         uint flags);
-void make_truncated_value_warning(THD *thd,
-                                  MYSQL_ERROR::enum_warning_level level,
-                                  const char *str_val, uint str_length,
+                                         ulong flags);
+bool double_to_datetime_with_warn(double value, MYSQL_TIME *ltime,
+                                  ulong fuzzydate,
+                                  const char *name);
+bool decimal_to_datetime_with_warn(const my_decimal *value, MYSQL_TIME *ltime,
+                                   ulong fuzzydate,
+                                   const char *name);
+bool int_to_datetime_with_warn(longlong value, MYSQL_TIME *ltime,
+                               ulong fuzzydate,
+                               const char *name);
+
+void make_truncated_value_warning(THD *thd, MYSQL_ERROR::enum_warning_level level,
+                                  const ErrConv *str_val,
                                   timestamp_type time_type,
                                   const char *field_name);
+
+static inline void make_truncated_value_warning(THD *thd,
+                MYSQL_ERROR::enum_warning_level level, const char *str_val,
+                uint str_length, timestamp_type time_type,
+                const char *field_name)
+{
+  const ErrConvString str(str_val, str_length, &my_charset_bin);
+  make_truncated_value_warning(thd, level, &str, time_type, field_name);
+}
+
 extern DATE_TIME_FORMAT *date_time_format_make(timestamp_type format_type,
 					       const char *format_str,
 					       uint format_length);
@@ -52,13 +71,6 @@ extern DATE_TIME_FORMAT *date_time_format_copy(THD *thd,
 					       DATE_TIME_FORMAT *format);
 const char *get_date_time_format_str(KNOWN_DATE_TIME_FORMAT *format,
 				     timestamp_type type);
-void make_date(const DATE_TIME_FORMAT *format, const MYSQL_TIME *l_time,
-               String *str);
-void make_time(const DATE_TIME_FORMAT *format, const MYSQL_TIME *l_time,
-               String *str);
-void make_datetime(const DATE_TIME_FORMAT *format, const MYSQL_TIME *l_time,
-                   String *str);
-
 /* MYSQL_TIME operations */
 bool date_add_interval(MYSQL_TIME *ltime, interval_type int_type,
                        INTERVAL interval);
@@ -74,8 +86,8 @@ bool parse_date_time_format(timestamp_type format_type,
                             const char *format, uint format_length,
                             DATE_TIME_FORMAT *date_time_format);
 /* Character set-aware version of str_to_time() */
-bool str_to_time(CHARSET_INFO *cs, const char *str,uint length,
-                 MYSQL_TIME *l_time, int *warning);
+timestamp_type str_to_time(CHARSET_INFO *cs, const char *str,uint length,
+                 MYSQL_TIME *l_time, ulong fuzzydate, int *warning);
 /* Character set-aware version of str_to_datetime() */
 timestamp_type str_to_datetime(CHARSET_INFO *cs,
                                const char *str, uint length,
diff --git a/sql/sql_trigger.cc b/sql/sql_trigger.cc
index 756046ebc96..526bc9f65bd 100644
--- a/sql/sql_trigger.cc
+++ b/sql/sql_trigger.cc
@@ -874,7 +874,7 @@ bool Table_triggers_list::create_trigger(THD *thd, TABLE_LIST *tables,
 
   stmt_query->append(stmt_definition.str, stmt_definition.length);
 
-  trg_def->str= stmt_query->c_ptr();
+  trg_def->str= stmt_query->c_ptr_safe();
   trg_def->length= stmt_query->length();
 
   /* Create trigger definition file. */
@@ -1111,10 +1111,7 @@ void Table_triggers_list::set_table(TABLE *new_table)
 {
   trigger_table= new_table;
   for (Field **field= new_table->triggers->record1_field ; *field ; field++)
-  {
-    (*field)->table= (*field)->orig_table= new_table;
-    (*field)->table_name= &new_table->alias;
-  }
+    (*field)->init(new_table);
 }
 
 
diff --git a/sql/sql_trigger.h b/sql/sql_trigger.h
index c98f5d72a58..c2afa900b74 100644
--- a/sql/sql_trigger.h
+++ b/sql/sql_trigger.h
@@ -52,7 +52,7 @@ enum trg_action_time_type
 /**
   This class holds all information about triggers of table.
 
-  QQ: Will it be merged into TABLE in the future ?
+  TODO: Will it be merged into TABLE in the future ?
 */
 
 class Table_triggers_list: public Sql_alloc
diff --git a/sql/sql_union.cc b/sql/sql_union.cc
index 3382eba260c..200140b5f6a 100644
--- a/sql/sql_union.cc
+++ b/sql/sql_union.cc
@@ -52,9 +52,8 @@ int select_union::prepare(List<Item> &list, SELECT_LEX_UNIT *u)
 }
 
 
-bool select_union::send_data(List<Item> &values)
+int select_union::send_data(List<Item> &values)
 {
-  int error= 0;
   if (unit->offset_limit_cnt)
   {						// using limit offset,count
     unit->offset_limit_cnt--;
@@ -64,14 +63,22 @@ bool select_union::send_data(List<Item> &values)
   if (thd->is_error())
     return 1;
 
-  if ((error= table->file->ha_write_row(table->record[0])))
+  if ((write_err= table->file->ha_write_tmp_row(table->record[0])))
   {
+    if (write_err == HA_ERR_FOUND_DUPP_KEY)
+    {
+      /*
+        Inform upper level that we found a duplicate key, that should not
+        be counted as part of limit
+      */
+      return -1;
+    }
     /* create_internal_tmp_table_from_heap will generate error if needed */
-    if (table->file->is_fatal_error(error, HA_CHECK_DUP) &&
+    if (table->file->is_fatal_error(write_err, HA_CHECK_DUP) &&
         create_internal_tmp_table_from_heap(thd, table,
                                             tmp_table_param.start_recinfo, 
-                                            &tmp_table_param.recinfo, error,
-                                            1))
+                                            &tmp_table_param.recinfo,
+                                            write_err, 1))
       return 1;
   }
   return 0;
@@ -108,6 +115,7 @@ bool select_union::flush()
       options            create options
       table_alias        name of the temporary table
       bit_fields_as_long convert bit fields to ulonglong
+      create_table       whether to physically create result table
 
   DESCRIPTION
     Create a temporary table that is used to store the result of a UNION,
@@ -122,7 +130,7 @@ bool
 select_union::create_result_table(THD *thd_arg, List<Item> *column_types,
                                   bool is_union_distinct, ulonglong options,
                                   const char *alias,
-                                   bool bit_fields_as_long)
+                                  bool bit_fields_as_long, bool create_table)
 {
   DBUG_ASSERT(table == 0);
   tmp_table_param.init();
@@ -131,10 +139,19 @@ select_union::create_result_table(THD *thd_arg, List<Item> *column_types,
 
   if (! (table= create_tmp_table(thd_arg, &tmp_table_param, *column_types,
                                  (ORDER*) 0, is_union_distinct, 1,
-                                 options, HA_POS_ERROR, alias)))
+                                 options, HA_POS_ERROR, alias,
+                                 !create_table)))
     return TRUE;
-  table->file->extra(HA_EXTRA_WRITE_CACHE);
-  table->file->extra(HA_EXTRA_IGNORE_DUP_KEY);
+
+  table->keys_in_use_for_query.clear_all();
+  for (uint i=0; i < table->s->fields; i++)
+    table->field[i]->flags &= ~PART_KEY_FLAG;
+
+  if (create_table)
+  {
+    table->file->extra(HA_EXTRA_WRITE_CACHE);
+    table->file->extra(HA_EXTRA_IGNORE_DUP_KEY);
+  }
   return FALSE;
 }
 
@@ -188,6 +205,8 @@ st_select_lex_unit::init_prepare_fake_select_lex(THD *thd_arg)
   {
     (*order->item)->walk(&Item::change_context_processor, 0,
                          (uchar*) &fake_select_lex->context);
+    (*order->item)->walk(&Item::set_fake_select_as_master_processor, 0,
+                         (uchar*) fake_select_lex);
   }
 }
 
@@ -272,6 +291,18 @@ bool st_select_lex_unit::prepare(THD *thd_arg, select_result *sel_result,
 
     can_skip_order_by= is_union_select && !(sl->braces && sl->explicit_limit);
 
+    /*
+      Remove all references from the select_lex_units to the subqueries that
+      are inside the ORDER BY clause.
+    */
+    if (can_skip_order_by)
+    {
+      for (ORDER *ord= (ORDER *)sl->order_list.first; ord; ord= ord->next)
+      {
+        (*ord->item)->walk(&Item::eliminate_subselect_processor, FALSE, NULL);
+      }
+    }
+
     saved_error= join->prepare(&sl->ref_pointer_array,
                                sl->table_list.first,
                                sl->with_wild,
@@ -286,6 +317,7 @@ bool st_select_lex_unit::prepare(THD *thd_arg, select_result *sel_result,
                                (is_union_select ? NULL :
                                 thd_arg->lex->proc_list.first),
                                sl, this);
+
     /* There are no * in the statement anymore (for PS) */
     sl->with_wild= 0;
     last_procedure= join->procedure;
@@ -340,6 +372,9 @@ bool st_select_lex_unit::prepare(THD *thd_arg, select_result *sel_result,
     List_iterator_fast<Item> tp(types);
     Item *type;
     ulonglong create_options;
+    uint save_tablenr= 0;
+    table_map save_map= 0;
+    uint save_maybe_null= 0;
 
     while ((type= tp++))
     {
@@ -392,12 +427,24 @@ bool st_select_lex_unit::prepare(THD *thd_arg, select_result *sel_result,
       create_options= create_options | TMP_TABLE_FORCE_MYISAM;
 
     if (union_result->create_result_table(thd, &types, test(union_distinct),
-                                          create_options, "", FALSE))
+                                          create_options, "", FALSE, TRUE))
       goto err;
+    if (fake_select_lex && !fake_select_lex->first_cond_optimization)
+    {
+      save_tablenr= result_table_list.tablenr_exec;
+      save_map= result_table_list.map_exec;
+      save_maybe_null= result_table_list.maybe_null_exec;
+    }
     bzero((char*) &result_table_list, sizeof(result_table_list));
     result_table_list.db= (char*) "";
     result_table_list.table_name= result_table_list.alias= (char*) "union";
     result_table_list.table= table= union_result->table;
+    if (fake_select_lex && !fake_select_lex->first_cond_optimization)
+    {
+      result_table_list.tablenr_exec= save_tablenr;
+      result_table_list.map_exec= save_map;
+      result_table_list.maybe_null_exec= save_maybe_null;
+    }
 
     thd_arg->lex->current_select= lex_select_save;
     if (!item_list.elements)
@@ -462,18 +509,21 @@ err:
 }
 
 
-bool st_select_lex_unit::exec()
+/**
+  Run optimization phase.
+
+  @return FALSE unit successfully passed optimization phase.
+  @return TRUE an error occurred.
+*/
+bool st_select_lex_unit::optimize()
 {
   SELECT_LEX *lex_select_save= thd->lex->current_select;
   SELECT_LEX *select_cursor=first_select();
-  ulonglong add_rows=0;
-  ha_rows examined_rows= 0;
-  DBUG_ENTER("st_select_lex_unit::exec");
+  DBUG_ENTER("st_select_lex_unit::optimize");
 
-  if (executed && !uncacheable && !describe)
+  if (optimized && !uncacheable && !describe)
     DBUG_RETURN(FALSE);
-  executed= 1;
-  
+
   if (uncacheable || !item || !item->assigned() || describe)
   {
     if (item)
@@ -494,7 +544,6 @@ bool st_select_lex_unit::exec()
     }
     for (SELECT_LEX *sl= select_cursor; sl; sl= sl->next_select())
     {
-      ha_rows records_at_start= 0;
       thd->lex->current_select= sl;
 
       if (optimized)
@@ -521,6 +570,66 @@ bool st_select_lex_unit::exec()
         sl->join->select_options= 
           (select_limit_cnt == HA_POS_ERROR || sl->braces) ?
           sl->options & ~OPTION_FOUND_ROWS : sl->options | found_rows_for_union;
+
+	saved_error= sl->join->optimize();
+      }
+
+      if (saved_error)
+      {
+	thd->lex->current_select= lex_select_save;
+	DBUG_RETURN(saved_error);
+      }
+    }
+  }
+  optimized= 1;
+
+  thd->lex->current_select= lex_select_save;
+  DBUG_RETURN(saved_error);
+}
+
+
+bool st_select_lex_unit::exec()
+{
+  SELECT_LEX *lex_select_save= thd->lex->current_select;
+  SELECT_LEX *select_cursor=first_select();
+  ulonglong add_rows=0;
+  ha_rows examined_rows= 0;
+  DBUG_ENTER("st_select_lex_unit::exec");
+
+  if (executed && !uncacheable && !describe)
+    DBUG_RETURN(FALSE);
+  executed= 1;
+  
+  saved_error= optimize();
+
+  if (uncacheable || !item || !item->assigned() || describe)
+  {
+    for (SELECT_LEX *sl= select_cursor; sl; sl= sl->next_select())
+    {
+      ha_rows records_at_start= 0;
+      thd->lex->current_select= sl;
+
+      {
+        set_limit(sl);
+	if (sl == global_parameters || describe)
+	{
+	  offset_limit_cnt= 0;
+	  /*
+	    We can't use LIMIT at this stage if we are using ORDER BY for the
+	    whole query
+	  */
+	  if (sl->order_list.first || describe)
+	    select_limit_cnt= HA_POS_ERROR;
+        }
+
+        /*
+          When using braces, SQL_CALC_FOUND_ROWS affects the whole query:
+          we don't calculate found_rows() per union part.
+          Otherwise, SQL_CALC_FOUND_ROWS should be done on all sub parts.
+        */
+        sl->join->select_options= 
+          (select_limit_cnt == HA_POS_ERROR || sl->braces) ?
+          sl->options & ~OPTION_FOUND_ROWS : sl->options | found_rows_for_union;
 	saved_error= sl->join->optimize();
       }
       if (!saved_error)
@@ -573,7 +682,6 @@ bool st_select_lex_unit::exec()
       }
     }
   }
-  optimized= 1;
 
   /* Send result to 'result' */
   saved_error= TRUE;
@@ -695,7 +803,8 @@ bool st_select_lex_unit::cleanup()
     if ((join= fake_select_lex->join))
     {
       join->tables_list= 0;
-      join->tables= 0;
+      join->table_count= 0;
+      join->top_join_tab_count= 0;
     }
     error|= fake_select_lex->cleanup();
     /*
@@ -848,3 +957,27 @@ void st_select_lex::cleanup_all_joins(bool full)
     for (sl= unit->first_select(); sl; sl= sl->next_select())
       sl->cleanup_all_joins(full);
 }
+
+
+/**
+  Set exclude_from_table_unique_test to TRUE for every select of this
+  unit and, recursively, for the selects of all nested inner units.
+
+  @note Used to exclude materialized derived tables (views) from the
+  unique table check.
+*/
+
+void st_select_lex_unit::set_unique_exclude()
+{
+  for (SELECT_LEX *sl= first_select(); sl; sl= sl->next_select())
+  {
+    sl->exclude_from_table_unique_test= TRUE;
+    for (SELECT_LEX_UNIT *unit= sl->first_inner_unit();
+         unit;
+         unit= unit->next_unit())
+    {
+      unit->set_unique_exclude();
+    }
+  }
+}
+
diff --git a/sql/sql_update.cc b/sql/sql_update.cc
index 0f470062981..5cb8d085c33 100644
--- a/sql/sql_update.cc
+++ b/sql/sql_update.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2010, Oracle and/or its affiliates.
    Copyright (c) 2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
@@ -46,7 +46,7 @@
 
 /**
    True if the table's input and output record buffers are comparable using
-   compare_records(TABLE*).
+   compare_record(TABLE*).
  */
 bool records_are_comparable(const TABLE *table) {
   return ((table->file->ha_table_flags() & HA_PARTIAL_COLUMN_READ) == 0) ||
@@ -56,18 +56,13 @@ bool records_are_comparable(const TABLE *table) {
 
 /**
    Compares the input and outbut record buffers of the table to see if a row
-   has changed. The algorithm iterates over updated columns and if they are
-   nullable compares NULL bits in the buffer before comparing actual
-   data. Special care must be taken to compare only the relevant NULL bits and
-   mask out all others as they may be undefined. The storage engine will not
-   and should not touch them.
-
-   @param table The table to evaluate.
+   has changed.
 
    @return true if row has changed.
    @return false otherwise.
 */
-bool compare_records(const TABLE *table)
+
+bool compare_record(const TABLE *table)
 {
   DBUG_ASSERT(records_are_comparable(table));
 
@@ -104,7 +99,6 @@ bool compare_records(const TABLE *table)
      comparison done above.
   */ 
   if (table->s->can_cmp_whole_record)
-    // Fixed-size record: do bitwise comparison of the records 
     return cmp_record(table,record[1]);
   /* Compare null bits */
   if (memcmp(table->null_flags,
@@ -262,6 +256,7 @@ int mysql_update(THD *thd,
   bool		using_limit= limit != HA_POS_ERROR;
   bool		safe_update= test(thd->variables.option_bits & OPTION_SAFE_UPDATES);
   bool          used_key_is_modified= FALSE, transactional_table, will_batch;
+  bool		can_compare_record;
   int           res;
   int		error, loc_error;
   uint          used_index, dup_key_found;
@@ -285,7 +280,11 @@ int mysql_update(THD *thd,
   if (open_tables(thd, &table_list, &table_count, 0))
     DBUG_RETURN(1);
 
-  if (table_list->multitable_view)
+  //Prepare views so they are handled correctly.
+  if (mysql_handle_derived(thd->lex, DT_INIT))
+    DBUG_RETURN(1);
+
+  if (table_list->is_multitable())
   {
     DBUG_ASSERT(table_list->view != 0);
     DBUG_PRINT("info", ("Switch to multi-update"));
@@ -297,20 +296,19 @@ int mysql_update(THD *thd,
   if (lock_tables(thd, table_list, table_count, 0))
     DBUG_RETURN(1);
 
-  if (mysql_handle_derived(thd->lex, &mysql_derived_prepare))
+  if (table_list->handle_derived(thd->lex, DT_MERGE_FOR_INSERT))
     DBUG_RETURN(1);
-
-  if (thd->fill_derived_tables() &&
-      mysql_handle_derived(thd->lex, &mysql_derived_filling))
-  {
-    mysql_handle_derived(thd->lex, &mysql_derived_cleanup);
+  if (table_list->handle_derived(thd->lex, DT_PREPARE))
     DBUG_RETURN(1);
-  }
-  mysql_handle_derived(thd->lex, &mysql_derived_cleanup);
 
   thd_proc_info(thd, "init");
   table= table_list->table;
 
+  if (!table_list->updatable)
+  {
+    my_error(ER_NON_UPDATABLE_TABLE, MYF(0), table_list->alias, "UPDATE");
+    DBUG_RETURN(1);
+  }
   /* Calculate "table->covering_keys" based on the WHERE */
   table->covering_keys= table->s->keys_in_use;
   table->quick_keys.clear_all();
@@ -329,13 +327,17 @@ int mysql_update(THD *thd,
   table_list->grant.want_privilege= table->grant.want_privilege= want_privilege;
   table_list->register_want_access(want_privilege);
 #endif
+  /* 'Unfix' fields to allow correct marking by the setup_fields function. */
+  if (table_list->is_view())
+    unfix_fields(fields);
+
   if (setup_fields_with_no_wrap(thd, 0, fields, MARK_COLUMNS_WRITE, 0, 0))
     DBUG_RETURN(1);                     /* purecov: inspected */
   if (table_list->view && check_fields(thd, fields))
   {
     DBUG_RETURN(1);
   }
-  if (!table_list->updatable || check_key_in_view(thd, table_list))
+  if (check_key_in_view(thd, table_list))
   {
     my_error(ER_NON_UPDATABLE_TABLE, MYF(0), table_list->alias, "UPDATE");
     DBUG_RETURN(1);
@@ -365,6 +367,10 @@ int mysql_update(THD *thd,
     DBUG_RETURN(1);				/* purecov: inspected */
   }
 
+  /* Apply the IN=>EXISTS transformation to all subqueries and optimize them. */
+  if (select_lex->optimize_unflattened_subqueries())
+    DBUG_RETURN(TRUE);
+
   if (select_lex->inner_refs_list.elements &&
     fix_inner_refs(thd, all_fields, select_lex, select_lex->ref_pointer_array))
     DBUG_RETURN(1);
@@ -659,6 +665,13 @@ int mysql_update(THD *thd,
   if (table->file->ha_table_flags() & HA_PARTIAL_COLUMN_READ)
     table->prepare_for_position();
 
+  /*
+    We can use compare_record() to optimize away updates if
+    the table handler is returning all columns OR
+    if all updated columns are read
+  */
+  can_compare_record= records_are_comparable(table);
+
   while (!(error=info.read_record(&info)) && !thd->killed)
   {
     update_virtual_fields(thd, table);
@@ -676,7 +689,7 @@ int mysql_update(THD *thd,
 
       found++;
 
-      if (!records_are_comparable(table) || compare_records(table))
+      if (!can_compare_record || compare_record(table))
       {
         if ((res= table_list->view_check_option(thd, ignore)) !=
             VIEW_CHECK_OK)
@@ -924,6 +937,11 @@ int mysql_update(THD *thd,
   }
   thd->count_cuted_fields= CHECK_FIELD_IGNORE;		/* calc cuted fields */
   thd->abort_on_warning= 0;
+  if (thd->lex->current_select->first_cond_optimization)
+  {
+    thd->lex->current_select->save_leaf_tables(thd);
+    thd->lex->current_select->first_cond_optimization= 0;
+  }
   *found_return= found;
   *updated_return= updated;
   DBUG_RETURN((error >= 0 || thd->is_error()) ? 1 : 0);
@@ -973,8 +991,8 @@ bool mysql_prepare_update(THD *thd, TABLE_LIST *table_list,
   if (setup_tables_and_check_access(thd, &select_lex->context, 
                                     &select_lex->top_join_list,
                                     table_list,
-                                    &select_lex->leaf_tables,
-                                    FALSE, UPDATE_ACL, SELECT_ACL) ||
+                                    select_lex->leaf_tables,
+                                    FALSE, UPDATE_ACL, SELECT_ACL, TRUE) ||
       setup_conds(thd, table_list, select_lex->leaf_tables, conds) ||
       select_lex->setup_ref_array(thd, order_num) ||
       setup_order(thd, select_lex->ref_pointer_array,
@@ -1010,8 +1028,8 @@ static table_map get_table_map(List<Item> *items)
   Item_field *item;
   table_map map= 0;
 
-  while ((item= (Item_field *) item_it++)) 
-    map|= item->used_tables();
+  while ((item= (Item_field *) item_it++))
+    map|= item->all_used_tables();
   DBUG_PRINT("info", ("table_map: 0x%08lx", (long) map));
   return map;
 }
@@ -1046,11 +1064,12 @@ static table_map get_table_map(List<Item> *items)
     false  otherwise.
 */
 static
-bool unsafe_key_update(TABLE_LIST *leaves, table_map tables_for_update)
+bool unsafe_key_update(List<TABLE_LIST> leaves, table_map tables_for_update)
 {
-  TABLE_LIST *tl= leaves;
+  List_iterator_fast<TABLE_LIST> it(leaves), it2(leaves);
+  TABLE_LIST *tl, *tl2;
 
-  for (tl= leaves; tl ; tl= tl->next_leaf)
+  while ((tl= it++))
   {
     if (tl->table->map & tables_for_update)
     {
@@ -1066,14 +1085,16 @@ bool unsafe_key_update(TABLE_LIST *leaves, table_map tables_for_update)
       if (!table_partitioned && !primkey_clustered)
         continue;
 
-      for (TABLE_LIST* tl2= tl->next_leaf; tl2 ; tl2= tl2->next_leaf)
+      it2.rewind();
+      while ((tl2= it2++))
       {
         /*
           Look at "next" tables only since all previous tables have
           already been checked
         */
         TABLE *table2= tl2->table;
-        if (table2->map & tables_for_update && table1->s == table2->s)
+        if (tl2 != tl &&
+            table2->map & tables_for_update && table1->s == table2->s)
         {
           // A table is updated through two aliases
           if (table_partitioned &&
@@ -1135,7 +1156,7 @@ int mysql_multi_update_prepare(THD *thd)
 {
   LEX *lex= thd->lex;
   TABLE_LIST *table_list= lex->query_tables;
-  TABLE_LIST *tl, *leaves;
+  TABLE_LIST *tl;
   List<Item> *fields= &lex->select_lex.item_list;
   table_map tables_for_update;
   bool update_view= 0;
@@ -1163,7 +1184,7 @@ int mysql_multi_update_prepare(THD *thd)
        open_tables(thd, &table_list, &table_count,
                    (thd->stmt_arena->is_stmt_prepare() ?
                     MYSQL_OPEN_FORCE_SHARED_MDL : 0))) ||
-      mysql_handle_derived(lex, &mysql_derived_prepare))
+      mysql_handle_derived(lex, DT_INIT))
     DBUG_RETURN(TRUE);
   /*
     setup_tables() need for VIEWs. JOIN::prepare() will call setup_tables()
@@ -1171,11 +1192,20 @@ int mysql_multi_update_prepare(THD *thd)
     call in setup_tables()).
   */
 
+  //We need to merge for insert prior to prepare.
+  if (mysql_handle_derived(lex, DT_MERGE_FOR_INSERT))
+    DBUG_RETURN(TRUE);
+  if (mysql_handle_derived(lex, DT_PREPARE))
+    DBUG_RETURN(TRUE);
+
   if (setup_tables_and_check_access(thd, &lex->select_lex.context,
                                     &lex->select_lex.top_join_list,
                                     table_list,
-                                    &lex->select_lex.leaf_tables, FALSE,
-                                    UPDATE_ACL, SELECT_ACL))
+                                    lex->select_lex.leaf_tables, FALSE,
+                                    UPDATE_ACL, SELECT_ACL, FALSE))
+    DBUG_RETURN(TRUE);
+
+  if (lex->select_lex.handle_derived(thd->lex, DT_MERGE))  
     DBUG_RETURN(TRUE);
 
   if (setup_fields_with_no_wrap(thd, 0, *fields, MARK_COLUMNS_WRITE, 0, 0))
@@ -1197,15 +1227,14 @@ int mysql_multi_update_prepare(THD *thd)
 
   thd->table_map_for_update= tables_for_update= get_table_map(fields);
 
-  leaves= lex->select_lex.leaf_tables;
-
-  if (unsafe_key_update(leaves, tables_for_update))
+  if (unsafe_key_update(lex->select_lex.leaf_tables, tables_for_update))
     DBUG_RETURN(true);
 
   /*
     Setup timestamp handling and locking mode
   */
-  for (tl= leaves; tl; tl= tl->next_leaf)
+  List_iterator<TABLE_LIST> ti(lex->select_lex.leaf_tables);
+  while ((tl= ti++))
   {
     TABLE *table= tl->table;
     /* Only set timestamp column if this is not modified */
@@ -1252,7 +1281,7 @@ int mysql_multi_update_prepare(THD *thd)
   for (tl= table_list; tl; tl= tl->next_local)
   {
     /* Check access privileges for table */
-    if (!tl->derived)
+    if (!tl->is_derived())
     {
       uint want_privilege= tl->updating ? UPDATE_ACL : SELECT_ACL;
       if (check_access(thd, want_privilege, tl->db,
@@ -1267,7 +1296,7 @@ int mysql_multi_update_prepare(THD *thd)
   /* check single table update for view compound from several tables */
   for (tl= table_list; tl; tl= tl->next_local)
   {
-    if (tl->effective_algorithm == VIEW_ALGORITHM_MERGE)
+    if (tl->is_merged_derived())
     {
       TABLE_LIST *for_update= 0;
       if (tl->check_single_table(&for_update, tables_for_update, tl))
@@ -1293,7 +1322,8 @@ int mysql_multi_update_prepare(THD *thd)
   */
   lex->select_lex.exclude_from_table_unique_test= TRUE;
   /* We only need SELECT privilege for columns in the values list */
-  for (tl= leaves; tl; tl= tl->next_leaf)
+  ti.rewind();
+  while ((tl= ti++))
   {
     TABLE *table= tl->table;
     TABLE_LIST *tlist;
@@ -1321,15 +1351,10 @@ int mysql_multi_update_prepare(THD *thd)
     further check in multi_update::prepare whether to use record cache.
   */
   lex->select_lex.exclude_from_table_unique_test= FALSE;
- 
-  if (thd->fill_derived_tables() &&
-      mysql_handle_derived(lex, &mysql_derived_filling))
-  {
-    mysql_handle_derived(lex, &mysql_derived_cleanup);
-    DBUG_RETURN(TRUE);
-  }
-  mysql_handle_derived(lex, &mysql_derived_cleanup);
 
+  if (lex->select_lex.save_prep_leaf_tables(thd))
+    DBUG_RETURN(TRUE);
+ 
   DBUG_RETURN (FALSE);
 }
 
@@ -1354,7 +1379,7 @@ bool mysql_multi_update(THD *thd,
   DBUG_ENTER("mysql_multi_update");
 
   if (!(*result= new multi_update(table_list,
-				 thd->lex->select_lex.leaf_tables,
+				 &thd->lex->select_lex.leaf_tables,
 				 fields, values,
 				 handle_duplicates, ignore)))
   {
@@ -1390,7 +1415,7 @@ bool mysql_multi_update(THD *thd,
 
 
 multi_update::multi_update(TABLE_LIST *table_list,
-			   TABLE_LIST *leaves_list,
+                           List<TABLE_LIST> *leaves_list,
 			   List<Item> *field_list, List<Item> *value_list,
 			   enum enum_duplicates handle_duplicates_arg,
                            bool ignore_arg)
@@ -1408,6 +1433,7 @@ multi_update::multi_update(TABLE_LIST *table_list,
 
 int multi_update::prepare(List<Item> &not_used_values,
 			  SELECT_LEX_UNIT *lex_unit)
+
 {
   TABLE_LIST *table_ref;
   SQL_I_List<TABLE_LIST> update;
@@ -1417,6 +1443,7 @@ int multi_update::prepare(List<Item> &not_used_values,
   List_iterator_fast<Item> value_it(*values);
   uint i, max_fields;
   uint leaf_table_count= 0;
+  List_iterator<TABLE_LIST> ti(*leaves);
   DBUG_ENTER("multi_update::prepare");
 
   thd->count_cuted_fields= CHECK_FIELD_WARN;
@@ -1436,7 +1463,7 @@ int multi_update::prepare(List<Item> &not_used_values,
     TABLE::tmp_set by pointing TABLE::read_set to it and then restore it after
     setup_fields().
   */
-  for (table_ref= leaves; table_ref; table_ref= table_ref->next_leaf)
+  while ((table_ref= ti++))
   {
     TABLE *table= table_ref->table;
     if (tables_to_update & table->map)
@@ -1454,7 +1481,8 @@ int multi_update::prepare(List<Item> &not_used_values,
 
   int error= setup_fields(thd, 0, *values, MARK_COLUMNS_READ, 0, 0);
 
-  for (table_ref= leaves; table_ref; table_ref= table_ref->next_leaf)
+  ti.rewind();
+  while ((table_ref= ti++))
   {
     TABLE *table= table_ref->table;
     if (tables_to_update & table->map)
@@ -1483,7 +1511,8 @@ int multi_update::prepare(List<Item> &not_used_values,
   */
 
   update.empty();
-  for (table_ref= leaves; table_ref; table_ref= table_ref->next_leaf)
+  ti.rewind();
+  while ((table_ref= ti++))
   {
     /* TODO: add support of view of join support */
     TABLE *table=table_ref->table;
@@ -1709,9 +1738,9 @@ loop_end:
     {
       table_map unupdated_tables= table_ref->check_option->used_tables() &
                                   ~first_table_for_update->map;
-      for (TABLE_LIST *tbl_ref =leaves;
-           unupdated_tables && tbl_ref;
-           tbl_ref= tbl_ref->next_leaf)
+      List_iterator<TABLE_LIST> ti(*leaves);
+      TABLE_LIST *tbl_ref;
+      while ((tbl_ref= ti++) && unupdated_tables)
       {
         if (unupdated_tables & tbl_ref->table->map)
           unupdated_tables&= ~tbl_ref->table->map;
@@ -1737,7 +1766,8 @@ loop_end:
     do
     {
       Field_string *field= new Field_string(tbl->file->ref_length, 0,
-                                            tbl->alias, &my_charset_bin);
+                                            tbl->alias.c_ptr(),
+                                            &my_charset_bin);
       if (!field)
         DBUG_RETURN(1);
       field->init(tbl);
@@ -1809,7 +1839,7 @@ multi_update::~multi_update()
 }
 
 
-bool multi_update::send_data(List<Item> &not_used_values)
+int multi_update::send_data(List<Item> &not_used_values)
 {
   TABLE_LIST *cur_table;
   DBUG_ENTER("multi_update::send_data");
@@ -1835,6 +1865,14 @@ bool multi_update::send_data(List<Item> &not_used_values)
 
     if (table == table_to_update)
     {
+      /*
+        We can use compare_record() to optimize away updates if
+        the table handler is returning all columns OR
+        if all updated columns are read
+      */
+      bool can_compare_record;
+      can_compare_record= records_are_comparable(table);
+
       table->status|= STATUS_UPDATED;
       store_record(table,record[1]);
       if (fill_record_n_invoke_before_triggers(thd, *fields_for_table[offset],
@@ -1849,7 +1887,7 @@ bool multi_update::send_data(List<Item> &not_used_values)
       */
       table->auto_increment_field_not_null= FALSE;
       found++;
-      if (!records_are_comparable(table) || compare_records(table))
+      if (!can_compare_record || compare_record(table))
       {
 	int error;
         if ((error= cur_table->view_check_option(thd, ignore)) !=
@@ -1946,7 +1984,7 @@ bool multi_update::send_data(List<Item> &not_used_values)
                   *values_for_table[offset], TRUE, FALSE);
 
       /* Write row, ignoring duplicated updates to a row */
-      error= tmp_table->file->ha_write_row(tmp_table->record[0]);
+      error= tmp_table->file->ha_write_tmp_row(tmp_table->record[0]);
       if (error != HA_ERR_FOUND_DUPP_KEY && error != HA_ERR_FOUND_DUPP_UNIQUE)
       {
         if (error &&
@@ -2040,6 +2078,7 @@ int multi_update::do_updates()
     DBUG_RETURN(0);
   for (cur_table= update_tables; cur_table; cur_table= cur_table->next_local)
   {
+    bool can_compare_record;
     uint offset= cur_table->shared;
 
     table = cur_table->table;
@@ -2088,6 +2127,8 @@ int multi_update::do_updates()
       goto err;
     }
 
+    can_compare_record= records_are_comparable(table);
+
     for (;;)
     {
       if (thd->killed && trans_safe)
@@ -2135,7 +2176,7 @@ int multi_update::do_updates()
                                             TRG_ACTION_BEFORE, TRUE))
         goto err2;
 
-      if (!records_are_comparable(table) || compare_records(table))
+      if (!can_compare_record || compare_record(table))
       {
         int error;
         if ((error= cur_table->view_check_option(thd, ignore)) !=
diff --git a/sql/sql_update.h b/sql/sql_update.h
index 50ff50f025d..9552ff0ab2d 100644
--- a/sql/sql_update.h
+++ b/sql/sql_update.h
@@ -39,6 +39,6 @@ bool mysql_multi_update(THD *thd, TABLE_LIST *table_list,
                         SELECT_LEX_UNIT *unit, SELECT_LEX *select_lex,
                         multi_update **result);
 bool records_are_comparable(const TABLE *table);
-bool compare_records(const TABLE *table);
+bool compare_record(const TABLE *table);
 
 #endif /* SQL_UPDATE_INCLUDED */
diff --git a/sql/sql_view.cc b/sql/sql_view.cc
index 0a8c2eac1a0..cb1c57ea0ba 100644
--- a/sql/sql_view.cc
+++ b/sql/sql_view.cc
@@ -1,4 +1,5 @@
-/* Copyright (c) 2004, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2004, 2011, Oracle and/or its affiliates.
+   Copyright (c) 2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -228,7 +229,7 @@ fill_defined_view_parts (THD *thd, TABLE_LIST *view)
     view->definer.user= decoy.definer.user;
     lex->definer= &view->definer;
   }
-  if (lex->create_view_algorithm == VIEW_ALGORITHM_UNDEFINED)
+  if (lex->create_view_algorithm == DTYPE_ALGORITHM_UNDEFINED)
     lex->create_view_algorithm= (uint8) decoy.algorithm;
   if (lex->create_view_suid == VIEW_SUID_DEFAULT)
     lex->create_view_suid= decoy.view_suid ? 
@@ -840,14 +841,13 @@ static int mysql_register_view(THD *thd, TABLE_LIST *view,
     ulong sql_mode= thd->variables.sql_mode & MODE_ANSI_QUOTES;
     thd->variables.sql_mode&= ~MODE_ANSI_QUOTES;
 
-    lex->unit.print(&view_query, QT_ORDINARY);
+    lex->unit.print(&view_query, QT_VIEW_INTERNAL);
     lex->unit.print(&is_query,
                     enum_query_type(QT_TO_SYSTEM_CHARSET | QT_WITHOUT_INTRODUCERS));
 
     thd->variables.sql_mode|= sql_mode;
   }
-  DBUG_PRINT("info",
-             ("View: %*.s", (int) view_query.length(), view_query.ptr()));
+  DBUG_PRINT("info", ("View: %s", view_query.c_ptr_safe()));
 
   /* fill structure */
   view->source= thd->lex->create_view_select;
@@ -875,7 +875,7 @@ static int mysql_register_view(THD *thd, TABLE_LIST *view,
   {
     push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN, ER_WARN_VIEW_MERGE,
                  ER(ER_WARN_VIEW_MERGE));
-    lex->create_view_algorithm= VIEW_ALGORITHM_UNDEFINED;
+    lex->create_view_algorithm= DTYPE_ALGORITHM_UNDEFINED;
   }
   view->algorithm= lex->create_view_algorithm;
   view->definer.user= lex->definer->user;
@@ -1481,7 +1481,7 @@ bool mysql_make_view(THD *thd, File_parser *parser, TABLE_LIST *table,
 
       List_iterator_fast<TABLE_LIST> ti(view_select->top_join_list);
 
-      table->effective_algorithm= VIEW_ALGORITHM_MERGE;
+      table->derived_type= VIEW_ALGORITHM_MERGE;
       DBUG_PRINT("info", ("algorithm: MERGE"));
       table->updatable= (table->updatable_view != 0);
       table->effective_with_check=
@@ -1495,74 +1495,31 @@ bool mysql_make_view(THD *thd, File_parser *parser, TABLE_LIST *table,
       /* prepare view context */
       lex->select_lex.context.resolve_in_table_list_only(view_main_select_tables);
       lex->select_lex.context.outer_context= 0;
-      lex->select_lex.context.select_lex= table->select_lex;
       lex->select_lex.select_n_having_items+=
         table->select_lex->select_n_having_items;
 
-      /*
-        Tables of the main select of the view should be marked as belonging
-        to the same select as original view (again we can use LEX::select_lex
-        for this purprose because we don't support MERGE algorithm for views
-        with unions).
-      */
-      for (tbl= lex->select_lex.get_table_list(); tbl; tbl= tbl->next_local)
-        tbl->select_lex= table->select_lex;
-
-      {
-        if (view_main_select_tables->next_local)
-        {
-          table->multitable_view= TRUE;
-          if (table->belong_to_view)
-           table->belong_to_view->multitable_view= TRUE;
-        }
-        /* make nested join structure for view tables */
-        NESTED_JOIN *nested_join;
-        if (!(nested_join= table->nested_join=
-              (NESTED_JOIN *) thd->calloc(sizeof(NESTED_JOIN))))
-          goto err;
-        nested_join->join_list= view_select->top_join_list;
-
-        /* re-nest tables of VIEW */
-        ti.rewind();
-        while ((tbl= ti++))
-        {
-          tbl->join_list= &nested_join->join_list;
-          tbl->embedding= table;
-        }
-      }
-
-      /* Store WHERE clause for post-processing in setup_underlying */
       table->where= view_select->where;
-      /*
-        Add subqueries units to SELECT into which we merging current view.
-        unit(->next)* chain starts with subqueries that are used by this
-        view and continues with subqueries that are used by other views.
-        We must not add any subquery twice (otherwise we'll form a loop),
-        to do this we remember in end_unit the first subquery that has
-        been already added.
-
-        NOTE: we do not support UNION here, so we take only one select
-      */
-      SELECT_LEX_NODE *end_unit= table->select_lex->slave;
-      SELECT_LEX_UNIT *next_unit;
-      for (SELECT_LEX_UNIT *unit= lex->select_lex.first_inner_unit();
-           unit;
-           unit= next_unit)
-      {
-        if (unit == end_unit)
-          break;
-        SELECT_LEX_NODE *save_slave= unit->slave;
-        next_unit= unit->next_unit();
-        unit->include_down(table->select_lex);
-        unit->slave= save_slave; // fix include_down initialisation
-      }
 
       /* 
         We can safely ignore the VIEW's ORDER BY if we merge into union 
         branch, as order is not important there.
       */
-      if (!table->select_lex->master_unit()->is_union())
+      if (!table->select_lex->master_unit()->is_union() &&
+          table->select_lex->order_list.elements == 0)
         table->select_lex->order_list.push_back(&lex->select_lex.order_list);
+      else
+      {
+        if (old_lex->sql_command == SQLCOM_SELECT &&
+            (old_lex->describe & DESCRIBE_EXTENDED) &&
+            lex->select_lex.order_list.elements &&
+            !table->select_lex->master_unit()->is_union())
+        {
+          push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_NOTE,
+                              ER_VIEW_ORDERBY_IGNORED,
+                              ER(ER_VIEW_ORDERBY_IGNORED),
+                              table->db, table->table_name);
+        }
+      }
       /*
 	This SELECT_LEX will be linked in global SELECT_LEX list
 	to make it processed by mysql_handle_derived(),
@@ -1572,23 +1529,22 @@ bool mysql_make_view(THD *thd, File_parser *parser, TABLE_LIST *table,
       goto ok;
     }
 
-    table->effective_algorithm= VIEW_ALGORITHM_TMPTABLE;
+    table->derived_type= VIEW_ALGORITHM_TMPTABLE;
     DBUG_PRINT("info", ("algorithm: TEMPORARY TABLE"));
     view_select->linkage= DERIVED_TABLE_TYPE;
     table->updatable= 0;
     table->effective_with_check= VIEW_CHECK_NONE;
     old_lex->subqueries= TRUE;
 
-    /* SELECT tree link */
-    lex->unit.include_down(table->select_lex);
-    lex->unit.slave= view_select; // fix include_down initialisation
-
     table->derived= &lex->unit;
   }
   else
     goto err;
 
 ok:
+  /* SELECT tree link */
+  lex->unit.include_down(table->select_lex);
+  lex->unit.slave= view_select; // fix include_down initialisation
   /* global SELECT list linking */
   end= view_select;	// primary SELECT_LEX is always last
   end->link_next= old_lex->all_selects_list;
@@ -1718,7 +1674,7 @@ bool mysql_drop_view(THD *thd, TABLE_LIST *views, enum_drop_mode drop_mode)
   }
   if (non_existant_views.length())
   {
-    my_error(ER_BAD_TABLE_ERROR, MYF(0), non_existant_views.c_ptr());
+    my_error(ER_BAD_TABLE_ERROR, MYF(0), non_existant_views.c_ptr_safe());
   }
 
   something_wrong= error || wrong_object_name || non_existant_views.length();
diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy
index e6246b42e18..93e2729c9fe 100644
--- a/sql/sql_yacc.yy
+++ b/sql/sql_yacc.yy
@@ -1,4 +1,5 @@
 /* Copyright (c) 2000, 2011 Oracle and/or its affiliates. All rights reserved.
+   Copyright (c) 2010, 2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -35,6 +36,7 @@
 #define YYINITDEPTH 100
 #define YYMAXDEPTH 3200                        /* Because of 64K stack */
 #define Lex (YYTHD->lex)
+
 #define Select Lex->current_select
 #include "sql_priv.h"
 #include "unireg.h"                    // REQUIRED: for other includes
@@ -149,7 +151,7 @@ void my_parse_error(const char *s)
     yytext= "";
 
   /* Push an error into the error stack */
-  ErrConvString err(yytext, thd->variables.character_set_client);
+  ErrConvString err(yytext, strlen(yytext), thd->variables.character_set_client);
   my_printf_error(ER_PARSE_ERROR,  ER(ER_PARSE_ERROR), MYF(0), s,
                   err.ptr(), lip->yylineno);
 }
@@ -775,6 +777,8 @@ static bool add_create_index (LEX *lex, Key::Keytype type,
   enum Foreign_key::fk_option m_fk_option;
   enum enum_yes_no_unknown m_yes_no_unk;
   Diag_condition_item_name diag_condition_item_name;
+  DYNCALL_CREATE_DEF *dyncol_def;
+  List<DYNCALL_CREATE_DEF> *dyncol_def_list;
 }
 
 %{
@@ -783,10 +787,10 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize);
 
 %pure_parser                                    /* We have threads */
 /*
-  Currently there are 168 shift/reduce conflicts.
+  Currently there are 171 shift/reduce conflicts.
   We should not introduce new conflicts any more.
 */
-%expect 168
+%expect 171
 
 /*
    Comments for TOKENS.
@@ -863,6 +867,7 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize);
 %token  CHANGED
 %token  CHARSET
 %token  CHAR_SYM                      /* SQL-2003-R */
+%token  CHECKPOINT_SYM
 %token  CHECKSUM_SYM
 %token  CHECK_SYM                     /* SQL-2003-R */
 %token  CIPHER_SYM
@@ -875,6 +880,12 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize);
 %token  COLLATE_SYM                   /* SQL-2003-R */
 %token  COLLATION_SYM                 /* SQL-2003-N */
 %token  COLUMNS
+%token  COLUMN_ADD_SYM
+%token  COLUMN_CREATE_SYM
+%token  COLUMN_DELETE_SYM
+%token  COLUMN_EXISTS_SYM
+%token  COLUMN_GET_SYM
+%token  COLUMN_LIST_SYM
 %token  COLUMN_SYM                    /* SQL-2003-R */
 %token  COLUMN_NAME_SYM               /* SQL-2003-N */
 %token  COMMENT_SYM
@@ -1156,6 +1167,7 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize);
 %token  ON                            /* SQL-2003-R */
 %token  ONE_SHOT_SYM
 %token  ONE_SYM
+%token  ONLINE_SYM
 %token  OPEN_SYM                      /* SQL-2003-R */
 %token  OPTIMIZE
 %token  OPTIONS_SYM
@@ -1408,6 +1420,8 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize);
 %token  YEAR_SYM                      /* SQL-2003-R */
 %token  ZEROFILL
 
+%token IMPOSSIBLE_ACTION		/* To avoid warning for yyerrlab1 */
+
 %left   JOIN_SYM INNER_SYM STRAIGHT_JOIN CROSS LEFT RIGHT
 /* A dummy token to force the priority of table_ref production in a join. */
 %left   TABLE_REF_PRIORITY
@@ -1460,6 +1474,8 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize);
         opt_natural_language_mode opt_query_expansion
         opt_ev_status opt_ev_on_completion ev_on_completion opt_ev_comment
         ev_alter_on_schedule_completion opt_ev_rename_to opt_ev_sql_stmt
+        optional_flush_tables_arguments opt_dyncol_type dyncol_type
+        opt_time_precision
 
 %type <m_yes_no_unk>
         opt_chain opt_release
@@ -1563,6 +1579,10 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize);
 
 %type <boolfunc2creator> comp_op
 
+%type <dyncol_def> dyncall_create_element
+
+%type <dyncol_def_list> dyncall_create_list
+
 %type <NONE>
         query verb_clause create change select do drop insert replace insert2
         insert_values update delete truncate rename
@@ -1613,6 +1633,7 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize);
         init_key_options normal_key_options normal_key_opts all_key_opt 
         spatial_key_options fulltext_key_options normal_key_opt 
         fulltext_key_opt spatial_key_opt fulltext_key_opts spatial_key_opts
+	keep_gcc_happy
         key_using_alg
         part_column_list
         server_def server_options_list server_option
@@ -1748,11 +1769,12 @@ statement:
         | help
         | insert
         | install
+	| keep_gcc_happy
+        | keycache
         | kill
         | load
         | lock
         | optimize
-        | keycache
         | parse_vcol_expr
         | partition_entry
         | preload
@@ -2117,7 +2139,7 @@ create:
         | CREATE
           {
             Lex->create_view_mode= VIEW_CREATE_NEW;
-            Lex->create_view_algorithm= VIEW_ALGORITHM_UNDEFINED;
+            Lex->create_view_algorithm= DTYPE_ALGORITHM_UNDEFINED;
             Lex->create_view_suid= TRUE;
           }
           view_or_trigger_or_sp_or_event
@@ -2259,7 +2281,7 @@ opt_ev_status:
 ev_starts:
           /* empty */
           {
-            Item *item= new (YYTHD->mem_root) Item_func_now_local();
+            Item *item= new (YYTHD->mem_root) Item_func_now_local(0);
             if (item == NULL)
               MYSQL_YYABORT;
             Lex->event_parse_data->item_starts= item;
@@ -4945,6 +4967,12 @@ opt_part_option:
             part_info->curr_part_elem->engine_type= $4;
             part_info->default_engine_type= $4;
           }
+        | CONNECTION_SYM opt_equal TEXT_STRING_sys
+          {
+            LEX *lex= Lex;
+            lex->part_info->curr_part_elem->connect_string.str= $3.str;
+            lex->part_info->curr_part_elem->connect_string.length= $3.length;
+          }
         | NODEGROUP_SYM opt_equal real_ulong_num
           { Lex->part_info->curr_part_elem->nodegroup_id= (uint16) $3; }
         | MAX_ROWS opt_equal real_ulonglong_num
@@ -5610,9 +5638,9 @@ type:
           { $$=MYSQL_TYPE_YEAR; }
         | DATE_SYM
           { $$=MYSQL_TYPE_DATE; }
-        | TIME_SYM
+        | TIME_SYM opt_field_length
           { $$=MYSQL_TYPE_TIME; }
-        | TIMESTAMP
+        | TIMESTAMP opt_field_length
           {
             if (YYTHD->variables.sql_mode & MODE_MAXDB)
               $$=MYSQL_TYPE_DATETIME;
@@ -5625,7 +5653,7 @@ type:
               $$=MYSQL_TYPE_TIMESTAMP;
             }
           }
-        | DATETIME
+        | DATETIME opt_field_length
           { $$=MYSQL_TYPE_DATETIME; }
         | TINYBLOB
           {
@@ -5819,9 +5847,9 @@ attribute:
           NULL_SYM { Lex->type&= ~ NOT_NULL_FLAG; }
         | not NULL_SYM { Lex->type|= NOT_NULL_FLAG; }
         | DEFAULT now_or_signed_literal { Lex->default_value=$2; }
-        | ON UPDATE_SYM NOW_SYM optional_braces
+        | ON UPDATE_SYM NOW_SYM opt_time_precision
           {
-            Item *item= new (YYTHD->mem_root) Item_func_now_local();
+            Item *item= new (YYTHD->mem_root) Item_func_now_local($4);
             if (item == NULL)
               MYSQL_YYABORT;
             Lex->on_update_value= item;
@@ -5913,9 +5941,9 @@ type_with_opt_collate:
 
 
 now_or_signed_literal:
-          NOW_SYM optional_braces
+          NOW_SYM opt_time_precision
           {
-            $$= new (YYTHD->mem_root) Item_func_now_local();
+            $$= new (YYTHD->mem_root) Item_func_now_local($2);
             if ($$ == NULL)
               MYSQL_YYABORT;
           }
@@ -6355,7 +6383,7 @@ string_list:
 */
 
 alter:
-          ALTER opt_ignore TABLE_SYM table_ident
+          ALTER alter_options TABLE_SYM table_ident
           {
             THD *thd= YYTHD;
             LEX *lex= thd->lex;
@@ -6481,7 +6509,7 @@ alter:
               my_error(ER_SP_BADSTATEMENT, MYF(0), "ALTER VIEW");
               MYSQL_YYABORT;
             }
-            lex->create_view_algorithm= VIEW_ALGORITHM_UNDEFINED;
+            lex->create_view_algorithm= DTYPE_ALGORITHM_UNDEFINED;
             lex->create_view_mode= VIEW_ALTER;
           }
           view_tail
@@ -6971,6 +6999,25 @@ opt_ignore:
         | IGNORE_SYM { Lex->ignore= 1;}
         ;
 
+alter_options:   /* modifiers between ALTER and TABLE; both flags start cleared */
+        { Lex->ignore= Lex->online= 0;} alter_options_part2
+        ;
+
+alter_options_part2:
+          /* empty */
+        | alter_option_list
+        ;
+
+alter_option_list:   /* one or more options; a repeated option is accepted */
+          alter_option_list alter_option
+        | alter_option
+        ;
+
+alter_option:
+          IGNORE_SYM { Lex->ignore= 1;}
+        | ONLINE_SYM { Lex->online= 1;}
+        ;
+
 opt_restrict:
           /* empty */ { Lex->drop_mode= DROP_DEFAULT; }
         | RESTRICT    { Lex->drop_mode= DROP_RESTRICT; }
@@ -7664,6 +7711,12 @@ select_alias:
         | TEXT_STRING_sys { $$=$1; }
         ;
 
+opt_time_precision:   /* optional "(N)" suffix; yields 0 when absent or written as "()" */
+          /* empty */             { $$= 0;  }
+        | '(' ')'                 { $$= 0;  }
+        | '(' real_ulong_num ')'  { $$= $2; }
+        ;
+
 optional_braces:
           /* empty */ {}
         | '(' ')' {}
@@ -7725,7 +7778,7 @@ expr:
         | expr XOR expr %prec XOR
           {
             /* XOR is a proprietary extension */
-            $$ = new (YYTHD->mem_root) Item_cond_xor($1, $3);
+            $$ = new (YYTHD->mem_root) Item_func_xor($1, $3);
             if ($$ == NULL)
               MYSQL_YYABORT;
           }
@@ -8079,6 +8132,133 @@ all_or_any:
         | ANY_SYM { $$ = 0; }
         ;
 
+opt_dyncol_type:   /* optional "AS <type>" after a dynamic-column value */
+          /* empty */ 
+          {
+            LEX *lex= Lex;
+	    $$= DYN_COL_NULL; /* automatic type */
+            lex->charset= NULL;
+            lex->length= lex->dec= 0;   /* no explicit type: clear the type attributes kept in LEX */
+	  }
+        | AS dyncol_type { $$= $2; }
+        ;
+
+dyncol_type:   /* type keyword following AS; yields a DYN_COL_* code, attributes are left in LEX */
+          INT_SYM
+          {
+            LEX *lex= Lex;
+            $$= DYN_COL_INT;
+            lex->charset= NULL;
+            lex->length= lex->dec= 0;
+          }
+        | UNSIGNED INT_SYM 
+          {
+            LEX *lex= Lex;
+            $$= DYN_COL_UINT;
+            lex->charset= NULL;
+            lex->length= lex->dec= 0;
+          }
+        | DOUBLE_SYM
+          {
+            LEX *lex= Lex;
+            $$= DYN_COL_DOUBLE;
+            lex->charset= NULL;
+            lex->length= lex->dec= 0;
+          }
+        | REAL
+          {
+            LEX *lex= Lex;
+            $$= DYN_COL_DOUBLE;         /* REAL maps to the same code as DOUBLE */
+            lex->charset= NULL;
+            lex->length= lex->dec= 0;
+          }
+        | FLOAT_SYM
+          {
+            LEX *lex= Lex;
+            $$= DYN_COL_DOUBLE;         /* FLOAT maps to the same code as DOUBLE */
+            lex->charset= NULL;
+            lex->length= lex->dec= 0;
+          }
+        | DECIMAL_SYM float_options
+          {
+            $$= DYN_COL_DECIMAL;
+            Lex->charset= NULL;         /* length/dec are not reset here; presumably set by float_options */
+          }
+        | char opt_binary
+          {
+            LEX *lex= Lex;
+            $$= DYN_COL_STRING;
+            lex->length= lex->dec= 0;   /* charset is not touched here; presumably set by opt_binary */
+          }
+        | nchar
+          {
+            LEX *lex= Lex;
+            $$= DYN_COL_STRING;
+            lex->charset= national_charset_info;
+            lex->length= lex->dec= 0;
+          }
+        | DATE_SYM
+          {
+            LEX *lex= Lex;
+            $$= DYN_COL_DATE;
+            lex->charset= NULL;
+            lex->length= lex->dec= 0;
+          }
+        | TIME_SYM opt_field_length
+          {
+            LEX *lex= Lex;
+            $$= DYN_COL_TIME;
+            lex->charset= NULL;
+            lex->dec= lex->length;      /* move the parenthesised value from length to dec */
+            lex->length= 0;
+          }
+        | DATETIME opt_field_length
+          {
+            LEX *lex= Lex;
+            $$= DYN_COL_DATETIME;
+            lex->charset= NULL;
+            lex->dec= lex->length;      /* same length-to-dec move as for TIME */
+            lex->length= 0;
+          }
+        ;
+
+dyncall_create_element:   /* one "number, value [AS type]" item of a dynamic-column list */
+   expr ',' expr opt_dyncol_type
+   {
+     LEX *lex= Lex;
+     $$= (DYNCALL_CREATE_DEF *)
+       alloc_root(YYTHD->mem_root, sizeof(DYNCALL_CREATE_DEF));
+     if ($$ == NULL)
+       MYSQL_YYABORT;
+     $$->num= $1;
+     $$->value= $3;
+     $$->type= (DYNAMIC_COLUMN_TYPE)$4;
+     $$->cs= lex->charset;
+     if (lex->length)                 /* length/dec are digit strings, or NULL when not given */
+       $$->len= strtoul(lex->length, NULL, 10);
+     else
+       $$->len= 0;
+     if (lex->dec)
+       $$->frac= strtoul(lex->dec, NULL, 10);
+     else
+       $$->frac= 0;                   /* frac has no other initialiser here; len must keep its value */
+   }
+
+dyncall_create_list:   /* comma-separated dyncall_create_element items collected into a List */
+     dyncall_create_element
+       {
+         $$= new (YYTHD->mem_root) List<DYNCALL_CREATE_DEF>;   /* first item: create the list */
+         if ($$ == NULL)
+           MYSQL_YYABORT;
+         $$->push_back($1);
+       }
+   | dyncall_create_list ',' dyncall_create_element
+       {
+         $1->push_back($3);            /* append to the list built so far and pass it up */
+         $$= $1;
+       }
+   ;
+
 simple_expr:
           simple_ident
         | function_call_keyword
@@ -8342,13 +8522,13 @@ function_call_keyword:
           }
         | TIME_SYM '(' expr ')'
           {
-            $$= new (YYTHD->mem_root) Item_time_typecast($3);
+            $$= new (YYTHD->mem_root) Item_time_typecast($3, AUTO_SEC_PART_DIGITS);
             if ($$ == NULL)
               MYSQL_YYABORT;
           }
         | TIMESTAMP '(' expr ')'
           {
-            $$= new (YYTHD->mem_root) Item_datetime_typecast($3);
+            $$= new (YYTHD->mem_root) Item_datetime_typecast($3, AUTO_SEC_PART_DIGITS);
             if ($$ == NULL)
               MYSQL_YYABORT;
           }
@@ -8455,16 +8635,9 @@ function_call_nonkeyword:
               MYSQL_YYABORT;
             Lex->safe_to_cache_query=0;
           }
-        | CURTIME optional_braces
+        | CURTIME opt_time_precision
           {
-            $$= new (YYTHD->mem_root) Item_func_curtime_local();
-            if ($$ == NULL)
-              MYSQL_YYABORT;
-            Lex->safe_to_cache_query=0;
-          }
-        | CURTIME '(' expr ')'
-          {
-            $$= new (YYTHD->mem_root) Item_func_curtime_local($3);
+            $$= new (YYTHD->mem_root) Item_func_curtime_local($2);
             if ($$ == NULL)
               MYSQL_YYABORT;
             Lex->safe_to_cache_query=0;
@@ -8495,16 +8668,9 @@ function_call_nonkeyword:
             if ($$ == NULL)
               MYSQL_YYABORT;
           }
-        | NOW_SYM optional_braces
-          {
-            $$= new (YYTHD->mem_root) Item_func_now_local();
-            if ($$ == NULL)
-              MYSQL_YYABORT;
-            Lex->safe_to_cache_query=0;
-          }
-        | NOW_SYM '(' expr ')'
+        | NOW_SYM opt_time_precision
           {
-            $$= new (YYTHD->mem_root) Item_func_now_local($3);
+            $$= new (YYTHD->mem_root) Item_func_now_local($2);
             if ($$ == NULL)
               MYSQL_YYABORT;
             Lex->safe_to_cache_query=0;
@@ -8552,7 +8718,7 @@ function_call_nonkeyword:
             if ($$ == NULL)
               MYSQL_YYABORT;
           }
-        | SYSDATE optional_braces
+        | SYSDATE opt_time_precision
           {
             /*
               Unlike other time-related functions, SYSDATE() is
@@ -8563,19 +8729,9 @@ function_call_nonkeyword:
             */
             Lex->set_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_SYSTEM_FUNCTION);
             if (global_system_variables.sysdate_is_now == 0)
-              $$= new (YYTHD->mem_root) Item_func_sysdate_local();
+              $$= new (YYTHD->mem_root) Item_func_sysdate_local($2);
             else
-              $$= new (YYTHD->mem_root) Item_func_now_local();
-            if ($$ == NULL)
-              MYSQL_YYABORT;
-            Lex->safe_to_cache_query=0;
-          }
-        | SYSDATE '(' expr ')'
-          {
-            if (global_system_variables.sysdate_is_now == 0)
-              $$= new (YYTHD->mem_root) Item_func_sysdate_local($3);
-            else
-              $$= new (YYTHD->mem_root) Item_func_now_local($3);
+              $$= new (YYTHD->mem_root) Item_func_now_local($2);
             if ($$ == NULL)
               MYSQL_YYABORT;
             Lex->safe_to_cache_query=0;
@@ -8599,20 +8755,65 @@ function_call_nonkeyword:
               MYSQL_YYABORT;
             Lex->safe_to_cache_query=0;
           }
-        | UTC_TIME_SYM optional_braces
+        | UTC_TIME_SYM opt_time_precision
           {
-            $$= new (YYTHD->mem_root) Item_func_curtime_utc();
+            $$= new (YYTHD->mem_root) Item_func_curtime_utc($2);
             if ($$ == NULL)
               MYSQL_YYABORT;
             Lex->safe_to_cache_query=0;
           }
-        | UTC_TIMESTAMP_SYM optional_braces
+        | UTC_TIMESTAMP_SYM opt_time_precision
           {
-            $$= new (YYTHD->mem_root) Item_func_now_utc();
+            $$= new (YYTHD->mem_root) Item_func_now_utc($2);
             if ($$ == NULL)
               MYSQL_YYABORT;
             Lex->safe_to_cache_query=0;
           }
+        |
+          COLUMN_ADD_SYM '(' expr ',' dyncall_create_list ')'
+          {
+            $$= create_func_dyncol_add(YYTHD, $3, *$5);
+            if ($$ == NULL)
+              MYSQL_YYABORT;
+          }
+        |
+          COLUMN_DELETE_SYM '(' expr ',' expr_list ')'
+          {
+            $$= create_func_dyncol_delete(YYTHD, $3, *$5);
+            if ($$ == NULL)
+              MYSQL_YYABORT;
+          }
+        |
+          COLUMN_EXISTS_SYM '(' expr ',' expr ')'
+          {
+            $$= new (YYTHD->mem_root) Item_func_dyncol_exists($3, $5);
+            if ($$ == NULL)
+              MYSQL_YYABORT;
+          }
+        |
+          COLUMN_LIST_SYM '(' expr ')'
+          {
+            $$= new (YYTHD->mem_root) Item_func_dyncol_list($3);
+            if ($$ == NULL)
+              MYSQL_YYABORT;
+          }
+        |
+          COLUMN_CREATE_SYM '(' dyncall_create_list ')'
+          {
+            $$= create_func_dyncol_create(YYTHD, *$3);
+            if ($$ == NULL)
+              MYSQL_YYABORT;
+          }
+        |
+          COLUMN_GET_SYM '(' expr ',' expr AS cast_type ')'
+          {
+            LEX *lex= Lex;
+            $$= create_func_dyncol_get(YYTHD, $3, $5, $7,
+                                        lex->length, lex->dec,
+                                        lex->charset);
+            if ($$ == NULL)
+              MYSQL_YYABORT;
+          }
         ;
 
 /*
@@ -9214,6 +9415,8 @@ cast_type:
           { $$=ITEM_CAST_CHAR; Lex->dec= 0; }
         | NCHAR_SYM opt_field_length
           { $$=ITEM_CAST_CHAR; Lex->charset= national_charset_info; Lex->dec=0; }
+        | INT_SYM
+          { $$=ITEM_CAST_SIGNED_INT; Lex->charset= NULL; Lex->dec=Lex->length= (char*)0; }
         | SIGNED_SYM
           { $$=ITEM_CAST_SIGNED_INT; Lex->charset= NULL; Lex->dec=Lex->length= (char*)0; }
         | SIGNED_SYM INT_SYM
@@ -9224,13 +9427,24 @@ cast_type:
           { $$=ITEM_CAST_UNSIGNED_INT; Lex->charset= NULL; Lex->dec=Lex->length= (char*)0; }
         | DATE_SYM
           { $$=ITEM_CAST_DATE; Lex->charset= NULL; Lex->dec=Lex->length= (char*)0; }
-        | TIME_SYM
-          { $$=ITEM_CAST_TIME; Lex->charset= NULL; Lex->dec=Lex->length= (char*)0; }
-        | DATETIME
-          { $$=ITEM_CAST_DATETIME; Lex->charset= NULL; Lex->dec=Lex->length= (char*)0; }
+        | TIME_SYM opt_field_length
+          {
+            $$=ITEM_CAST_TIME;
+            LEX *lex= Lex;
+            lex->charset= NULL; lex->dec= lex->length; lex->length= (char*)0;
+           }
+        | DATETIME opt_field_length
+          {
+            $$=ITEM_CAST_DATETIME;
+            LEX *lex= Lex;
+            lex->charset= NULL; lex->dec= lex->length; lex->length= (char*)0;
+           }
         | DECIMAL_SYM float_options
           { $$=ITEM_CAST_DECIMAL; Lex->charset= NULL; }
-        ;
+        | DOUBLE_SYM
+          { Lex->charset= NULL; Lex->length= Lex->dec= 0;}
+          opt_precision
+          { $$=ITEM_CAST_DOUBLE; }
 
 opt_expr_list:
           /* empty */ { $$= NULL; }
@@ -9736,7 +9950,7 @@ opt_outer:
 index_hint_clause:
           /* empty */
           {
-            $$= old_mode ?  INDEX_HINT_MASK_JOIN : INDEX_HINT_MASK_ALL; 
+            $$= YYTHD->variables.old_mode ?  INDEX_HINT_MASK_JOIN : INDEX_HINT_MASK_ALL; 
           }
         | FOR_SYM JOIN_SYM      { $$= INDEX_HINT_MASK_JOIN;  }
         | FOR_SYM ORDER_SYM BY  { $$= INDEX_HINT_MASK_ORDER; }
@@ -9881,7 +10095,7 @@ where_clause:
           expr
           {
             SELECT_LEX *select= Select;
-            select->where= $3;
+            select->where= normalize_cond($3);
             select->parsing_place= NO_MATTER;
             if ($3)
               $3->top_level_item();
@@ -9897,7 +10111,7 @@ having_clause:
           expr
           {
             SELECT_LEX *sel= Select;
-            sel->having= $3;
+            sel->having= normalize_cond($3);
             sel->parsing_place= NO_MATTER;
             if ($3)
               $3->top_level_item();
@@ -11402,7 +11616,7 @@ wild_and_where:
           }
         | WHERE expr
           {
-            Select->where= $2;
+            Select->where= normalize_cond($2);
             if ($2)
               $2->top_level_item();
           }
@@ -11488,10 +11702,10 @@ flush_options:
 
 opt_with_read_lock:
           /* empty */ {}
-        | WITH READ_SYM LOCK_SYM
+        | WITH READ_SYM LOCK_SYM optional_flush_tables_arguments
           {
             TABLE_LIST *tables= Lex->query_tables;
-            Lex->type|= REFRESH_READ_LOCK;
+            Lex->type|= REFRESH_READ_LOCK | $4;
             for (; tables; tables= tables->next_global)
             {
               tables->mdl_request.set_type(MDL_SHARED_NO_WRITE);
@@ -11553,6 +11767,10 @@ opt_table_list:
         | table_list {}
         ;
 
+optional_flush_tables_arguments:
+          /* empty */        {$$= 0;}
+        | AND_SYM DISABLE_SYM CHECKPOINT_SYM {$$= REFRESH_CHECKPOINT; } 
+
 reset:
           RESET_SYM
           {
@@ -12599,7 +12817,14 @@ keyword:
         | CACHE_SYM             {}
         | CHARSET               {}
         | CHECKSUM_SYM          {}
+        | CHECKPOINT_SYM        {}
         | CLOSE_SYM             {}
+        | COLUMN_ADD_SYM        {}
+        | COLUMN_CREATE_SYM     {}
+        | COLUMN_DELETE_SYM     {}
+        | COLUMN_EXISTS_SYM     {}
+        | COLUMN_GET_SYM        {}
+        | COLUMN_LIST_SYM       {}
         | COMMENT_SYM           {}
         | COMMIT_SYM            {}
         | CONTAINS_SYM          {}
@@ -12820,6 +13045,7 @@ keyword_sp:
         | OLD_PASSWORD             {}
         | ONE_SHOT_SYM             {}
         | ONE_SYM                  {}
+        | ONLINE_SYM               {}
         | PACK_KEYS_SYM            {}
         | PAGE_SYM                 {}
         | PARTIAL                  {}
@@ -13915,7 +14141,7 @@ column_list_id:
             while ((point=iter++))
             {
               if (!my_strcasecmp(system_charset_info,
-                                 point->column.ptr(), new_str->ptr()))
+                                 point->column.c_ptr(), new_str->c_ptr()))
                 break;
             }
             lex->grant_tot_col|= lex->which_columns;
@@ -14347,7 +14573,7 @@ view_replace:
 
 view_algorithm:
           ALGORITHM_SYM EQ UNDEFINED_SYM
-          { Lex->create_view_algorithm= VIEW_ALGORITHM_UNDEFINED; }
+          { Lex->create_view_algorithm= DTYPE_ALGORITHM_UNDEFINED; }
         | ALGORITHM_SYM EQ MERGE_SYM
           { Lex->create_view_algorithm= VIEW_ALGORITHM_MERGE; }
         | ALGORITHM_SYM EQ TEMPTABLE_SYM
@@ -14863,6 +15089,13 @@ uninstall:
           }
         ;
 
+/* Avoid compiler warning from sql_yacc.cc where yyerrlab1 is not used */
+keep_gcc_happy:
+	IMPOSSIBLE_ACTION
+	{
+	  YYERROR;
+	}
+
 /**
   @} (end of group Parser)
 */
diff --git a/sql/structs.h b/sql/structs.h
index fb2f2f6fec8..0d0b32e3d41 100644
--- a/sql/structs.h
+++ b/sql/structs.h
@@ -83,12 +83,12 @@ typedef struct st_key_part_info {	/* Info about a key part */
 } KEY_PART_INFO ;
 
 class engine_option_value;
+struct ha_index_option_struct;
 
 typedef struct st_key {
   uint	key_length;			/* Tot length of key */
   ulong flags;                          /* dupp key and pack flags */
   uint	key_parts;			/* How many key_parts */
-  uint  extra_length;
   uint	usable_key_parts;		/* Should normally be = key_parts */
   uint  block_size;
   uint  name_length;
@@ -119,7 +119,7 @@ typedef struct st_key {
   LEX_STRING comment;
   /** reference to the list of options or NULL */
   engine_option_value *option_list;
-  void *option_struct;                  /* structure with parsed options */
+  ha_index_option_struct *option_struct;                  /* structure with parsed options */
 } KEY;
 
 
diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc
index 8126ae091f2..b6981ea08bd 100644
--- a/sql/sys_vars.cc
+++ b/sql/sys_vars.cc
@@ -706,7 +706,7 @@ static Sys_var_ulong Sys_flush_time(
        "given interval",
        GLOBAL_VAR(flush_time),
        CMD_LINE(REQUIRED_ARG), VALID_RANGE(0, LONG_TIMEOUT),
-       DEFAULT(FLUSH_TIME), BLOCK_SIZE(1));
+       DEFAULT(0), BLOCK_SIZE(1));
 
 static bool check_ftb_syntax(sys_var *self, THD *thd, set_var *var)
 {
@@ -776,8 +776,8 @@ static bool check_init_string(sys_var *self, THD *thd, set_var *var)
 static PolyLock_rwlock PLock_sys_init_connect(&LOCK_sys_init_connect);
 static Sys_var_lexstring Sys_init_connect(
        "init_connect", "Command(s) that are executed for each "
-       "new connection", GLOBAL_VAR(opt_init_connect),
-       CMD_LINE(REQUIRED_ARG), IN_SYSTEM_CHARSET,
+       "new connection (unless the user has SUPER privilege)",
+       GLOBAL_VAR(opt_init_connect), CMD_LINE(REQUIRED_ARG), IN_SYSTEM_CHARSET,
        DEFAULT(""), &PLock_sys_init_connect, NOT_IN_BINLOG,
        ON_CHECK(check_init_string));
 
@@ -809,7 +809,7 @@ static Sys_var_ulong Sys_interactive_timeout(
 
 static Sys_var_ulong Sys_join_buffer_size(
        "join_buffer_size",
-       "The size of the buffer that is used for full joins",
+       "The size of the buffer that is used for joins",
        SESSION_VAR(join_buff_size), CMD_LINE(REQUIRED_ARG),
        VALID_RANGE(128, ULONG_MAX), DEFAULT(128*1024), BLOCK_SIZE(128));
 
@@ -910,7 +910,10 @@ static Sys_var_mybool Sys_trust_function_creators(
        CMD_LINE(OPT_ARG), DEFAULT(FALSE));
 
 static Sys_var_charptr Sys_log_error(
-       "log_error", "Error log file",
+       "log_error",
+       "Log errors to file (instead of stdout).  If file name is not specified "
+       "then 'datadir'/'log-basename'.err or the pid-file path with extension "
+       ".err is used",
        READ_ONLY GLOBAL_VAR(log_error_file_ptr),
        CMD_LINE(OPT_ARG, OPT_LOG_ERROR),
        IN_FS_CHARSET, DEFAULT(disabled_my_option));
@@ -924,7 +927,7 @@ static Sys_var_mybool Sys_log_queries_not_using_indexes(
 
 static Sys_var_ulong Sys_log_warnings(
        "log_warnings",
-       "Log some not critical warnings to the log file",
+       "Log some not critical warnings to the general log file",
        SESSION_VAR(log_warnings),
        CMD_LINE(OPT_ARG, 'W'),
        VALID_RANGE(0, ULONG_MAX), DEFAULT(1), BLOCK_SIZE(1));
@@ -1332,13 +1335,9 @@ static Sys_var_ulong Sys_net_retry_count(
        BLOCK_SIZE(1), NO_MUTEX_GUARD, NOT_IN_BINLOG, ON_CHECK(0),
        ON_UPDATE(fix_net_retry_count));
 
-static Sys_var_mybool Sys_new_mode(
-       "new", "Use very new possible \"unsafe\" functions",
-       SESSION_VAR(new_mode), CMD_LINE(OPT_ARG, 'n'), DEFAULT(FALSE));
-
 static Sys_var_mybool Sys_old_mode(
        "old", "Use compatible behavior",
-       READ_ONLY GLOBAL_VAR(old_mode), CMD_LINE(OPT_ARG), DEFAULT(FALSE));
+       SESSION_VAR(old_mode), CMD_LINE(OPT_ARG), DEFAULT(FALSE));
 
 static Sys_var_mybool Sys_old_alter_table(
        "old_alter_table", "Use old, non-optimized alter table",
@@ -1399,18 +1398,28 @@ static Sys_var_ulong Sys_optimizer_search_depth(
        NO_MUTEX_GUARD, NOT_IN_BINLOG, ON_CHECK(0),
        ON_UPDATE(fix_optimizer_search_depth));
 
-static const char *optimizer_switch_names[]=
+/* this is used in the sigsegv handler */
+export const char *optimizer_switch_names[]=
 {
-  "index_merge", "index_merge_union", "index_merge_sort_union",
-  "index_merge_intersection", "engine_condition_pushdown",
+  "index_merge","index_merge_union","index_merge_sort_union",
+  "index_merge_intersection","index_merge_sort_intersection",
+  "engine_condition_pushdown",
   "index_condition_pushdown",
-  "firstmatch","loosescan","materialization", "semijoin",
+  "derived_merge", "derived_with_keys",
+  "firstmatch","loosescan","materialization","in_to_exists","semijoin",
   "partial_match_rowid_merge",
   "partial_match_table_scan",
   "subquery_cache",
-#ifndef DBUG_OFF
+  "mrr",
+  "mrr_cost_based",
+  "mrr_sort_keys",
+  "outer_join_with_cache",
+  "semijoin_with_cache",
+  "join_cache_incremental",
+  "join_cache_hashed",
+  "join_cache_bka",
+  "optimize_join_buffer_size",
   "table_elimination",
-#endif
   "default", NullS
 };
 /** propagates changes to @@engine_condition_pushdown */
@@ -1424,13 +1433,35 @@ static bool fix_optimizer_switch(sys_var *self, THD *thd,
 }
 static Sys_var_flagset Sys_optimizer_switch(
        "optimizer_switch",
-       "optimizer_switch=option=val[,option=val...], where option is one of "
-       "{index_merge, index_merge_union, index_merge_sort_union, "
-       "index_merge_intersection, engine_condition_pushdown, "
-       "index_condition_pushdown, firstmatch, loosescan, materialization, "
-       "semijoin, partial_match_rowid_merge, partial_match_table_scan, "
-       "subquery_cache} "
-       " and val is one of {on, off, default}",
+       "optimizer_switch=option=val[,option=val...], where option is one of {"
+        "derived_merge, "
+        "derived_with_keys, "
+        "firstmatch, "
+        "in_to_exists, "
+        "engine_condition_pushdown, "
+        "index_condition_pushdown, "
+        "index_merge, "
+        "index_merge_intersection, "
+        "index_merge_sort_intersection, "
+        "index_merge_sort_union, "
+        "index_merge_union, "
+        "join_cache_bka, "
+        "join_cache_hashed, "
+        "join_cache_incremental, "
+        "loosescan, "
+        "materialization, "
+        "mrr, "
+        "mrr_cost_based, "
+        "mrr_sort_keys, "
+        "optimize_join_buffer_size, "
+        "outer_join_with_cache, "
+        "partial_match_rowid_merge, "
+        "partial_match_table_scan, "
+        "semijoin, "
+        "semijoin_with_cache, "
+        "subquery_cache, "
+        "table_elimination "
+       "} and val is one of {on, off, default}",
        SESSION_VAR(optimizer_switch), CMD_LINE(REQUIRED_ARG),
        optimizer_switch_names, DEFAULT(OPTIMIZER_SWITCH_DEFAULT),
        NO_MUTEX_GUARD, NOT_IN_BINLOG, ON_CHECK(NULL),
@@ -1805,11 +1836,36 @@ static Sys_var_ulong Sys_query_cache_min_res_unit(
 static const char *query_cache_type_names[]= { "OFF", "ON", "DEMAND", 0 };
 static bool check_query_cache_type(sys_var *self, THD *thd, set_var *var)
 {
-  if (query_cache.is_disabled())
+  if (query_cache.is_disable_in_progress())
   {
     my_error(ER_QUERY_CACHE_DISABLED, MYF(0));
     return true;
   }
+  if (var->type != OPT_GLOBAL &&
+      global_system_variables.query_cache_type == 0 &&
+      var->value->val_int() != 0)
+  {
+    my_error(ER_QUERY_CACHE_IS_GLOBALY_DISABLED, MYF(0));
+    return true;
+  }
+
+  return false;
+}
+static bool fix_query_cache_type(sys_var *self, THD *thd, enum_var_type type)
+{
+  if (type != OPT_GLOBAL)
+    return false;
+
+  if (global_system_variables.query_cache_type != 0 &&
+      query_cache.is_disabled())
+  {
+    /* If disabling is in progress, the variable will not be set */
+    DBUG_ASSERT(!query_cache.is_disable_in_progress());
+    /* Enable query cache because it was disabled */
+    fix_query_cache_size(0, thd, type);
+  }
+  else if (global_system_variables.query_cache_type == 0)
+    query_cache.disable_query_cache(thd);
   return false;
 }
 static Sys_var_enum Sys_query_cache_type(
@@ -1819,7 +1875,8 @@ static Sys_var_enum Sys_query_cache_type(
        "SELECT SQL_CACHE ... queries",
        SESSION_VAR(query_cache_type), CMD_LINE(REQUIRED_ARG),
        query_cache_type_names, DEFAULT(1), NO_MUTEX_GUARD, NOT_IN_BINLOG,
-       ON_CHECK(check_query_cache_type));
+       ON_CHECK(check_query_cache_type),
+       ON_UPDATE(fix_query_cache_type));
 
 static Sys_var_mybool Sys_query_cache_wlock_invalidate(
        "query_cache_wlock_invalidate",
@@ -1873,6 +1930,7 @@ static Sys_var_enum Slave_exec_mode(
        "between the master and the slave",
        GLOBAL_VAR(slave_exec_mode_options), CMD_LINE(REQUIRED_ARG),
        slave_exec_mode_names, DEFAULT(SLAVE_EXEC_MODE_STRICT));
+
 static const char *slave_type_conversions_name[]= {"ALL_LOSSY", "ALL_NON_LOSSY", 0};
 static Sys_var_set Slave_type_conversions(
        "slave_type_conversions",
@@ -1884,6 +1942,22 @@ static Sys_var_set Slave_type_conversions(
        GLOBAL_VAR(slave_type_conversions_options), CMD_LINE(REQUIRED_ARG),
        slave_type_conversions_name,
        DEFAULT(0));
+
+static Sys_var_mybool Sys_slave_sql_verify_checksum(
+       "slave_sql_verify_checksum",
+       "Force checksum verification of replication events after reading them "
+       "from relay log. Note: Events are always checksum-verified by slave on "
+       "receiving them from the network before writing them to the relay log",
+       GLOBAL_VAR(opt_slave_sql_verify_checksum), CMD_LINE(OPT_ARG),
+       DEFAULT(TRUE));
+
+static Sys_var_mybool Sys_master_verify_checksum(
+       "master_verify_checksum",
+       "Force checksum verification of logged events in the binary log before "
+       "sending them to slaves or printing them in the output of "
+       "SHOW BINLOG EVENTS",
+       GLOBAL_VAR(opt_master_verify_checksum), CMD_LINE(OPT_ARG),
+       DEFAULT(FALSE));
 #endif
 
 
@@ -2443,41 +2517,24 @@ static Sys_var_harows Sys_select_limit(
 static bool update_timestamp(THD *thd, set_var *var)
 {
   if (var->value)
-    thd->set_time((time_t) var->save_result.ulonglong_value);
+  {
+    my_hrtime_t hrtime = { hrtime_from_time(var->save_result.double_value) };
+    thd->set_time(hrtime);
+  }
   else // SET timestamp=DEFAULT
-    thd->user_time= 0;
+    thd->user_time.val= 0;
   return false;
 }
-static ulonglong read_timestamp(THD *thd)
+static double read_timestamp(THD *thd)
 {
-  return (ulonglong) thd->start_time;
-}
-
-
-static bool check_timestamp(sys_var *self, THD *thd, set_var *var)
-{
-  longlong val;
-
-  if (!var->value)
-    return FALSE;
-
-  val= (longlong) var->save_result.ulonglong_value;
-  if (val != 0 &&          // this is how you set the default value
-      (val < TIMESTAMP_MIN_VALUE || val > TIMESTAMP_MAX_VALUE))
-  {
-    char buf[64];
-    my_error(ER_WRONG_VALUE_FOR_VAR, MYF(0), "timestamp", llstr(val, buf));
-    return TRUE;
-  }
-  return FALSE;
+  return thd->start_time +
+         thd->start_time_sec_part/(double)TIME_SECOND_PART_FACTOR;
 }
-
-
-static Sys_var_session_special Sys_timestamp(
+static Sys_var_session_special_double Sys_timestamp(
        "timestamp", "Set the time for this client",
        sys_var::ONLY_SESSION, NO_CMD_LINE,
-       VALID_RANGE(0, ~(time_t)0), BLOCK_SIZE(1),
-       NO_MUTEX_GUARD, IN_BINLOG, ON_CHECK(check_timestamp), 
+       VALID_RANGE(0, TIMESTAMP_MAX_VALUE),
+       NO_MUTEX_GUARD, IN_BINLOG, ON_CHECK(0), 
        ON_UPDATE(update_timestamp), ON_READ(read_timestamp));
 
 static bool update_last_insert_id(THD *thd, set_var *var)
@@ -2733,9 +2790,7 @@ static bool fix_log(char** logname, const char* default_logname,
 {
   if (!*logname) // SET ... = DEFAULT
   {
-    char buff[FN_REFLEN];
-    *logname= my_strdup(make_log_name(buff, default_logname, ext),
-                        MYF(MY_FAE+MY_WME));
+    make_default_log_name(logname, ext, false);
     if (!*logname)
       return true;
   }
@@ -2754,7 +2809,7 @@ static void reopen_general_log(char* name)
 }
 static bool fix_general_log_file(sys_var *self, THD *thd, enum_var_type type)
 {
-  return fix_log(&opt_logname, default_logfile_name, ".log", opt_log,
+  return fix_log(&opt_logname,  opt_log_basename, ".log", opt_log,
                  reopen_general_log);
 }
 static Sys_var_charptr Sys_general_log_path(
@@ -2770,7 +2825,7 @@ static void reopen_slow_log(char* name)
 }
 static bool fix_slow_log_file(sys_var *self, THD *thd, enum_var_type type)
 {
-  return fix_log(&opt_slow_logname, default_logfile_name, "-slow.log",
+  return fix_log(&opt_slow_logname, opt_log_basename, "-slow.log",
                  opt_slow_log, reopen_slow_log);
 }
 static Sys_var_charptr Sys_slow_log_path(
@@ -3158,8 +3213,9 @@ export const char *plugin_maturity_names[]=
 { "unknown", "experimental", "alpha", "beta", "gamma", "stable", 0 };
 static Sys_var_enum Sys_plugin_maturity(
        "plugin_maturity",
-       "The lowest desirable plugin maturity. Plugins less mature than "
-       "that will not be installed or loaded.",
+       "The lowest desirable plugin maturity "
+       "(unknown, experimental, alpha, beta, gamma, or stable). "
+       "Plugins less mature than that will not be installed or loaded.",
        READ_ONLY GLOBAL_VAR(plugin_maturity), CMD_LINE(REQUIRED_ARG),
        plugin_maturity_names, DEFAULT(MariaDB_PLUGIN_MATURITY_UNKNOWN));
 
@@ -3262,15 +3318,6 @@ static Sys_var_ulong Sys_join_cache_level(
        SESSION_VAR(join_cache_level), CMD_LINE(REQUIRED_ARG),
        VALID_RANGE(0, 8), DEFAULT(1), BLOCK_SIZE(1));
 
-static const char *optimizer_use_mrr_names[]= {"auto", "force", "disable", 0};
-static Sys_var_enum Sys_optimizer_use_mrr(
-       "optimizer_use_mrr", "Whether the server should use "
-       "multi-read-range optimization when resolving queries, "
-       "one of AUTO (as appropriate), FORCE (always where applicable), "
-       "DISABLE (never)",
-       SESSION_VAR(optimizer_use_mrr), CMD_LINE(REQUIRED_ARG),
-       optimizer_use_mrr_names, DEFAULT(1));
-
 static Sys_var_ulong Sys_mrr_buffer_size(
        "mrr_buffer_size",
        "Size of buffer to use when using MRR with range access",
@@ -3290,3 +3337,79 @@ static Sys_var_mybool Sys_userstat(
        "INDEX_STATISTICS and TABLE_STATISTICS tables in the INFORMATION_SCHEMA",
        GLOBAL_VAR(opt_userstat_running),
        CMD_LINE(OPT_ARG), DEFAULT(FALSE));
+
+static Sys_var_mybool Sys_binlog_annotate_row_events(
+       "binlog_annotate_rows_events",
+       "Tells the master to annotate RBR events with the statement that "
+       "caused these events",
+       SESSION_VAR(binlog_annotate_rows_events), CMD_LINE(OPT_ARG),
+       DEFAULT(FALSE));
+
+#ifdef HAVE_REPLICATION
+static Sys_var_mybool Sys_replicate_annotate_rows_events(
+       "replicate_annotate_rows_events",
+       "Tells the slave to write annotate rows events recieved from the master "
+       "to its own binary log. Ignored if log_slave_updates is not set",
+       READ_ONLY GLOBAL_VAR(opt_replicate_annotate_rows_events),
+       CMD_LINE(OPT_ARG), DEFAULT(0));
+#endif
+
+#if 0
+static Sys_var_mybool Sys_safemalloc(
+       "safemalloc",
+       "Check all memory allocations for every malloc/free call (can be slow)",
+       GLOBAL_VAR(sf_malloc_trough_check),
+       CMD_LINE(OPT_ARG), DEFAULT(FALSE));
+#endif
+
+static Sys_var_ulonglong Sys_join_buffer_space_limit(
+       "join_buffer_space_limit",
+       "The limit of the space for all join buffers used by a query",
+       SESSION_VAR(join_buff_space_limit), CMD_LINE(REQUIRED_ARG),
+       VALID_RANGE(2048, ULONGLONG_MAX), DEFAULT(16*128*1024),
+       BLOCK_SIZE(2048));
+
+static Sys_var_ulong Sys_progress_report_time(
+       "progress_report_time",
+       "Seconds between sending progress reports to the client for "
+       "time-consuming statements. Set to 0 to disable progress reporting.",
+       SESSION_VAR(progress_report_time), CMD_LINE(REQUIRED_ARG),
+       VALID_RANGE(0, ULONG_MAX), DEFAULT(56), BLOCK_SIZE(1));
+
+static Sys_var_mybool Sys_thread_alarm(
+       "thread_alarm",
+       "Enable system thread alarm calls. Disabling it may be useful "
+       "in debugging or testing, never do it in production",
+       READ_ONLY GLOBAL_VAR(opt_thread_alarm), CMD_LINE(OPT_ARG),
+       DEFAULT(TRUE));
+
+static Sys_var_charptr Sys_log_basename(
+       "log_basename",
+       "Basename for all log files and the .pid file. This sets all log file "
+       "names at once (in 'datadir') and is normally the only option you need "
+       "for specifying log files. This is especially recommend to be set if you "
+       "are using replication as it ensures that your log file names are not "
+       "depending on your host name. Sets names for --log-bin, --log-bin-index, "
+       "--relay-log, --relay-log-index, --general-log-file, "
+       "--log-slow-query-log-file, --log-error-file and --pid-file",
+       READ_ONLY GLOBAL_VAR(opt_log_basename),
+       CMD_LINE(REQUIRED_ARG, OPT_LOG_BASENAME),
+       IN_FS_CHARSET, DEFAULT(0));
+
+static Sys_var_mybool Sys_query_cache_strip_comments(
+       "query_cache_strip_comments",
+       "Strip all comments from a query before storing it "
+       "in the query cache",
+       GLOBAL_VAR(opt_query_cache_strip_comments), CMD_LINE(OPT_ARG),
+       DEFAULT(FALSE));
+
+static ulonglong in_transaction(THD *thd)
+{
+  return test(thd->server_status & SERVER_STATUS_IN_TRANS);
+}
+static Sys_var_session_special Sys_in_transaction(
+       "in_transaction", "Whether there is an active transaction",
+       READ_ONLY sys_var::ONLY_SESSION, NO_CMD_LINE,
+       VALID_RANGE(0, 1), BLOCK_SIZE(1), NO_MUTEX_GUARD,
+       NOT_IN_BINLOG, ON_CHECK(0), ON_UPDATE(0), ON_READ(in_transaction));
+
diff --git a/sql/sys_vars.h b/sql/sys_vars.h
index 943b26fa5c4..d2a1e2360b6 100644
--- a/sql/sys_vars.h
+++ b/sql/sys_vars.h
@@ -1448,6 +1448,57 @@ public:
   }
 };
 
+
+class Sys_var_session_special_double: public Sys_var_double
+{
+  typedef bool (*session_special_update_function)(THD *thd, set_var *var);
+  typedef double (*session_special_read_function)(THD *thd);
+
+  session_special_read_function read_func;
+  session_special_update_function update_func;
+public:
+  Sys_var_session_special_double(const char *name_arg,
+               const char *comment, int flag_args,
+               CMD_LINE getopt,
+               double min_val, double max_val,
+               PolyLock *lock, enum binlog_status_enum binlog_status_arg,
+               on_check_function on_check_func,
+               session_special_update_function update_func_arg,
+               session_special_read_function read_func_arg,
+               uint deprecated_version=0, const char *substitute=0)
+    : Sys_var_double(name_arg, comment, flag_args, 0,
+              sizeof(double), getopt, min_val,
+              max_val, 0, lock, binlog_status_arg, on_check_func, 0,
+              deprecated_version, substitute),
+      read_func(read_func_arg), update_func(update_func_arg)
+  {
+    DBUG_ASSERT(scope() == ONLY_SESSION);
+    DBUG_ASSERT(getopt.id == -1); // NO_CMD_LINE, because the offset is fake
+  }
+  bool session_update(THD *thd, set_var *var)
+  { return update_func(thd, var); }
+  bool global_update(THD *thd, set_var *var)
+  {
+    DBUG_ASSERT(FALSE);
+    return true;
+  }
+  void session_save_default(THD *thd, set_var *var)
+  { var->value= 0; }
+  void global_save_default(THD *thd, set_var *var)
+  { DBUG_ASSERT(FALSE); }
+  uchar *session_value_ptr(THD *thd, LEX_STRING *base)
+  {
+    thd->sys_var_tmp.double_value= read_func(thd);
+    return (uchar*) &thd->sys_var_tmp.double_value;
+  }
+  uchar *global_value_ptr(THD *thd, LEX_STRING *base)
+  {
+    DBUG_ASSERT(FALSE);
+    return 0;
+  }
+};
+
+
 /**
   The class for read-only variables that show whether a particular
   feature is supported by the server. Example: have_compression
diff --git a/sql/table.cc b/sql/table.cc
index 24b78d24b71..7bb5986f0f1 100644
--- a/sql/table.cc
+++ b/sql/table.cc
@@ -1,4 +1,5 @@
-/* Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2011, Oracle and/or its affiliates.
+   Copyright (c) 2009-2011, Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -36,6 +37,7 @@
 #include "my_md5.h"
 #include "my_bit.h"
 #include "sql_select.h"
+#include "sql_derived.h"
 #include "mdl.h"                 // MDL_wait_for_graph_visitor
 
 /* INFORMATION_SCHEMA name */
@@ -836,8 +838,6 @@ static int open_binary_frm(THD *thd, TABLE_SHARE *share, uchar *head,
   share->db_record_offset= 1;
   if (db_create_options & HA_OPTION_LONG_BLOB_PTR)
     share->blob_ptr_size= portable_sizeof_char_ptr;
-  /* Set temporarily a good value for db_low_byte_first */
-  share->db_low_byte_first= test(legacy_db_type != DB_TYPE_ISAM);
   error=4;
   share->max_rows= uint4korr(head+18);
   share->min_rows= uint4korr(head+22);
@@ -1592,7 +1592,6 @@ static int open_binary_frm(THD *thd, TABLE_SHARE *share, uchar *head,
           key_part->null_bit= field->null_bit;
           key_part->store_length+=HA_KEY_NULL_LENGTH;
           keyinfo->flags|=HA_NULL_PART_KEY;
-          keyinfo->extra_length+= HA_KEY_NULL_LENGTH;
           keyinfo->key_length+= HA_KEY_NULL_LENGTH;
         }
         if (field->type() == MYSQL_TYPE_BLOB ||
@@ -1604,7 +1603,6 @@ static int open_binary_frm(THD *thd, TABLE_SHARE *share, uchar *head,
             key_part->key_part_flag|= HA_BLOB_PART;
           else
             key_part->key_part_flag|= HA_VAR_LENGTH_PART;
-          keyinfo->extra_length+=HA_KEY_BLOB_LENGTH;
           key_part->store_length+=HA_KEY_BLOB_LENGTH;
           keyinfo->key_length+= HA_KEY_BLOB_LENGTH;
         }
@@ -1682,6 +1680,8 @@ static int open_binary_frm(THD *thd, TABLE_SHARE *share, uchar *head,
 #endif
           key_part->key_part_flag|= HA_PART_KEY_SEG;
         }
+        if (field->real_maybe_null())
+          key_part->key_part_flag|= HA_NULL_PART;
         /*
           Sometimes we can compare key parts for equality with memcmp.
           But not always.
@@ -1793,7 +1793,6 @@ static int open_binary_frm(THD *thd, TABLE_SHARE *share, uchar *head,
   share->can_cmp_whole_record= (share->blob_fields == 0 &&
                                 share->varchar_fields == 0);
 
-  share->db_low_byte_first= handler_file->low_byte_first();
   share->column_bitmap_size= bitmap_buffer_size(share->fields);
 
   if (!(bitmaps= (my_bitmap_map*) alloc_root(&share->mem_root,
@@ -2058,7 +2057,12 @@ bool unpack_vcol_info_from_frm(THD *thd,
   vcol_arena= table->expr_arena;
   if (!vcol_arena)
   {
-    Query_arena expr_arena(&table->mem_root, Query_arena::STMT_INITIALIZED);
+    /*
+      We need to use CONVENTIONAL_EXECUTION here to ensure that
+      any new items created by fix_fields() are not reverted.
+    */
+    Query_arena expr_arena(&table->mem_root,
+                           Query_arena::STMT_CONVENTIONAL_EXECUTION);
     if (!(vcol_arena= (Query_arena *) alloc_root(&table->mem_root,
                                                  sizeof(Query_arena))))
       goto err;
@@ -2144,7 +2148,7 @@ int open_table_from_share(THD *thd, TABLE_SHARE *share, const char *alias,
   bool error_reported= FALSE;
   uchar *record, *bitmaps;
   Field **field_ptr, **vfield_ptr;
-  uint8 save_view_prepare_mode= thd->lex->context_analysis_only;
+  uint8 save_context_analysis_only= thd->lex->context_analysis_only;
   DBUG_ENTER("open_table_from_share");
   DBUG_PRINT("enter",("name: '%s.%s'  form: 0x%lx", share->db.str,
                       share->table_name.str, (long) outparam));
@@ -2160,7 +2164,7 @@ int open_table_from_share(THD *thd, TABLE_SHARE *share, const char *alias,
 
   init_sql_alloc(&outparam->mem_root, TABLE_ALLOC_BLOCK_SIZE, 0);
 
-  if (!(outparam->alias= my_strdup(alias, MYF(MY_WME))))
+  if (outparam->alias.copy(alias, strlen(alias), table_alias_charset))
     goto err;
   outparam->quick_keys.init();
   outparam->covering_keys.init();
@@ -2393,12 +2397,11 @@ partititon_err:
 
   /* Check virtual columns against table's storage engine. */
   if (share->vfields && 
-        ((outparam->file && 
-          !outparam->file->check_if_supported_virtual_columns())))
+        !(outparam->file && 
+          (outparam->file->ha_table_flags() & HA_CAN_VIRTUAL_COLUMNS)))
   {
-    my_error(ER_UNSUPPORTED_ACTION_ON_VIRTUAL_COLUMN,
-             MYF(0), share->db_plugin ? plugin_name(share->db_plugin)->str :
-             "Specified storage engine");
+    my_error(ER_UNSUPPORTED_ENGINE_FOR_VIRTUAL_COLUMNS, MYF(0),
+             plugin_name(share->db_plugin)->str);
     error_reported= TRUE;
     goto err;
   }
@@ -2406,7 +2409,7 @@ partititon_err:
   /* Allocate bitmaps */
 
   bitmap_size= share->column_bitmap_size;
-  if (!(bitmaps= (uchar*) alloc_root(&outparam->mem_root, bitmap_size*4)))
+  if (!(bitmaps= (uchar*) alloc_root(&outparam->mem_root, bitmap_size*5)))
     goto err;
   bitmap_init(&outparam->def_read_set,
               (my_bitmap_map*) bitmaps, share->fields, FALSE);
@@ -2416,6 +2419,8 @@ partititon_err:
               (my_bitmap_map*) (bitmaps+bitmap_size*2), share->fields, FALSE);
   bitmap_init(&outparam->tmp_set,
               (my_bitmap_map*) (bitmaps+bitmap_size*3), share->fields, FALSE);
+  bitmap_init(&outparam->eq_join_set,
+              (my_bitmap_map*) (bitmaps+bitmap_size*4), share->fields, FALSE);
   outparam->default_column_bitmaps();
 
   /* The table struct is now initialized;  Open the table */
@@ -2479,7 +2484,7 @@ partititon_err:
                                HA_HAS_OWN_BINLOGGING);
   thd->status_var.opened_tables++;
 
-  thd->lex->context_analysis_only= save_view_prepare_mode;
+  thd->lex->context_analysis_only= save_context_analysis_only;
   DBUG_RETURN (0);
 
  err:
@@ -2492,9 +2497,9 @@ partititon_err:
 #endif
   outparam->file= 0;				// For easier error checking
   outparam->db_stat=0;
-  thd->lex->context_analysis_only= save_view_prepare_mode;
+  thd->lex->context_analysis_only= save_context_analysis_only;
   free_root(&outparam->mem_root, MYF(0));       // Safe to call on bzero'd root
-  my_free((void *) outparam->alias);
+  outparam->alias.free();
   DBUG_RETURN (error);
 }
 
@@ -2518,10 +2523,9 @@ int closefrm(register TABLE *table, bool free_share)
   {
     if (table->s->deleting)
       table->file->extra(HA_EXTRA_PREPARE_FOR_DROP);
-    error=table->file->close();
+    error=table->file->ha_close();
   }
-  my_free((void *) table->alias);
-  table->alias= 0;
+  table->alias.free();
   if (table->expr_arena)
     table->expr_arena->free_items();
   if (table->field)
@@ -2742,13 +2746,17 @@ void open_table_error(TABLE_SHARE *share, int error, int db_errno, int errarg)
 {
   int err_no;
   char buff[FN_REFLEN];
-  myf errortype= ME_ERROR+ME_WAITTANG;
+  myf errortype= ME_ERROR+ME_WAITTANG;          // Write fatal errors to log
   DBUG_ENTER("open_table_error");
 
   switch (error) {
   case 7:
   case 1:
-    if (db_errno == ENOENT)
+    /*
+      Test if the file doesn't exist. We also have to test for EINVAL, as
+      this may happen on Windows when opening a file with an illegal file name
+    */
+    if (db_errno == ENOENT || db_errno == EINVAL)
       my_error(ER_NO_SUCH_TABLE, MYF(0), share->db.str, share->table_name.str);
     else
     {
@@ -3010,7 +3018,7 @@ File create_frm(THD *thd, const char *name, const char *db,
   if (create_info->options & HA_LEX_CREATE_TMP_TABLE)
     create_flags|= O_EXCL | O_NOFOLLOW;
 
-  /* Fix this when we have new .frm files;  Current limit is 4G rows (QQ) */
+  /* Fix this when we have new .frm files;  Current limit is 4G rows (TODO) */
   if (create_info->max_rows > UINT_MAX32)
     create_info->max_rows= UINT_MAX32;
   if (create_info->min_rows > UINT_MAX32)
@@ -3383,7 +3391,7 @@ Table_check_intact::check(TABLE *table, const TABLE_FIELD_DEF *table_def)
   const TABLE_FIELD_TYPE *field_def= table_def->field;
   DBUG_ENTER("table_check_intact");
   DBUG_PRINT("info",("table: %s  expected_count: %d",
-                     table->alias, table_def->count));
+                     table->alias.c_ptr(), table_def->count));
 
   /* Whether the table definition has already been validated. */
   if (table->s->table_field_def_cache == table_def)
@@ -3398,7 +3406,7 @@ Table_check_intact::check(TABLE *table, const TABLE_FIELD_DEF *table_def)
     {
       report_error(ER_COL_COUNT_DOESNT_MATCH_PLEASE_UPDATE,
                    ER(ER_COL_COUNT_DOESNT_MATCH_PLEASE_UPDATE),
-                   table->alias, table_def->count, table->s->fields,
+                   table->alias.c_ptr(), table_def->count, table->s->fields,
                    static_cast<int>(table->s->mysql_version),
                    MYSQL_VERSION_ID);
       DBUG_RETURN(TRUE);
@@ -3406,7 +3414,8 @@ Table_check_intact::check(TABLE *table, const TABLE_FIELD_DEF *table_def)
     else if (MYSQL_VERSION_ID == table->s->mysql_version)
     {
       report_error(ER_COL_COUNT_DOESNT_MATCH_CORRUPTED,
-                   ER(ER_COL_COUNT_DOESNT_MATCH_CORRUPTED), table->alias,
+                   ER(ER_COL_COUNT_DOESNT_MATCH_CORRUPTED),
+                   table->alias.c_ptr(),
                    table_def->count, table->s->fields);
       DBUG_RETURN(TRUE);
     }
@@ -3418,11 +3427,13 @@ Table_check_intact::check(TABLE *table, const TABLE_FIELD_DEF *table_def)
       is backward compatible.
     */
   }
-  char buffer[STRING_BUFFER_USUAL_SIZE];
+  char buffer[1024];
   for (i=0 ; i < table_def->count; i++, field_def++)
   {
     String sql_type(buffer, sizeof(buffer), system_charset_info);
     sql_type.length(0);
+    /* Allocate min 256 characters at once */
+    sql_type.extra_allocation(256);
     if (i < table->s->fields)
     {
       Field *field= table->field[i];
@@ -3437,7 +3448,8 @@ Table_check_intact::check(TABLE *table, const TABLE_FIELD_DEF *table_def)
         */
         report_error(0, "Incorrect definition of table %s.%s: "
                      "expected column '%s' at position %d, found '%s'.",
-                     table->s->db.str, table->alias, field_def->name.str, i,
+                     table->s->db.str, table->alias.c_ptr(),
+                     field_def->name.str, i,
                      field->field_name);
       }
       field->sql_type(sql_type);
@@ -3463,7 +3475,8 @@ Table_check_intact::check(TABLE *table, const TABLE_FIELD_DEF *table_def)
       {
         report_error(0, "Incorrect definition of table %s.%s: "
                      "expected column '%s' at position %d to have type "
-                     "%s, found type %s.", table->s->db.str, table->alias,
+                     "%s, found type %s.", table->s->db.str,
+                     table->alias.c_ptr(),
                      field_def->name.str, i, field_def->type.str,
                      sql_type.c_ptr_safe());
         error= TRUE;
@@ -3473,7 +3486,8 @@ Table_check_intact::check(TABLE *table, const TABLE_FIELD_DEF *table_def)
         report_error(0, "Incorrect definition of table %s.%s: "
                      "expected the type of column '%s' at position %d "
                      "to have character set '%s' but the type has no "
-                     "character set.", table->s->db.str, table->alias,
+                     "character set.", table->s->db.str,
+                     table->alias.c_ptr(),
                      field_def->name.str, i, field_def->cset.str);
         error= TRUE;
       }
@@ -3483,7 +3497,8 @@ Table_check_intact::check(TABLE *table, const TABLE_FIELD_DEF *table_def)
         report_error(0, "Incorrect definition of table %s.%s: "
                      "expected the type of column '%s' at position %d "
                      "to have character set '%s' but found "
-                     "character set '%s'.", table->s->db.str, table->alias,
+                     "character set '%s'.", table->s->db.str,
+                     table->alias.c_ptr(),
                      field_def->name.str, i, field_def->cset.str,
                      field->charset()->csname);
         error= TRUE;
@@ -3494,7 +3509,7 @@ Table_check_intact::check(TABLE *table, const TABLE_FIELD_DEF *table_def)
       report_error(0, "Incorrect definition of table %s.%s: "
                    "expected column '%s' at position %d to have type %s "
                    " but the column is not found.",
-                   table->s->db.str, table->alias,
+                   table->s->db.str, table->alias.c_ptr(),
                    field_def->name.str, i, field_def->type.str);
       error= TRUE;
     }
@@ -3713,12 +3728,8 @@ void TABLE::init(THD *thd, TABLE_LIST *tl)
                                    s->table_name.str,
                                    tl->alias);
   /* Fix alias if table name changes. */
-  if (strcmp(alias, tl->alias))
-  {
-    uint length= (uint) strlen(tl->alias)+1;
-    alias= (char*) my_realloc((char*) alias, length, MYF(MY_WME));
-    memcpy((char*) alias, tl->alias, length);
-  }
+  if (strcmp(alias.c_ptr(), tl->alias))
+    alias.copy(tl->alias, strlen(tl->alias), alias.charset());
 
   tablenr= thd->current_tablenr++;
   used_fields= 0;
@@ -3733,6 +3744,7 @@ void TABLE::init(THD *thd, TABLE_LIST *tl)
   fulltext_searched= 0;
   file->ha_start_of_new_statement();
   reginfo.impossible_range= 0;
+  created= TRUE;
 
   /* Catch wrong handling of the auto_increment_field_not_null. */
   DBUG_ASSERT(!auto_increment_field_not_null);
@@ -3750,6 +3762,14 @@ void TABLE::init(THD *thd, TABLE_LIST *tl)
   /* mark the record[0] uninitialized */
   TRASH(record[0], s->reclength);
 
+  /*
+    Initialize the null marker bits, to ensure that if we are doing a read
+    of only selected columns (like in keyread), all null markers are
+    initialized.
+  */
+  memset(record[0], 255, s->null_bytes); 
+  memset(record[1], 255, s->null_bytes); 
+
   /* Tables may be reused in a sub statement. */
   DBUG_ASSERT(!file->extra(HA_EXTRA_IS_ATTACHED_CHILDREN));
 }
@@ -3835,129 +3855,112 @@ void  TABLE_LIST::calc_md5(char *buffer)
 
 
 /**
-   @brief Set underlying table for table place holder of view.
-
-   @details
-
-   Replace all views that only use one table with the table itself.  This
-   allows us to treat the view as a simple table and even update it (it is a
-   kind of optimization).
+  @brief
+  Create field translation for mergeable derived table/view.
 
-   @note 
+  @param thd  Thread handle
 
-   This optimization is potentially dangerous as it makes views
-   masquerade as base tables: Views don't have the pointer TABLE_LIST::table
-   set to non-@c NULL.
+  @details
+  Create field translation for mergeable derived table/view.
 
-   We may have the case where a view accesses tables not normally accessible
-   in the current Security_context (only in the definer's
-   Security_context). According to the table's GRANT_INFO (TABLE::grant),
-   access is fulfilled, but this is implicitly meant in the definer's security
-   context. Hence we must never look at only a TABLE's GRANT_INFO without
-   looking at the one of the referring TABLE_LIST.
+  @return FALSE on success.
+  @return TRUE if an error occurred.
 */
 
-void TABLE_LIST::set_underlying_merge()
+bool TABLE_LIST::create_field_translation(THD *thd)
 {
-  TABLE_LIST *tbl;
+  Item *item;
+  Field_translator *transl;
+  SELECT_LEX *select= get_single_select();
+  List_iterator_fast<Item> it(select->item_list);
+  uint field_count= 0;
+  Query_arena *arena= thd->stmt_arena, backup;
+  bool res= FALSE;
 
-  if ((tbl= merge_underlying_list))
+  used_items.empty();
+
+  if (field_translation)
   {
-    /* This is a view. Process all tables of view */
-    DBUG_ASSERT(view && effective_algorithm == VIEW_ALGORITHM_MERGE);
-    do
+    /*
+      Update items in the field translation after the view has been prepared.
+      It's needed because some items in the select list, like IN subselects,
+      might be substituted for optimized ones.
+    */
+    if (is_view() && get_unit()->prepared && !field_translation_updated)
     {
-      if (tbl->merge_underlying_list)          // This is a view
+      while ((item= it++))
       {
-        DBUG_ASSERT(tbl->view &&
-                    tbl->effective_algorithm == VIEW_ALGORITHM_MERGE);
-        /*
-          This is the only case where set_ancestor is called on an object
-          that may not be a view (in which case ancestor is 0)
-        */
-        tbl->merge_underlying_list->set_underlying_merge();
+        field_translation[field_count++].item= item;
       }
-    } while ((tbl= tbl->next_local));
-
-    if (!multitable_view)
-    {
-      table= merge_underlying_list->table;
-      schema_table= merge_underlying_list->schema_table;
+      field_translation_updated= TRUE;
     }
+
+    return FALSE;
+  }
+
+  if (arena->is_conventional())
+    arena= 0;                                   // For easier test
+  else
+    thd->set_n_backup_active_arena(arena, &backup);
+
+  /* Create view fields translation table */
+
+  if (!(transl=
+        (Field_translator*)(thd->stmt_arena->
+                            alloc(select->item_list.elements *
+                                  sizeof(Field_translator)))))
+  {
+    res= TRUE;
+    goto exit;
+  }
+
+  while ((item= it++))
+  {
+    transl[field_count].name= item->name;
+    transl[field_count++].item= item;
   }
+  field_translation= transl;
+  field_translation_end= transl + field_count;
+
+exit:
+  if (arena)
+    thd->restore_active_arena(arena, &backup);
+
+  return res;
 }
 
 
-/*
-  setup fields of placeholder of merged VIEW
+/**
+  @brief
+  Set up the underlying select of a mergeable derived table/view.
 
-  SYNOPSIS
-    TABLE_LIST::setup_underlying()
-    thd		    - thread handler
+  @param thd  Thread handle
 
-  DESCRIPTION
-    It is:
-    - preparing translation table for view columns
-    If there are underlying view(s) procedure first will be called for them.
+  @details
+  Create the field translation; move full-text functions to current select.
 
-  RETURN
-    FALSE - OK
-    TRUE  - error
+  @return FALSE on success.
+  @return TRUE if an error occurred.
 */
 
 bool TABLE_LIST::setup_underlying(THD *thd)
 {
   DBUG_ENTER("TABLE_LIST::setup_underlying");
 
-  if (!field_translation && merge_underlying_list)
+  if (!view || (!field_translation && merge_underlying_list))
   {
-    Field_translator *transl;
-    SELECT_LEX *select= &view->select_lex;
-    Item *item;
-    TABLE_LIST *tbl;
-    List_iterator_fast<Item> it(select->item_list);
-    uint field_count= 0;
-
-    if (check_stack_overrun(thd, STACK_MIN_SIZE, (uchar*) &field_count))
-    {
-      DBUG_RETURN(TRUE);
-    }
-
-    for (tbl= merge_underlying_list; tbl; tbl= tbl->next_local)
-    {
-      if (tbl->merge_underlying_list &&
-          tbl->setup_underlying(thd))
-      {
-        DBUG_RETURN(TRUE);
-      }
-    }
-
-    /* Create view fields translation table */
-
-    if (!(transl=
-          (Field_translator*)(thd->stmt_arena->
-                              alloc(select->item_list.elements *
-                                    sizeof(Field_translator)))))
-    {
+    SELECT_LEX *select= get_single_select();
+    
+    if (create_field_translation(thd))
       DBUG_RETURN(TRUE);
-    }
-
-    while ((item= it++))
-    {
-      transl[field_count].name= item->name;
-      transl[field_count++].item= item;
-    }
-    field_translation= transl;
-    field_translation_end= transl + field_count;
-    /* TODO: use hash for big number of fields */
 
     /* full text function moving to current select */
-    if (view->select_lex.ftfunc_list->elements)
+    if (select->ftfunc_list->elements)
     {
       Item_func_match *ifm;
       SELECT_LEX *current_select= thd->lex->current_select;
       List_iterator_fast<Item_func_match>
-        li(*(view->select_lex.ftfunc_list));
+        li(*(select_lex->ftfunc_list));
       while ((ifm= li++))
         current_select->ftfunc_list->push_front(ifm);
     }
@@ -3967,7 +3970,7 @@ bool TABLE_LIST::setup_underlying(THD *thd)
 
 
 /*
-  Prepare where expression of view
+   Prepare where expression of derived table/view
 
   SYNOPSIS
     TABLE_LIST::prep_where()
@@ -3991,7 +3994,8 @@ bool TABLE_LIST::prep_where(THD *thd, Item **conds,
 
   for (TABLE_LIST *tbl= merge_underlying_list; tbl; tbl= tbl->next_local)
   {
-    if (tbl->view && tbl->prep_where(thd, conds, no_where_clause))
+    if (tbl->is_view_or_derived() &&
+        tbl->prep_where(thd, conds, no_where_clause))
     {
       DBUG_RETURN(TRUE);
     }
@@ -3999,6 +4003,8 @@ bool TABLE_LIST::prep_where(THD *thd, Item **conds,
 
   if (where)
   {
+    if (where->fixed)
+      where->update_used_tables();
     if (!where->fixed && where->fix_fields(thd, &where))
     {
       DBUG_RETURN(TRUE);
@@ -4031,7 +4037,13 @@ bool TABLE_LIST::prep_where(THD *thd, Item **conds,
         }
       }
       if (tbl == 0)
+      {
+        if (*conds && !(*conds)->fixed)
+	  (*conds)->fix_fields(thd, conds);
         *conds= and_conds(*conds, where->copy_andor_structure(thd));
+        if (*conds && !(*conds)->fixed)
+          (*conds)->fix_fields(thd, conds);        
+      }
       if (arena)
         thd->restore_active_arena(arena, &backup);
       where_processed= TRUE;
@@ -4070,10 +4082,11 @@ merge_on_conds(THD *thd, TABLE_LIST *table, bool is_cascaded)
   DBUG_PRINT("info", ("alias: %s", table->alias));
   if (table->on_expr)
     cond= table->on_expr->copy_andor_structure(thd);
-  if (!table->nested_join)
+  if (!table->view)
     DBUG_RETURN(cond);
-  List_iterator<TABLE_LIST> li(table->nested_join->join_list);
-  while (TABLE_LIST *tbl= li++)
+  for (TABLE_LIST *tbl= (TABLE_LIST*)table->view->select_lex.table_list.first;
+       tbl;
+       tbl= tbl->next_local)
   {
     if (tbl->view && !is_cascaded)
       continue;
@@ -4113,7 +4126,7 @@ bool TABLE_LIST::prep_check_option(THD *thd, uint8 check_opt_type)
 {
   DBUG_ENTER("TABLE_LIST::prep_check_option");
   bool is_cascaded= check_opt_type == VIEW_CHECK_CASCADED;
-
+  TABLE_LIST *merge_underlying_list= view->select_lex.get_table_list();
   for (TABLE_LIST *tbl= merge_underlying_list; tbl; tbl= tbl->next_local)
   {
     /* see comment of check_opt_type parameter */
@@ -4130,7 +4143,6 @@ bool TABLE_LIST::prep_check_option(THD *thd, uint8 check_opt_type)
 
     if (where)
     {
-      DBUG_ASSERT(where->fixed);
       check_option= where->copy_andor_structure(thd);
     }
     if (is_cascaded)
@@ -4226,10 +4238,14 @@ void TABLE_LIST::hide_view_error(THD *thd)
 TABLE_LIST *TABLE_LIST::find_underlying_table(TABLE *table_to_find)
 {
   /* is this real table and table which we are looking for? */
-  if (table == table_to_find && merge_underlying_list == 0)
+  if (table == table_to_find && view == 0)
     return this;
+  if (!view)
+    return 0;
 
-  for (TABLE_LIST *tbl= merge_underlying_list; tbl; tbl= tbl->next_local)
+  for (TABLE_LIST *tbl= view->select_lex.get_table_list();
+       tbl;
+       tbl= tbl->next_local)
   {
     TABLE_LIST *result;
     if ((result= tbl->find_underlying_table(table_to_find)))
@@ -4311,7 +4327,12 @@ bool TABLE_LIST::check_single_table(TABLE_LIST **table_arg,
                                        table_map map,
                                        TABLE_LIST *view_arg)
 {
-  for (TABLE_LIST *tbl= merge_underlying_list; tbl; tbl= tbl->next_local)
+  if (!select_lex)
+    return FALSE;
+  DBUG_ASSERT(is_merged_derived());
+  for (TABLE_LIST *tbl= get_single_select()->get_table_list();
+       tbl;
+       tbl= tbl->next_local)
   {
     if (tbl->table)
     {
@@ -4353,8 +4374,10 @@ bool TABLE_LIST::set_insert_values(MEM_ROOT *mem_root)
   }
   else
   {
-    DBUG_ASSERT(view && merge_underlying_list);
-    for (TABLE_LIST *tbl= merge_underlying_list; tbl; tbl= tbl->next_local)
+    DBUG_ASSERT(is_view_or_derived() && is_merged_derived());
+    for (TABLE_LIST *tbl= (TABLE_LIST*)view->select_lex.table_list.first;
+         tbl;
+         tbl= tbl->next_local)
       if (tbl->set_insert_values(mem_root))
         return TRUE;
   }
@@ -4380,7 +4403,7 @@ bool TABLE_LIST::set_insert_values(MEM_ROOT *mem_root)
 */
 bool TABLE_LIST::is_leaf_for_name_resolution()
 {
-  return (view || is_natural_join || is_join_columns_complete ||
+  return (is_merged_derived() || is_natural_join || is_join_columns_complete ||
           !nested_join);
 }
 
@@ -4518,7 +4541,11 @@ void TABLE_LIST::register_want_access(ulong want_access)
     if (table)
       table->grant.want_privilege= want_access;
   }
-  for (TABLE_LIST *tbl= merge_underlying_list; tbl; tbl= tbl->next_local)
+  if (!view)
+    return;
+  for (TABLE_LIST *tbl= view->select_lex.get_table_list();
+       tbl;
+       tbl= tbl->next_local)
     tbl->register_want_access(want_access);
 }
 
@@ -4754,14 +4781,23 @@ const char *Natural_join_column::db_name()
                       table_ref->table->s->db.str) ||
               (table_ref->schema_table &&
                is_infoschema_db(table_ref->table->s->db.str,
-                                table_ref->table->s->db.length)));
+                                table_ref->table->s->db.length)) ||
+               table_ref->is_materialized_derived());
   return table_ref->db;
 }
 
 
 GRANT_INFO *Natural_join_column::grant()
 {
-  if (view_field)
+/*  if (view_field)
+    return &(table_ref->grant);
+  return &(table_ref->table->grant);*/
+  /*
+    Have to check algorithm because merged derived also has
+    field_translation.
+  */
+//if (table_ref->effective_algorithm == DTYPE_ALGORITHM_MERGE)
+  if (table_ref->is_merged_derived())
     return &(table_ref->grant);
   return &(table_ref->table->grant);
 }
@@ -4842,7 +4878,17 @@ Item *create_view_field(THD *thd, TABLE_LIST *view, Item **field_ref,
   {
     DBUG_RETURN(field);
   }
-  Item *item= new Item_direct_view_ref(view, field_ref, name);
+  Item *item= new Item_direct_view_ref(&view->view->select_lex.context,
+                                       field_ref, view->alias,
+                                       name, view);
+  /*
+    Force creation of nullable item for the result tmp table for outer joined
+    views/derived tables.
+  */
+  if (view->table && view->table->maybe_null)
+    item->maybe_null= TRUE;
+  /* Save item in case we will need to fall back to materialization. */
+  view->used_items.push_back(item);
   DBUG_RETURN(item);
 }
 
@@ -4896,8 +4942,7 @@ void Field_iterator_table_ref::set_field_iterator()
   /* This is a merge view, so use field_translation. */
   else if (table_ref->field_translation)
   {
-    DBUG_ASSERT(table_ref->view &&
-                table_ref->effective_algorithm == VIEW_ALGORITHM_MERGE);
+    DBUG_ASSERT(table_ref->is_merged_derived());
     field_it= &view_field_it;
     DBUG_PRINT("info", ("field_it for '%s' is Field_iterator_view",
                         table_ref->alias));
@@ -5522,43 +5567,93 @@ void TABLE::mark_virtual_columns_for_write(bool insert_fl)
 
 
 /**
+  @brief
   Allocate space for keys
 
-  @param key_count  number of keys to allocate.
+  @param key_count  number of keys to allocate
 
   @details
-  Allocates space enough to fit 'key_count' keys for this table.
+  The function allocates memory to fit 'key_count' keys for this table.
 
-  @return FALSE space was successfully allocated.
-  @return TRUE an error occur.
+  @return FALSE   space was successfully allocated
+  @return TRUE    an error occurred
 */
 
 bool TABLE::alloc_keys(uint key_count)
 {
-  DBUG_ASSERT(!s->keys);
   key_info= s->key_info= (KEY*) alloc_root(&mem_root, sizeof(KEY)*key_count);
+  s->keys= 0;
   max_keys= key_count;
   return !(key_info);
 }
 
 
+/**
+  Fill one key part descriptor from a field and add its size to the key.
+
+  @param keyinfo        key whose key_length grows by the size of this part
+  @param key_part_info  key part descriptor to fill in
+  @param field          field the key part is built over
+  @param fieldnr        number stored in key_part_info->fieldnr
+*/
+void TABLE::create_key_part_by_field(KEY *keyinfo,
+                                     KEY_PART_INFO *key_part_info,
+                                     Field *field, uint fieldnr)
+{
+  field->flags|= PART_KEY_FLAG;
+  key_part_info->field= field;
+  key_part_info->fieldnr= fieldnr;
+  key_part_info->null_bit= field->null_bit;
+  key_part_info->null_offset= (uint) (field->null_ptr - (uchar*) record[0]);
+  key_part_info->offset= field->offset(record[0]);
+  key_part_info->key_part_flag= 0;
+  key_part_info->length= (uint16) field->pack_length();
+  /*
+    TODO: factor this store length computation out (it is copy/pasted from
+    opt_range.cc and table.cc), or reuse a Field::*_length method.
+  */
+  key_part_info->store_length= key_part_info->length;
+  keyinfo->key_length+= key_part_info->length;
+  if (field->real_maybe_null())
+  {
+    key_part_info->store_length+= HA_KEY_NULL_LENGTH;
+    keyinfo->key_length+= HA_KEY_NULL_LENGTH;
+  }
+  const bool is_blob= field->type() == MYSQL_TYPE_BLOB;
+  if (is_blob || field->real_type() == MYSQL_TYPE_VARCHAR)
+  {
+    key_part_info->store_length+= HA_KEY_BLOB_LENGTH;
+    keyinfo->key_length+= HA_KEY_BLOB_LENGTH; // ???
+    key_part_info->key_part_flag|= is_blob ? HA_BLOB_PART : HA_VAR_LENGTH_PART;
+  }
+  key_part_info->type= (uint8) field->key_type();
+  const ha_base_keytype base_type= (ha_base_keytype) key_part_info->type;
+  const bool is_text= base_type == HA_KEYTYPE_TEXT ||
+                      base_type == HA_KEYTYPE_VARTEXT1 ||
+                      base_type == HA_KEYTYPE_VARTEXT2;
+  key_part_info->key_type= is_text ? 0 : FIELDFLAG_BINARY;
+}
+
+
 /**
-  Add a key to a temporary  table
+  @brief
+  Add one key to a temporary table
 
   @param key            the number of the key
   @param key_parts      number of components of the key
   @param next_field_no  the call-back function that returns the number of
                         the field used as the next component of the key
   @param arg            the argument for the above function
-  @param unique         Is it unique index
+  @param unique         TRUE <=> it is a unique index
 
   @details
-  The function adds a new key to the table that is assumed to be
-  temprary table. The call-back function must at each call must return
-  the number of the field that used as next component of this key
+  The function adds a new key to the table that is assumed to be a temporary
+  table. At each invocation the call-back function must return
+  the number of the field that is used as the next component of this key.
 
   @return FALSE is a success
   @return TRUE if a failure
+
 */
 
 bool TABLE::add_tmp_key(uint key, uint key_parts,
@@ -5592,59 +5687,63 @@ bool TABLE::add_tmp_key(uint key, uint key_parts,
   if (!keyinfo->rec_per_key)
     return TRUE;
   bzero(keyinfo->rec_per_key, sizeof(ulong)*key_parts);
+
   for (i= 0; i < key_parts; i++)
   {
-    reg_field= field + next_field_no(arg);
+    uint fld_idx= next_field_no(arg); 
+    reg_field= field + fld_idx;
     if (key_start)
       (*reg_field)->key_start.set_bit(key);
+    (*reg_field)->part_of_key.set_bit(key);
+    create_key_part_by_field(keyinfo, key_part_info, *reg_field, fld_idx+1);
     key_start= FALSE;
-      (*reg_field)->part_of_key.set_bit(key);
-    (*reg_field)->flags|= PART_KEY_FLAG;
-    key_part_info->null_bit= (*reg_field)->null_bit;
-    key_part_info->null_offset= (uint) ((*reg_field)->null_ptr -
-                                          (uchar*) record[0]);
-    key_part_info->field=    *reg_field;
-    key_part_info->offset=   (*reg_field)->offset(record[0]);
-    key_part_info->length=   (uint16) (*reg_field)->pack_length();
-    keyinfo->key_length+= key_part_info->length;
-    key_part_info->key_part_flag= 0;
-    /* TODO:
-      The below method of computing the key format length of the
-      key part is a copy/paste from opt_range.cc, and table.cc.
-      This should be factored out, e.g. as a method of Field.
-      In addition it is not clear if any of the Field::*_length
-      methods is supposed to compute the same length. If so, it
-      might be reused.
-    */
-    key_part_info->store_length= key_part_info->length;
-
-    if ((*reg_field)->real_maybe_null())
-    {
-      key_part_info->store_length+= HA_KEY_NULL_LENGTH;
-      keyinfo->key_length+= HA_KEY_NULL_LENGTH;
-      if (unique)
-        keyinfo->flags|= HA_NULL_ARE_EQUAL;     // def. that NULL == NULL
-    }
-    if ((*reg_field)->type() == MYSQL_TYPE_BLOB || 
-        (*reg_field)->real_type() == MYSQL_TYPE_VARCHAR)
-    {
-      key_part_info->store_length+= HA_KEY_BLOB_LENGTH;
-      keyinfo->key_length+= HA_KEY_BLOB_LENGTH; // ???
-    }
-
-    key_part_info->type=     (uint8) (*reg_field)->key_type();
-    key_part_info->key_type =
-      ((ha_base_keytype) key_part_info->type == HA_KEYTYPE_TEXT ||
-       (ha_base_keytype) key_part_info->type == HA_KEYTYPE_VARTEXT1 ||
-       (ha_base_keytype) key_part_info->type == HA_KEYTYPE_VARTEXT2) ?
-      0 : FIELDFLAG_BINARY;
     key_part_info++;
   }
+
   set_if_bigger(s->max_key_length, keyinfo->key_length);
   s->keys++;
   return FALSE;
 }
 
+/*
+  @brief
+  Drop all indexes except specified one.
+
+  @param key_to_save  number of the key to keep, or a negative value to
+                      drop every key
+
+  @details
+  Drop all indexes on this table except 'key_to_save'. The saved key is
+  copied into slot #0 of key_info and the key count of the share is set to
+  1. If 'key_to_save' is negative the key count is set to 0. The key_info
+  array itself is not released here. Must not be called once the table has
+  been created ('created' is set).
+*/
+
+void TABLE::use_index(int key_to_save)
+{
+  DBUG_ASSERT(!created && key_to_save < (int)s->keys);
+  const bool keep_key= key_to_save >= 0;
+
+  /* memmove() because source and target coincide when key_to_save == 0. */
+  if (keep_key)
+    memmove(key_info, key_info + key_to_save, sizeof(KEY));
+  s->keys= keep_key ? 1 : 0;
+}
+
+/*
+  Tell whether this table gets its rows only at the execution phase.
+
+  When TRUE the optimizer must not rely on the table contents (no range
+  analysis, no constant table detection).
+*/
+
+bool TABLE::is_filled_at_execution()
+{
+  TABLE_LIST *tl= pos_in_table_list;
+  return test(tl->jtbm_subselect || tl->is_active_sjm());
+}
+
 
 /*
   Cleanup this table for re-execution.
@@ -5678,6 +5777,7 @@ void TABLE_LIST::reinit_before_use(THD *thd)
   mdl_request.ticket= NULL;
 }
 
+
 /*
   Return subselect that contains the FROM list this table is taken from
 
@@ -6020,6 +6120,297 @@ int update_virtual_fields(THD *thd, TABLE *table, bool for_write)
   DBUG_RETURN(0);
 }
 
+/*
+  @brief Reset const_table flag
+
+  @detail
+  Clear the const_table flag of this table. For a merged derived table/view
+  the flag is also cleared, recursively, for every leaf table of the first
+  select of its unit.
+*/
+
+void TABLE_LIST::reset_const_table()
+{
+  table->const_table= 0;
+  if (!is_merged_derived())
+    return;
+
+  /* Named 'first' so that the select_lex member is not shadowed. */
+  SELECT_LEX *first= get_unit()->first_select();
+  List_iterator<TABLE_LIST> leaf_it(first->leaf_tables);
+  while (TABLE_LIST *leaf= leaf_it++)
+    leaf->reset_const_table();
+}
+
+
+/*
+  @brief Run derived tables/view handling phases on underlying select_lex.
+
+  @param lex    LEX for this thread, handed on to the called functions
+  @param phases derived tables/views handling phases to run
+                (set of DT_XXX constants)
+  @details
+  Every select of the unit of this derived table/view is processed first,
+  then the derived table/view itself. A table that has no unit is left
+  untouched and FALSE is returned.
+
+  @return TRUE on error
+  @return FALSE ok
+*/
+
+bool TABLE_LIST::handle_derived(LEX *lex, uint phases)
+{
+  SELECT_LEX_UNIT *own_unit= get_unit();
+  if (!own_unit)
+    return FALSE;
+
+  SELECT_LEX *sel= own_unit->first_select();
+  for (; sel; sel= sel->next_select())
+    if (sel->handle_derived(lex, phases))
+      return TRUE;
+  return mysql_handle_single_derived(lex, this, phases);
+}
+
+
+/**
+  @brief Return unit of this derived table/view
+
+  @return unit of the view if 'view' is set, otherwise 'derived'.
+*/
+
+st_select_lex_unit *TABLE_LIST::get_unit()
+{
+  if (view)
+    return &view->unit;
+  return derived;
+}
+
+
+/**
+  @brief Return the first select of the unit of this derived table/view
+
+  @return first select of the unit
+  @return 0 when this table has no unit (not a derived table/view)
+*/
+
+st_select_lex *TABLE_LIST::get_single_select()
+{
+  if (SELECT_LEX_UNIT *u= get_unit())
+    return u->first_select();
+  return 0;
+}
+
+
+/**
+  @brief
+  Attach a join table list as a nested join to this TABLE_LIST.
+
+  @param join_list join table list to attach
+
+  @details
+  The join list of this table's nested_join is replaced with 'join_list'.
+  Every table of the list then gets this table as its 'embedding' and the
+  nested join list as its 'join_list'. The nested_join member must already
+  be set; it is dereferenced here.
+*/
+
+void TABLE_LIST::wrap_into_nested_join(List<TABLE_LIST> &join_list)
+{
+  List<TABLE_LIST> *nest_list= &nested_join->join_list;
+
+  nest_list->empty();
+  List_iterator_fast<TABLE_LIST> it(join_list);
+  *nest_list= join_list;
+  /* Make every table of the attached list point back at this nest. */
+  while (TABLE_LIST *member= it++)
+  {
+    member->embedding= this;
+    member->join_list= nest_list;
+  }
+}
+
+
+/**
+  @brief
+  Initialize this derived table/view
+
+  @param thd        Thread handle
+  @param init_view  if TRUE and this table is not a view, it is marked as
+                    a derived table
+
+  @details
+  This function makes initial preparations of this derived table/view for
+  further processing:
+    if it's a derived table this function marks it either as mergeable or
+      materializable
+    creates field translation for mergeable derived table/view
+
+  @return TRUE  an error occurred while creating the field translation
+  @return FALSE ok
+*/
+
+bool TABLE_LIST::init_derived(THD *thd, bool init_view)
+{
+  SELECT_LEX *first_select= get_single_select();
+  SELECT_LEX_UNIT *unit= get_unit();
+
+  if (!unit)
+    return FALSE;
+  /*
+    Check whether we can merge this derived table into main select.
+    Depending on the result field translation will or will not
+    be created.
+  */
+  TABLE_LIST *first_table= (TABLE_LIST *) first_select->table_list.first;
+  if (first_select->table_list.elements > 1 ||
+      (first_table && first_table->is_multitable()))
+    set_multitable();
+
+  unit->derived= this;
+  if (init_view && !view)
+  {
+    /* This is all what we can do for a derived table for now. */
+    set_derived();
+  }
+
+  if (!is_view())
+  {
+    /* A subquery might be forced to be materialized due to a side-effect. */
+    if (!is_materialized_derived() && first_select->is_mergeable() &&
+        optimizer_flag(thd, OPTIMIZER_SWITCH_DERIVED_MERGE) &&
+        !(thd->lex->sql_command == SQLCOM_UPDATE_MULTI ||
+          thd->lex->sql_command == SQLCOM_DELETE_MULTI))
+      set_merged_derived();
+    else
+      set_materialized_derived();
+  }
+  /*
+    Derived tables/view are materialized prior to UPDATE, thus we can skip
+    them from table uniqueness check
+  */
+  if (is_materialized_derived())
+  {
+    unit->master_unit()->set_unique_exclude();
+  }
+  /*
+    Create field translation for mergeable derived tables/views.
+    For derived tables field translation can be created only after
+    unit is prepared so all '*' get unrolled.
+    A failure to create the translation is reported to the caller.
+  */
+  if (is_merged_derived() && (is_view() || unit->prepared) &&
+      create_field_translation(thd))
+    return TRUE;
+
+  return FALSE;
+}
+
+
+/**
+  @brief
+  Retrieve number of rows in the table
+
+  @details
+  For a materialized derived table/view whose 'fill_me' flag is not set the
+  row count recorded in the result of the derived unit is stored in the
+  stats.records of the table's handler (raised via set_if_bigger with 2).
+  Otherwise the handler's info() is called with
+  HA_STATUS_VARIABLE | HA_STATUS_NO_LOCK and its return value is passed
+  back to the caller.
+
+  @return 0          ok
+  @return non zero   error
+*/
+
+int TABLE_LIST::fetch_number_of_rows()
+{
+  if (!is_materialized_derived() || fill_me)
+    return table->file->info(HA_STATUS_VARIABLE | HA_STATUS_NO_LOCK);
+
+  /* Use the row count kept in the result of the derived unit. */
+  select_union *res= (select_union*) derived->result;
+  table->file->stats.records= res->records;
+  set_if_bigger(table->file->stats.records, 2);
+  return 0;
+}
+
+/*
+  Procedure of keys generation for result tables of materialized derived
+  tables/views.
+
+  A key is generated for each equi-join pair derived table-another table.
+  Each generated key consists of fields of derived table used in equi-join.
+  Example:
+
+    SELECT * FROM (SELECT * FROM t1 GROUP BY 1) tt JOIN
+                  t1 ON tt.f1=t1.f3 and tt.f2=t1.f4;
+  In this case for the derived table tt one key will be generated. It will
+  consist of two parts f1 and f2.
+  Example:
+
+    SELECT * FROM (SELECT * FROM t1 GROUP BY 1) tt JOIN
+                  t1 ON tt.f1=t1.f3 JOIN
+                  t2 ON tt.f2=t2.f4;
+  In this case for the derived table tt two keys will be generated.
+  One key over f1 field, and another key over f2 field.
+  Currently optimizer may choose to use only one such key, thus the second
+  one will be dropped after range optimizer is finished.
+  See also JOIN::drop_unused_derived_keys function.
+  Example:
+
+    SELECT * FROM (SELECT * FROM t1 GROUP BY 1) tt JOIN
+                  t1 ON tt.f1=a_function(t1.f3);
+  In this case for the derived table tt one key will be generated. It will
+  consist of one field - f1.
+*/
+
+
+
+/*
+  @brief
+  Change references to underlying items of a merged derived table/view
+  for fields in derived table's result table.
+
+  @details
+  Every item in 'used_items' is redirected to an Item_field, created on
+  demand, over the result table column at the same translation position.
+
+  @return FALSE ok
+  @return TRUE  Out of memory
+*/
+bool TABLE_LIST::change_refs_to_fields()
+{
+  List_iterator<Item> li(used_items);
+  Item_direct_ref *ref;
+  Field_iterator_view field_it;
+  THD *thd= table->in_use;
+  DBUG_ASSERT(is_merged_derived());
+
+  if (!used_items.elements)
+    return FALSE;
+
+  materialized_items= (Item**)thd->calloc(sizeof(void*) * table->s->fields);
+  if (!materialized_items)
+    return TRUE;                        /* out of memory */
+
+  while ((ref= (Item_direct_ref*)li++))
+  {
+    uint idx;
+    Item *orig_item= *ref->ref;
+    field_it.set(this);
+    for (idx= 0; !field_it.end_of_fields(); field_it.next(), idx++)
+      if (field_it.item() == orig_item)
+        break;
+    DBUG_ASSERT(!field_it.end_of_fields());
+    if (!materialized_items[idx] &&
+        !(materialized_items[idx]= new Item_field(table->field[idx])))
+      return TRUE;
+    ref->ref= materialized_items + idx;
+  }
+  return FALSE;
+}
+
+
 /*****************************************************************************
 ** Instansiate templates
 *****************************************************************************/
diff --git a/sql/table.h b/sql/table.h
index 79263723744..8dd1067ba49 100644
--- a/sql/table.h
+++ b/sql/table.h
@@ -1,7 +1,7 @@
 #ifndef TABLE_INCLUDED
 #define TABLE_INCLUDED
-
-/* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2010, Oracle and/or its affiliates.
+   Copyright (c) 2009, 2011 Monty Program Ab
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -21,6 +21,7 @@
 #include "sql_list.h"                           /* Sql_alloc */
 #include "mdl.h"
 #include "datadict.h"
+#include "sql_string.h"                         /* String */
 
 #ifndef MYSQL_CLIENT
 
@@ -203,7 +204,8 @@ typedef struct st_order {
   bool   counter_used;                  /* parameter was counter of columns */
   Field  *field;			/* If tmp-table group */
   char	 *buff;				/* If tmp-table group */
-  table_map used, depend_map;
+  table_map used; /* NOTE: the below is only set to 0 but is still used by eq_ref_table */
+  table_map depend_map;
 } ORDER;
 
 /**
@@ -565,7 +567,7 @@ struct TABLE_SHARE
   I_P_List <TABLE, TABLE_share> free_tables;
 
   engine_option_value *option_list;     /* text options for table */
-  void *option_struct;                  /* structure with parsed options */
+  ha_table_option_struct *option_struct; /* structure with parsed options */
 
   /* The following is copied to each TABLE on OPEN */
   Field **field;
@@ -659,7 +661,6 @@ struct TABLE_SHARE
   bool null_field_first;
   bool system;                          /* Set if system table (one record) */
   bool crypted;                         /* If .frm file is crypted */
-  bool db_low_byte_first;		/* Portable row format */
   bool crashed;
   bool is_view;
   bool deleting;                        /* going to delete this table */
@@ -937,7 +938,7 @@ public:
     needed by the query without reading the row.
   */
   key_map covering_keys;
-  key_map quick_keys, merge_keys;
+  key_map quick_keys, merge_keys,intersect_keys;
   /*
     A set of keys that can be used in the query that references this
     table.
@@ -967,10 +968,11 @@ public:
   /* Position in thd->locked_table_list under LOCK TABLES */
   TABLE_LIST *pos_in_locked_tables;
   ORDER		*group;
-  const char	*alias;            	  /* alias or table name */
+  String	alias;            	  /* alias or table name */
   uchar		*null_flags;
   my_bitmap_map	*bitmap_init_value;
   MY_BITMAP     def_read_set, def_write_set, def_vcol_set, tmp_set; 
+  MY_BITMAP     eq_join_set;         /* used to mark equi-joined fields */
   MY_BITMAP     *read_set, *write_set, *vcol_set; /* Active column sets */
   /*
    The ID of the query that opened and is using this table. Has different
@@ -1036,7 +1038,6 @@ public:
   uint          temp_pool_slot;		/* Used by intern temp tables */
   uint		status;                 /* What's in record[0] */
   uint		db_stat;		/* mode of file as in handler.h */
-  uint          max_keys;               /* Size of allocated key_info array. */
   /* number of select if it is derived table */
   uint          derived_select_number;
   int		current_lock;           /* Type of lock on table */
@@ -1072,7 +1073,7 @@ public:
     See TABLE_LIST::process_index_hints().
   */
   bool force_index_group;
-  bool distinct,const_table,no_rows;
+  bool distinct,const_table,no_rows, used_for_duplicate_elimination;
 
   /**
      If set, the optimizer has found that row retrieval should access index 
@@ -1094,9 +1095,10 @@ public:
   */
   bool auto_increment_field_not_null;
   bool insert_or_update;             /* Can be used by the handler */
-  bool alias_name_used;		/* true if table_name is alias */
+  bool alias_name_used;              /* true if table_name is alias */
   bool get_fields_in_item_tree;      /* Signal to fix_field */
   bool m_needs_reopen;
+  bool created;    /* For tmp tables. TRUE <=> tmp table was actually created.*/
 
   REGINFO reginfo;			/* field connections */
   MEM_ROOT mem_root;
@@ -1114,6 +1116,7 @@ public:
   partition_info *part_info;            /* Partition related information */
   bool no_partitions_used; /* If true, all partitions have been pruned away */
 #endif
+  uint max_keys; /* Size of allocated key_info array. */
   MDL_ticket *mdl_ticket;
 
   void init(THD *thd, TABLE_LIST *tl);
@@ -1181,6 +1184,14 @@ public:
   bool add_tmp_key(uint key, uint key_parts,
                    uint (*next_field_no) (uchar *), uchar *arg,
                    bool unique);
+  void create_key_part_by_field(KEY *keyinfo, KEY_PART_INFO *key_part_info,
+                                Field *field, uint fieldnr);
+  void use_index(int key_to_save);
+  /*
+    Set the 'map' and 'tablenr' members of this table from the arguments.
+  */
+  void set_table_map(table_map map_arg, uint tablenr_arg)
+  { tablenr= tablenr_arg; map= map_arg; }
   inline void enable_keyread()
   {
     DBUG_ENTER("enable_keyread");
@@ -1189,6 +1200,12 @@ public:
     file->extra(HA_EXTRA_KEYREAD);
     DBUG_VOID_RETURN;
   }
+  /*
+    Returns TRUE if the table is filled at execution phase (and so, the
+    optimizer must not do anything that depends on the contents of the table,
+    like range analysis or constant table detection)
+  */
+  bool is_filled_at_execution();
   inline void disable_keyread()
   {
     DBUG_ENTER("disable_keyread");
@@ -1308,13 +1325,52 @@ typedef struct st_schema_table
 } ST_SCHEMA_TABLE;
 
 
+/*
+  Flags describing the type of a derived table/view. Each flag is a
+  distinct bit, so the flags can be combined into one value.
+*/
+#define DTYPE_ALGORITHM_UNDEFINED    0
+#define DTYPE_VIEW                   1
+#define DTYPE_TABLE                  2
+#define DTYPE_MERGE                  4
+#define DTYPE_MATERIALIZE            8
+#define DTYPE_MULTITABLE             16
+/* Same value as the former literal 19. */
+#define DTYPE_MASK       (DTYPE_VIEW | DTYPE_TABLE | DTYPE_MULTITABLE)
+
+/*
+  Phases of derived tables/views handling, see sql_derived.cc
+  Every phase is one bit; sets of phases are passed around as bitmaps
+  (e.g. the 'phases' argument of TABLE_LIST::handle_derived).
+*/
+#define DT_INIT             1
+#define DT_PREPARE          2
+#define DT_OPTIMIZE         4
+#define DT_MERGE            8
+#define DT_MERGE_FOR_INSERT 16
+#define DT_CREATE           32
+#define DT_FILL             64
+#define DT_REINIT           128
+#define DT_PHASES           8
+
+/* Phases that are applicable to all derived tables. */
+#define DT_COMMON       (DT_INIT | DT_PREPARE | DT_OPTIMIZE | DT_REINIT)
+/* Phases that are applicable only to materialized derived tables. */
+#define DT_MATERIALIZE  (DT_CREATE | DT_FILL)
+
+/* All phases a merged derived table/view can go through. */
+#define DT_PHASES_MERGE (DT_COMMON | DT_MERGE | DT_MERGE_FOR_INSERT)
+/* All phases a materialized derived table/view can go through. */
+#define DT_PHASES_MATERIALIZE (DT_COMMON | DT_MATERIALIZE)
+
+/* View algorithms expressed through the type flags above. */
+#define VIEW_ALGORITHM_UNDEFINED 0
+#define VIEW_ALGORITHM_MERGE    (DTYPE_VIEW | DTYPE_MERGE)
+#define VIEW_ALGORITHM_TMPTABLE (DTYPE_VIEW | DTYPE_MATERIALIZE)
+
 #define JOIN_TYPE_LEFT	1
 #define JOIN_TYPE_RIGHT	2
 
-#define VIEW_ALGORITHM_UNDEFINED        0
-#define VIEW_ALGORITHM_TMPTABLE         1
-#define VIEW_ALGORITHM_MERGE            2
-
 #define VIEW_SUID_INVOKER               0
 #define VIEW_SUID_DEFINER               1
 #define VIEW_SUID_DEFAULT               2
@@ -1403,7 +1459,7 @@ class Item_in_subselect;
   1) table (TABLE_LIST::view == NULL)
      - base table
        (TABLE_LIST::derived == NULL)
-     - subquery - TABLE_LIST::table is a temp table
+     - FROM-clause subquery - TABLE_LIST::table is a temp table
        (TABLE_LIST::derived != NULL)
      - information schema table
        (TABLE_LIST::schema_table != NULL)
@@ -1413,6 +1469,7 @@ class Item_in_subselect;
            also (TABLE_LIST::field_translation != NULL)
      - tmptable (TABLE_LIST::effective_algorithm == VIEW_ALGORITHM_TMPTABLE)
            also (TABLE_LIST::field_translation == NULL)
+  2.5) TODO: Add derived tables description here
   3) nested table reference (TABLE_LIST::nested_join != NULL)
      - table sequence - e.g. (t1, t2, t3)
        TODO: how to distinguish from a JOIN?
@@ -1422,6 +1479,8 @@ class Item_in_subselect;
        (TABLE_LIST::natural_join != NULL)
        - JOIN ... USING
          (TABLE_LIST::join_using_fields != NULL)
+     - semi-join nest (sj_on_expr!= NULL && sj_subq_pred!=NULL)
+  4) jtbm semi-join (jtbm_subselect != NULL)
 */
 
 struct LEX;
@@ -1475,8 +1534,16 @@ struct TABLE_LIST
   */
   table_map     sj_inner_tables;
   /* Number of IN-compared expressions */
-  uint          sj_in_exprs; 
+  uint          sj_in_exprs;
+  
+  /* If this is a non-jtbm semi-join nest: corresponding subselect predicate */
   Item_in_subselect  *sj_subq_pred;
+
+  /* If this is a jtbm semi-join object: corresponding subselect predicate */
+  Item_in_subselect  *jtbm_subselect;
+  /* TODO: check if this can be joined with tablenr_exec */
+  uint jtbm_table_no;
+
   SJ_MATERIALIZATION_INFO *sj_mat_info;
 
   /*
@@ -1529,6 +1596,8 @@ struct TABLE_LIST
     filling procedure
   */
   select_union  *derived_result;
+  /* Stub used for materialized derived tables. */
+  table_map	map;                    /* ID bit of table (1,2,4,8,16...) */
   /*
     Reference from aux_tables to local list entry of main select of
     multi-delete statement:
@@ -1573,6 +1642,7 @@ struct TABLE_LIST
   Field_translator *field_translation;	/* array of VIEW fields */
   /* pointer to element after last one in translation table above */
   Field_translator *field_translation_end;
+  bool field_translation_updated;
   /*
     List (based on next_local) of underlying tables of this view. I.e. it
     does not include the tables of subqueries used in the view. Is set only
@@ -1587,11 +1657,20 @@ struct TABLE_LIST
   List<TABLE_LIST> *view_tables;
   /* most upper view this table belongs to */
   TABLE_LIST	*belong_to_view;
+  /* A derived table this table belongs to */
+  TABLE_LIST    *belong_to_derived;
   /*
     The view directly referencing this table
     (non-zero only for merged underlying tables of a view).
   */
   TABLE_LIST	*referencing_view;
+
+  table_map view_used_tables;
+  table_map     map_exec;
+  /* TODO: check if this can be joined with jtbm_table_no */
+  uint          tablenr_exec;
+  uint          maybe_null_exec;
+
   /* Ptr to parent MERGE table list item. See top comment in ha_myisammrg.cc */
   TABLE_LIST    *parent_l;
   /*
@@ -1604,13 +1683,7 @@ struct TABLE_LIST
     SQL SECURITY DEFINER)
   */
   Security_context *view_sctx;
-  /*
-    List of all base tables local to a subquery including all view
-    tables. Unlike 'next_local', this in this list views are *not*
-    leaves. Created in setup_tables() -> make_leaves_list().
-  */
   bool allowed_show;
-  TABLE_LIST	*next_leaf;
   Item          *where;                 /* VIEW WHERE clause condition */
   Item          *check_option;          /* WITH CHECK OPTION condition */
   LEX_STRING	select_stmt;		/* text of (CREATE/SELECT) statement */
@@ -1646,7 +1719,7 @@ struct TABLE_LIST
       - VIEW_ALGORITHM_MERGE
       @to do Replace with an enum 
   */
-  uint8         effective_algorithm;
+  uint8         derived_type;
   GRANT_INFO	grant;
   /* data need by some engines in query cache*/
   ulonglong     engine_data;
@@ -1677,7 +1750,6 @@ struct TABLE_LIST
   enum enum_open_type open_type;
   /* TRUE if this merged view contain auto_increment field */
   bool          contain_auto_increment;
-  bool          multitable_view;        /* TRUE iff this is multitable view */
   bool          compact_view_format;    /* Use compact format for SHOW CREATE VIEW */
   /* view where processed */
   bool          where_processed;
@@ -1718,6 +1790,17 @@ struct TABLE_LIST
 
   bool          deleting;               /* going to delete this table */
 
+  /* TRUE <=> derived table should be filled right after optimization. */
+  bool          fill_me;
+  /* TRUE <=> view/DT is merged. */
+  bool          merged;
+  bool          merged_for_insert;
+  /* TRUE <=> don't prepare this derived table/view as it should be merged.*/
+  bool          skip_prepare_derived;
+
+  List<Item>    used_items;
+  Item          **materialized_items;
+
   /* View creation context. */
 
   View_creation_ctx *view_creation_ctx;
@@ -1761,8 +1844,8 @@ struct TABLE_LIST
   MDL_request mdl_request;
 
   void calc_md5(char *buffer);
-  void set_underlying_merge();
   int view_check_option(THD *thd, bool ignore_failure);
+  bool create_field_translation(THD *thd);
   bool setup_underlying(THD *thd);
   void cleanup_items();
   bool placeholder()
@@ -1791,7 +1874,7 @@ struct TABLE_LIST
   inline bool prepare_where(THD *thd, Item **conds,
                             bool no_where_clause)
   {
-    if (effective_algorithm == VIEW_ALGORITHM_MERGE)
+    if (!view || is_merged_derived())
       return prep_where(thd, conds, no_where_clause);
     return FALSE;
   }
@@ -1848,6 +1931,60 @@ struct TABLE_LIST
     m_table_ref_version= table_ref_version_arg;
   }
 
+  /* Set of functions returning/setting state of a derived table/view. */
+  inline bool is_non_derived()
+  {
+    return (!derived_type);
+  }
+  inline bool is_view_or_derived()
+  {
+    return (derived_type);
+  }
+  inline bool is_view()
+  {
+    return (derived_type & DTYPE_VIEW);
+  }
+  inline bool is_derived()
+  {
+    return (derived_type & DTYPE_TABLE);
+  }
+  inline void set_view()
+  {
+    derived_type= DTYPE_VIEW;
+  }
+  inline void set_derived()
+  {
+    derived_type= DTYPE_TABLE;
+  }
+  inline bool is_merged_derived()
+  {
+    return (derived_type & DTYPE_MERGE);
+  }
+  inline void set_merged_derived()
+  {
+    derived_type= ((derived_type & DTYPE_MASK) |
+                    DTYPE_TABLE | DTYPE_MERGE);
+  }
+  inline bool is_materialized_derived()
+  {
+    return (derived_type & DTYPE_MATERIALIZE);
+  }
+  inline void set_materialized_derived()
+  {
+    derived_type= ((derived_type & DTYPE_MASK) |
+                    DTYPE_TABLE | DTYPE_MATERIALIZE);
+  }
+  inline bool is_multitable()
+  {
+    return (derived_type & DTYPE_MULTITABLE);
+  }
+  inline void set_multitable()
+  {
+    derived_type|= DTYPE_MULTITABLE;
+  }
+  void reset_const_table();
+  bool handle_derived(LEX *lex, uint phases);
+
   /**
      @brief True if this TABLE_LIST represents an anonymous derived table,
      i.e.  the result of a subquery.
@@ -1867,6 +2004,14 @@ struct TABLE_LIST
      respectively.
    */
   char *get_table_name() { return view != NULL ? view_name.str : table_name; }
+  bool is_active_sjm();
+  bool is_jtbm() { return test(jtbm_subselect!=NULL); }
+  st_select_lex_unit *get_unit();
+  st_select_lex *get_single_select();
+  void wrap_into_nested_join(List<TABLE_LIST> &join_list);
+  bool init_derived(THD *thd, bool init_view);
+  int fetch_number_of_rows();
+  bool change_refs_to_fields();
 
 private:
   bool prep_check_option(THD *thd, uint8 check_opt_type);
@@ -2044,7 +2189,7 @@ typedef struct st_nested_join
 
      2. All child join nest nodes are fully covered.
    */
-  bool is_fully_covered() const { return join_list.elements == counter; }
+  bool is_fully_covered() const { return n_tables == counter; }
 } NESTED_JOIN;
 
 
diff --git a/sql/tztime.cc b/sql/tztime.cc
index 1a99417fdca..f84c62d98c9 100644
--- a/sql/tztime.cc
+++ b/sql/tztime.cc
@@ -222,7 +222,7 @@ tz_load(const char *name, TIME_ZONE_INFO *sp, MEM_ROOT *storage)
                                          ALIGN_SIZE(sp->typecnt *
                                                     sizeof(TRAN_TYPE_INFO)) +
 #ifdef ABBR_ARE_USED
-                                         ALIGN_SIZE(sp->charcnt) +
+                                         ALIGN_SIZE(sp->charcnt+1) +
 #endif
                                          sp->leapcnt * sizeof(LS_INFO))))
       return 1;
@@ -235,7 +235,7 @@ tz_load(const char *name, TIME_ZONE_INFO *sp, MEM_ROOT *storage)
     tzinfo_buf+= ALIGN_SIZE(sp->typecnt * sizeof(TRAN_TYPE_INFO));
 #ifdef ABBR_ARE_USED
     sp->chars= tzinfo_buf;
-    tzinfo_buf+= ALIGN_SIZE(sp->charcnt);
+    tzinfo_buf+= ALIGN_SIZE(sp->charcnt+1);
 #endif
     sp->lsis= (LS_INFO *)tzinfo_buf;
 
@@ -823,9 +823,11 @@ sec_since_epoch(int year, int mon, int mday, int hour, int min ,int sec)
     TIME_to_gmt_sec()
       t               - pointer to structure for broken down represenatation
       sp              - pointer to struct with time zone description
-      in_dst_time_gap - pointer to bool which is set to true if datetime
-                        value passed doesn't really exist (i.e. falls into
-                        spring time-gap) and is not touched otherwise.
+      error_code      - 0, if the conversion was successful;
+                        ER_WARN_DATA_OUT_OF_RANGE, if t contains datetime value
+                           which is out of TIMESTAMP range;
+                        ER_WARN_INVALID_TIMESTAMP, if t represents value which
+                           doesn't exist (falls into the spring time-gap).
 
   DESCRIPTION
     This is mktime analog for MySQL. It is essentially different
@@ -887,20 +889,23 @@ sec_since_epoch(int year, int mon, int mday, int hour, int min ,int sec)
     Seconds in UTC since Epoch.
     0 in case of error.
 */
+
 static my_time_t
-TIME_to_gmt_sec(const MYSQL_TIME *t, const TIME_ZONE_INFO *sp,
-                my_bool *in_dst_time_gap)
+TIME_to_gmt_sec(const MYSQL_TIME *t, const TIME_ZONE_INFO *sp, uint *error_code)
 {
   my_time_t local_t;
   uint saved_seconds;
   uint i;
   int shift= 0;
-
   DBUG_ENTER("TIME_to_gmt_sec");
 
   if (!validate_timestamp_range(t))
+  {
+    *error_code= ER_WARN_DATA_OUT_OF_RANGE;
     DBUG_RETURN(0);
+  }
 
+  *error_code= 0;
 
   /* We need this for correct leap seconds handling */
   if (t->second < SECS_PER_MIN)
@@ -944,6 +949,7 @@ TIME_to_gmt_sec(const MYSQL_TIME *t, const TIME_ZONE_INFO *sp,
       This means that source time can't be represented as my_time_t due to
       limited my_time_t range.
     */
+    *error_code= ER_WARN_DATA_OUT_OF_RANGE;
     DBUG_RETURN(0);
   }
 
@@ -960,6 +966,7 @@ TIME_to_gmt_sec(const MYSQL_TIME *t, const TIME_ZONE_INFO *sp,
     if (local_t > (my_time_t) (TIMESTAMP_MAX_VALUE - shift * SECS_PER_DAY +
                                sp->revtis[i].rt_offset - saved_seconds))
     {
+      *error_code= ER_WARN_DATA_OUT_OF_RANGE;
       DBUG_RETURN(0);                           /* my_time_t overflow */
     }
     local_t+= shift * SECS_PER_DAY;
@@ -973,7 +980,7 @@ TIME_to_gmt_sec(const MYSQL_TIME *t, const TIME_ZONE_INFO *sp,
       Now we are returning my_time_t value corresponding to the
       beginning of the gap.
     */
-    *in_dst_time_gap= 1;
+    *error_code= ER_WARN_INVALID_TIMESTAMP;
     local_t= sp->revts[i] - sp->revtis[i].rt_offset + saved_seconds;
   }
   else
@@ -981,7 +988,10 @@ TIME_to_gmt_sec(const MYSQL_TIME *t, const TIME_ZONE_INFO *sp,
 
   /* check for TIMESTAMP_MAX_VALUE was already done above */
   if (local_t < TIMESTAMP_MIN_VALUE)
+  {
     local_t= 0;
+    *error_code= ER_WARN_DATA_OUT_OF_RANGE;
+  }
 
   DBUG_RETURN(local_t);
 }
@@ -1015,8 +1025,7 @@ class Time_zone_system : public Time_zone
 {
 public:
   Time_zone_system() {}                       /* Remove gcc warning */
-  virtual my_time_t TIME_to_gmt_sec(const MYSQL_TIME *t,
-                                    my_bool *in_dst_time_gap) const;
+  virtual my_time_t TIME_to_gmt_sec(const MYSQL_TIME *t, uint *error_code) const;
   virtual void gmt_sec_to_TIME(MYSQL_TIME *tmp, my_time_t t) const;
   virtual const String * get_name() const;
 };
@@ -1030,9 +1039,11 @@ public:
     TIME_to_gmt_sec()
       t               - pointer to MYSQL_TIME structure with local time in
                         broken-down representation.
-      in_dst_time_gap - pointer to bool which is set to true if datetime
-                        value passed doesn't really exist (i.e. falls into
-                        spring time-gap) and is not touched otherwise.
+      error_code      - 0, if the conversion was successful;
+                        ER_WARN_DATA_OUT_OF_RANGE, if t contains datetime value
+                           which is out of TIMESTAMP range;
+                        ER_WARN_INVALID_TIMESTAMP, if t represents value which
+                           doesn't exist (falls into the spring time-gap).
 
   DESCRIPTION
     This method uses system function (localtime_r()) for conversion
@@ -1048,10 +1059,10 @@ public:
     Corresponding my_time_t value or 0 in case of error
 */
 my_time_t
-Time_zone_system::TIME_to_gmt_sec(const MYSQL_TIME *t, my_bool *in_dst_time_gap) const
+Time_zone_system::TIME_to_gmt_sec(const MYSQL_TIME *t, uint *error_code) const
 {
   long not_used;
-  return my_system_gmt_sec(t, &not_used, in_dst_time_gap);
+  return my_system_gmt_sec(t, &not_used, error_code);
 }
 
 
@@ -1111,7 +1122,7 @@ class Time_zone_utc : public Time_zone
 public:
   Time_zone_utc() {}                          /* Remove gcc warning */
   virtual my_time_t TIME_to_gmt_sec(const MYSQL_TIME *t,
-                                    my_bool *in_dst_time_gap) const;
+                                    uint *error_code) const;
   virtual void gmt_sec_to_TIME(MYSQL_TIME *tmp, my_time_t t) const;
   virtual const String * get_name() const;
 };
@@ -1120,14 +1131,6 @@ public:
 /*
   Convert UTC time from MYSQL_TIME representation to its my_time_t representation.
 
-  SYNOPSIS
-    TIME_to_gmt_sec()
-      t               - pointer to MYSQL_TIME structure with local time
-                        in broken-down representation.
-      in_dst_time_gap - pointer to bool which is set to true if datetime
-                        value passed doesn't really exist (i.e. falls into
-                        spring time-gap) and is not touched otherwise.
-
   DESCRIPTION
     Since Time_zone_utc is used only internally for my_time_t -> TIME
     conversions, this function of Time_zone interface is not implemented for
@@ -1137,10 +1140,11 @@ public:
     0
 */
 my_time_t
-Time_zone_utc::TIME_to_gmt_sec(const MYSQL_TIME *t, my_bool *in_dst_time_gap) const
+Time_zone_utc::TIME_to_gmt_sec(const MYSQL_TIME *t, uint *error_code) const
 {
   /* Should be never called */
   DBUG_ASSERT(0);
+  *error_code= ER_WARN_DATA_OUT_OF_RANGE;
   return 0;
 }
 
@@ -1200,8 +1204,7 @@ class Time_zone_db : public Time_zone
 {
 public:
   Time_zone_db(TIME_ZONE_INFO *tz_info_arg, const String * tz_name_arg);
-  virtual my_time_t TIME_to_gmt_sec(const MYSQL_TIME *t,
-                                    my_bool *in_dst_time_gap) const;
+  virtual my_time_t TIME_to_gmt_sec(const MYSQL_TIME *t, uint *error_code) const;
   virtual void gmt_sec_to_TIME(MYSQL_TIME *tmp, my_time_t t) const;
   virtual const String * get_name() const;
 private:
@@ -1238,9 +1241,11 @@ Time_zone_db::Time_zone_db(TIME_ZONE_INFO *tz_info_arg,
     TIME_to_gmt_sec()
       t               - pointer to MYSQL_TIME structure with local time
                         in broken-down representation.
-      in_dst_time_gap - pointer to bool which is set to true if datetime
-                        value passed doesn't really exist (i.e. falls into
-                        spring time-gap) and is not touched otherwise.
+      error_code      - 0, if the conversion was successful;
+                        ER_WARN_DATA_OUT_OF_RANGE, if t contains datetime value
+                           which is out of TIMESTAMP range;
+                        ER_WARN_INVALID_TIMESTAMP, if t represents value which
+                           doesn't exists (falls into the spring time-gap).
 
   DESCRIPTION
     Please see ::TIME_to_gmt_sec for function description and
@@ -1250,9 +1255,9 @@ Time_zone_db::Time_zone_db(TIME_ZONE_INFO *tz_info_arg,
     Corresponding my_time_t value or 0 in case of error
 */
 my_time_t
-Time_zone_db::TIME_to_gmt_sec(const MYSQL_TIME *t, my_bool *in_dst_time_gap) const
+Time_zone_db::TIME_to_gmt_sec(const MYSQL_TIME *t, uint *error_code) const
 {
-  return ::TIME_to_gmt_sec(t, tz_info, in_dst_time_gap);
+  return ::TIME_to_gmt_sec(t, tz_info, error_code);
 }
 
 
@@ -1298,7 +1303,7 @@ class Time_zone_offset : public Time_zone
 public:
   Time_zone_offset(long tz_offset_arg);
   virtual my_time_t TIME_to_gmt_sec(const MYSQL_TIME *t,
-                                    my_bool *in_dst_time_gap) const;
+                                    uint *error_code) const;
   virtual void   gmt_sec_to_TIME(MYSQL_TIME *tmp, my_time_t t) const;
   virtual const String * get_name() const;
   /*
@@ -1340,17 +1345,18 @@ Time_zone_offset::Time_zone_offset(long tz_offset_arg):
     TIME_to_gmt_sec()
       t               - pointer to MYSQL_TIME structure with local time
                         in broken-down representation.
-      in_dst_time_gap - pointer to bool which should be set to true if
-                        datetime  value passed doesn't really exist
-                        (i.e. falls into spring time-gap) and is not
-                        touched otherwise.
-                        It is not really used in this class.
+      error_code      - 0, if the conversion was successful;
+                        ER_WARN_DATA_OUT_OF_RANGE, if t contains datetime value
+                           which is out of TIMESTAMP range;
+                        ER_WARN_INVALID_TIMESTAMP, if t represents value which
+                           doesn't exist (falls into the spring time-gap).
 
   RETURN VALUE
-    Corresponding my_time_t value or 0 in case of error
+    Corresponding my_time_t value or 0 in case of error.
 */
+
 my_time_t
-Time_zone_offset::TIME_to_gmt_sec(const MYSQL_TIME *t, my_bool *in_dst_time_gap) const
+Time_zone_offset::TIME_to_gmt_sec(const MYSQL_TIME *t, uint *error_code) const
 {
   my_time_t local_t;
   int shift= 0;
@@ -1360,7 +1366,11 @@ Time_zone_offset::TIME_to_gmt_sec(const MYSQL_TIME *t, my_bool *in_dst_time_gap)
     us to make all validation checks here.
   */
   if (!validate_timestamp_range(t))
+  {
+    *error_code= ER_WARN_DATA_OUT_OF_RANGE;
     return 0;
+  }
+  *error_code= 0;
 
   /*
     Do a temporary shift of the boundary dates to avoid
@@ -1384,6 +1394,7 @@ Time_zone_offset::TIME_to_gmt_sec(const MYSQL_TIME *t, my_bool *in_dst_time_gap)
     return local_t;
 
   /* range error*/
+  *error_code= ER_WARN_DATA_OUT_OF_RANGE;
   return 0;
 }
 
@@ -2781,7 +2792,7 @@ main(int argc, char **argv)
             for (time_tmp.second=0; time_tmp.second<60; time_tmp.second+=25)
             {
               long not_used;
-              my_bool not_used_2;
+              uint not_used_2;
               t= (time_t)my_system_gmt_sec(&time_tmp, &not_used, &not_used_2);
               t1= (time_t)TIME_to_gmt_sec(&time_tmp, &tz_info, &not_used_2);
               if (t != t1)
diff --git a/sql/tztime.h b/sql/tztime.h
index f3fea485152..c7ee0ff36cc 100644
--- a/sql/tztime.h
+++ b/sql/tztime.h
@@ -45,11 +45,11 @@ public:
   /**
     Converts local time in broken down MYSQL_TIME representation to 
     my_time_t (UTC seconds since Epoch) represenation.
-    Returns 0 in case of error. Sets in_dst_time_gap to true if date provided
-    falls into spring time-gap (or lefts it untouched otherwise).
+    Returns 0 in case of error. May set error_code to ER_WARN_DATA_OUT_OF_RANGE
+    or ER_WARN_INVALID_TIMESTAMP (see TIME_to_timestamp()).
   */
   virtual my_time_t TIME_to_gmt_sec(const MYSQL_TIME *t, 
-                                    my_bool *in_dst_time_gap) const = 0;
+                                    uint *error_code) const = 0;
   /**
     Converts time in my_time_t representation to local time in
     broken down MYSQL_TIME representation.
diff --git a/sql/uniques.cc b/sql/uniques.cc
index f5ab50c7c90..21aa66ec64d 100644
--- a/sql/uniques.cc
+++ b/sql/uniques.cc
@@ -48,6 +48,12 @@ int unique_write_to_file(uchar* key, element_count count, Unique *unique)
   return my_b_write(&unique->file, key, unique->size) ? 1 : 0;
 }
 
+int unique_write_to_file_with_count(uchar* key, element_count count, Unique *unique)
+{
+  return my_b_write(&unique->file, key, unique->size) ||
+         my_b_write(&unique->file, &count, sizeof(element_count)) ? 1 : 0;
+}
+
 int unique_write_to_ptrs(uchar* key, element_count count, Unique *unique)
 {
   memcpy(unique->record_pointers, key, unique->size);
@@ -55,10 +61,28 @@ int unique_write_to_ptrs(uchar* key, element_count count, Unique *unique)
   return 0;
 }
 
+int unique_intersect_write_to_ptrs(uchar* key, element_count count, Unique *unique)
+{
+  if (count >= unique->min_dupl_count)
+  {
+    memcpy(unique->record_pointers, key, unique->size);
+    unique->record_pointers+=unique->size;
+  }
+  else
+    unique->filtered_out_elems++;
+  return 0;
+}
+
+
 Unique::Unique(qsort_cmp2 comp_func, void * comp_func_fixed_arg,
-	       uint size_arg, ulonglong max_in_memory_size_arg)
+	       uint size_arg, ulonglong max_in_memory_size_arg,
+               uint min_dupl_count_arg)
   :max_in_memory_size(max_in_memory_size_arg), size(size_arg), elements(0)
 {
+  min_dupl_count= min_dupl_count_arg;
+  full_size= size;
+  if (min_dupl_count_arg)
+    full_size+= sizeof(element_count);
   my_b_clear(&file);
   init_tree(&tree, (ulong) (max_in_memory_size / 16), 0, size, comp_func, 0,
             NULL, comp_func_fixed_arg);
@@ -126,7 +150,8 @@ inline double log2_n_fact(double x)
 */
 
 static double get_merge_buffers_cost(uint *buff_elems, uint elem_size,
-                                     uint *first, uint *last)
+                                     uint *first, uint *last,
+                                     uint compare_factor)
 {
   uint total_buf_elems= 0;
   for (uint *pbuf= first; pbuf <= last; pbuf++)
@@ -137,7 +162,7 @@ static double get_merge_buffers_cost(uint *buff_elems, uint elem_size,
 
   /* Using log2(n)=log(n)/log(2) formula */
   return 2*((double)total_buf_elems*elem_size) / IO_SIZE +
-     total_buf_elems*log((double) n_buffers) / (TIME_FOR_COMPARE_ROWID * M_LN2);
+     total_buf_elems*log((double) n_buffers) / (compare_factor * M_LN2);
 }
 
 
@@ -170,7 +195,8 @@ static double get_merge_buffers_cost(uint *buff_elems, uint elem_size,
 
 static double get_merge_many_buffs_cost(uint *buffer,
                                         uint maxbuffer, uint max_n_elems,
-                                        uint last_n_elems, int elem_size)
+                                        uint last_n_elems, int elem_size,
+                                        uint compare_factor)
 {
   register int i;
   double total_cost= 0.0;
@@ -197,19 +223,22 @@ static double get_merge_many_buffs_cost(uint *buffer,
       {
         total_cost+=get_merge_buffers_cost(buff_elems, elem_size,
                                            buff_elems + i,
-                                           buff_elems + i + MERGEBUFF-1);
+                                           buff_elems + i + MERGEBUFF-1,
+                                           compare_factor);
 	lastbuff++;
       }
       total_cost+=get_merge_buffers_cost(buff_elems, elem_size,
                                          buff_elems + i,
-                                         buff_elems + maxbuffer);
+                                         buff_elems + maxbuffer,
+                                         compare_factor);
       maxbuffer= lastbuff;
     }
   }
 
   /* Simulate final merge_buff call. */
   total_cost += get_merge_buffers_cost(buff_elems, elem_size,
-                                       buff_elems, buff_elems + maxbuffer);
+                                       buff_elems, buff_elems + maxbuffer,
+                                       compare_factor);
   return total_cost;
 }
 
@@ -224,7 +253,11 @@ static double get_merge_many_buffs_cost(uint *buffer,
                 to get # bytes needed.
       nkeys     #of elements in Unique
       key_size  size of each elements in bytes
-      max_in_memory_size amount of memory Unique will be allowed to use
+      max_in_memory_size   amount of memory Unique will be allowed to use
+      compare_factor   used to calculate cost of one comparison
+      intersect_fl  if set, merge cost counts an element_count stored per key
+      in_memory  OUT set to TRUE if the elements fit into one in-memory tree
+                     (no full tree is flushed to disk); may be NULL
 
   RETURN
     Cost in disk seeks.
@@ -261,15 +294,17 @@ static double get_merge_many_buffs_cost(uint *buffer,
       these will be random seeks.
 */
 
-double Unique::get_use_cost(uint *buffer, uint nkeys, uint key_size,
-                            ulonglong max_in_memory_size)
+double Unique::get_use_cost(uint *buffer, size_t nkeys, uint key_size,
+                            ulonglong max_in_memory_size,
+                            uint compare_factor,
+                            bool intersect_fl, bool *in_memory)
 {
-  ulong max_elements_in_tree;
-  ulong last_tree_elems;
+  size_t max_elements_in_tree;
+  size_t last_tree_elems;
   int   n_full_trees; /* number of trees in unique - 1 */
   double result;
 
-  max_elements_in_tree= ((ulong) max_in_memory_size /
+  max_elements_in_tree= ((size_t) max_in_memory_size /
                          ALIGN_SIZE(sizeof(TREE_ELEMENT)+key_size));
 
   n_full_trees=    nkeys / max_elements_in_tree;
@@ -279,11 +314,15 @@ double Unique::get_use_cost(uint *buffer, uint nkeys, uint key_size,
   result= 2*log2_n_fact(last_tree_elems + 1.0);
   if (n_full_trees)
     result+= n_full_trees * log2_n_fact(max_elements_in_tree + 1.0);
-  result /= TIME_FOR_COMPARE_ROWID;
+  result /= compare_factor;
 
-  DBUG_PRINT("info",("unique trees sizes: %u=%u*%lu + %lu", nkeys,
-                     n_full_trees, n_full_trees?max_elements_in_tree:0,
-                     last_tree_elems));
+  DBUG_PRINT("info",("unique trees sizes: %u=%u*%u + %u", (uint)nkeys,
+                     (uint)n_full_trees, 
+                     (uint)(n_full_trees?max_elements_in_tree:0),
+                     (uint)last_tree_elems));
+
+  if (in_memory)
+    *in_memory= !n_full_trees;
 
   if (!n_full_trees)
     return result;
@@ -298,12 +337,12 @@ double Unique::get_use_cost(uint *buffer, uint nkeys, uint key_size,
   result += DISK_SEEK_BASE_COST * ceil(((double) key_size)*last_tree_elems / IO_SIZE);
 
   /* Cost of merge */
+  if (intersect_fl)
+    key_size+= sizeof(element_count);
   double merge_cost= get_merge_many_buffs_cost(buffer, n_full_trees,
                                                max_elements_in_tree,
-                                               last_tree_elems, key_size);
-  if (merge_cost < 0.0)
-    return merge_cost;
-
+                                               last_tree_elems, key_size,
+                                               compare_factor);
   result += merge_cost;
   /*
     Add cost of reading the resulting sequence, assuming there were no
@@ -330,7 +369,10 @@ bool Unique::flush()
   file_ptr.count=tree.elements_in_tree;
   file_ptr.file_pos=my_b_tell(&file);
 
-  if (tree_walk(&tree, (tree_walk_action) unique_write_to_file,
+  tree_walk_action action= min_dupl_count ?
+		           (tree_walk_action) unique_write_to_file_with_count :
+		           (tree_walk_action) unique_write_to_file;
+  if (tree_walk(&tree, action,
 		(void*) this, left_root_right) ||
       insert_dynamic(&file_ptrs, (uchar*) &file_ptr))
     return 1;
@@ -360,6 +402,7 @@ Unique::reset()
     reinit_io_cache(&file, WRITE_CACHE, 0L, 0, 1);
   }
   elements= 0;
+  tree.flag= 0;
 }
 
 /*
@@ -579,15 +622,19 @@ bool Unique::get(TABLE *table)
 {
   SORTPARAM sort_param;
   table->sort.found_records=elements+tree.elements_in_tree;
-
   if (my_b_tell(&file) == 0)
   {
     /* Whole tree is in memory;  Don't use disk if you don't need to */
     if ((record_pointers=table->sort.record_pointers= (uchar*)
 	 my_malloc(size * tree.elements_in_tree, MYF(0))))
     {
-      (void) tree_walk(&tree, (tree_walk_action) unique_write_to_ptrs,
+      tree_walk_action action= min_dupl_count ?
+		         (tree_walk_action) unique_intersect_write_to_ptrs :
+		         (tree_walk_action) unique_write_to_ptrs;
+      filtered_out_elems= 0;
+      (void) tree_walk(&tree, action,
 		       this, left_root_right);
+      table->sort.found_records-= filtered_out_elems;
       return 0;
     }
   }
@@ -617,7 +664,9 @@ bool Unique::get(TABLE *table)
   sort_param.max_rows= elements;
   sort_param.sort_form=table;
   sort_param.rec_length= sort_param.sort_length= sort_param.ref_length=
-    size;
+   full_size;
+  sort_param.min_dupl_count= min_dupl_count;
+  sort_param.res_length= 0;
   sort_param.keys= (uint) (max_in_memory_size / sort_param.sort_length);
   sort_param.not_killable=1;
 
@@ -638,8 +687,9 @@ bool Unique::get(TABLE *table)
   if (flush_io_cache(&file) ||
       reinit_io_cache(&file,READ_CACHE,0L,0,0))
     goto err;
-  if (merge_buffers(&sort_param, &file, outfile, sort_buffer, file_ptr,
-		    file_ptr, file_ptr+maxbuffer,0))
+  sort_param.res_length= sort_param.rec_length-
+                         (min_dupl_count ? sizeof(min_dupl_count) : 0);
+  if (merge_index(&sort_param, sort_buffer, file_ptr, maxbuffer, &file, outfile))
     goto err;
   error=0;
 err:
@@ -654,3 +704,5 @@ err:
   outfile->end_of_file=save_pos;
   return error;
 }
+
+
diff --git a/sql/unireg.cc b/sql/unireg.cc
index 2d75d1a6356..ef9729bcda4 100644
--- a/sql/unireg.cc
+++ b/sql/unireg.cc
@@ -1107,7 +1107,6 @@ static bool make_empty_rec(THD *thd, File file,enum legacy_db_type table_type,
   }
 
   table.in_use= thd;
-  table.s->db_low_byte_first= handler->low_byte_first();
   table.s->blob_ptr_size= portable_sizeof_char_ptr;
 
   null_count=0;
diff --git a/sql/unireg.h b/sql/unireg.h
index bafd9c96e47..f1066dbccb8 100644
--- a/sql/unireg.h
+++ b/sql/unireg.h
@@ -55,7 +55,6 @@ typedef struct st_ha_create_information HA_CREATE_INFO;
                        ER_ERROR_FIRST])
 #define ER_THD_OR_DEFAULT(thd,X) ((thd) ? ER_THD(thd, X) : ER_DEFAULT(X))
 
-
 #define ME_INFO (ME_HOLDTANG+ME_OLDWIN+ME_NOREFRESH)
 #define ME_ERROR (ME_BELL+ME_OLDWIN+ME_NOREFRESH)
 #define MYF_RW MYF(MY_WME+MY_NABP)		/* Vid my_read & my_write */
diff --git a/sql/winservice.c b/sql/winservice.c
new file mode 100644
index 00000000000..562f047fa79
--- /dev/null
+++ b/sql/winservice.c
@@ -0,0 +1,247 @@
+/*
+  Get Properties of an existing mysqld Windows service 
+*/
+
+#include <windows.h>
+#include <winsvc.h>
+#include "winservice.h"
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+
+/*
+  Get version from an executable file; outputs stay 0 if it is unavailable.
+*/
+void get_file_version(const char *path, int *major, int *minor, int *patch)
+{
+  DWORD version_handle, size;
+  char *ver= NULL;
+  VS_FIXEDFILEINFO info;
+  UINT len;
+  void *p;
+  *major= *minor= *patch= 0;
+
+  size= GetFileVersionInfoSize(path, &version_handle);
+  if (size == 0)
+    return;
+  ver= (char *)malloc(size);
+  /* malloc may fail; never hand a NULL buffer to GetFileVersionInfo. */
+  if(!ver || !GetFileVersionInfo(path, version_handle, size, ver))
+    goto end;
+  /* "\\" asks for the root block; require a complete VS_FIXEDFILEINFO. */
+  if(!VerQueryValue(ver,"\\",&p,&len) || len < sizeof(VS_FIXEDFILEINFO))
+    goto end;
+  memcpy(&info,p ,sizeof(VS_FIXEDFILEINFO));
+
+  *major= (info.dwFileVersionMS & 0xFFFF0000) >> 16;
+  *minor= (info.dwFileVersionMS & 0x0000FFFF);
+  *patch= (info.dwFileVersionLS & 0xFFFF0000) >> 16;
+end:
+  free(ver);
+}
+
+/*
+  Strip a leading '"' and anything from the next '"' on, then make path full.
+*/
+void normalize_path(char *path, size_t size)
+{
+  char full[MAX_PATH];
+  char *closing_quote= NULL;
+  int quoted= (path[0] == '"');
+  strcpy_s(full, MAX_PATH, quoted ? path + 1 : path);
+  if (quoted)
+    closing_quote= strchr(full, '"');
+  if (closing_quote)
+    *closing_quote= 0;
+  GetFullPathName(full, MAX_PATH, full, NULL);
+  strcpy_s(path, size, full);
+}
+
+/*
+  Retrieve some properties from windows mysqld service binary path.
+  We're interested in ini file location and datadir, and also in version of
+  the data. We tolerate missing mysqld.exe.
+
+  bin_path is the service command line; props receives mysqld_exe, inifile,
+  datadir and the version. Returns 0 on success and 1 on failure: not a
+  mysqld service, unusable executable/ini path, or no existing datadir.
+
+  Note that this function carefully avoids using mysql libraries (e.g dbug),
+  since it is used in unusual environments (windows installer, MFC), where we
+  do not have much control over how threads are created and destroyed, so we
+  cannot assume MySQL thread initialization here.
+*/
+int get_mysql_service_properties(const wchar_t *bin_path,
+  mysqld_service_properties *props)
+{
+  int numargs= 0, retval= 1;
+  wchar_t mysqld_path[MAX_PATH + 4];
+  wchar_t *file_part= NULL;
+  wchar_t **args= NULL;
+  DWORD path_len;
+  BOOL have_inifile;
+
+  props->datadir[0]= 0;
+  props->inifile[0]= 0;
+  props->mysqld_exe[0]= 0;
+  props->version_major= 0;
+  props->version_minor= 0;
+  props->version_patch= 0;
+
+  args= CommandLineToArgvW(bin_path, &numargs);
+  if(!args)
+    goto end;                       /* command line could not be parsed */
+  if(numargs == 2)
+  {
+    /*
+      There are rare cases where service config does not have
+      --defaults-file in the binary path. These services were registered with
+      plain mysqld --install, the data directory is next to "bin" in this case.
+      Service name (second parameter) must be MySQL.
+    */
+    if(wcscmp(args[1], L"MySQL") != 0)
+      goto end;
+    have_inifile= FALSE;
+  }
+  else if(numargs == 3)
+    have_inifile= TRUE;
+  else
+    goto end;
+
+  if(have_inifile && wcsncmp(args[1], L"--defaults-file=", 16) != 0)
+    goto end;
+
+  /* Fails on error/overlong path; file_part is NULL for a directory. */
+  path_len= GetFullPathNameW(args[0], MAX_PATH, mysqld_path, &file_part);
+  if(path_len == 0 || path_len >= MAX_PATH || file_part == NULL)
+    goto end;
+
+  if(wcsstr(mysqld_path, L".exe") == NULL)
+    wcscat(mysqld_path, L".exe");
+
+  if(wcsicmp(file_part, L"mysqld.exe") != 0 &&
+    wcsicmp(file_part, L"mysqld-nt.exe") != 0)
+    goto end;                       /* The service executable is not mysqld. */
+
+  /* >= MAX_PATH: conversion failed or the result is not NUL-terminated. */
+  if(wcstombs(props->mysqld_exe, mysqld_path, MAX_PATH) >= MAX_PATH)
+    goto end;
+  /* If mysqld.exe exists, try to get its version from executable */
+  if (GetFileAttributes(props->mysqld_exe) != INVALID_FILE_ATTRIBUTES)
+  {
+    get_file_version(props->mysqld_exe, &props->version_major,
+      &props->version_minor, &props->version_patch);
+  }
+
+  if (have_inifile)
+  {
+    /* We have --defaults-file in service definition. */
+    if(wcstombs(props->inifile, args[1]+16, MAX_PATH) >= MAX_PATH)
+      goto end;
+    normalize_path(props->inifile, MAX_PATH);
+    if (GetFileAttributes(props->inifile) != INVALID_FILE_ATTRIBUTES)
+    {
+      GetPrivateProfileString("mysqld", "datadir", NULL, props->datadir, MAX_PATH,
+        props->inifile);
+    }
+    else
+    {
+      /*
+        Service will start even with invalid .ini file, using lookup for
+        datadir relative to mysqld.exe. This is equivalent to the case no ini
+        file used.
+      */
+      props->inifile[0]= 0;
+      have_inifile= FALSE;
+    }
+  }
+
+  if(!have_inifile)
+  {
+    /*
+      Hard, although a rare case, we're guessing datadir and defaults-file.
+      On Windows, defaults-file is traditionally install-root\my.ini
+      and datadir is install-root\data
+    */
+    char install_root[MAX_PATH];
+    int i;
+    char *p;
+
+    /*
+      Get the install root (parent of the bin directory where mysqld.exe
+      is located).
+    */
+    strcpy_s(install_root, MAX_PATH, props->mysqld_exe);
+    for (i=0; i< 2; i++)
+    {
+      p= strrchr(install_root, '\\');
+      if(!p)
+        goto end;
+      *p= 0;
+    }
+
+    /* Look for my.ini, my.cnf in the install root */
+    sprintf_s(props->inifile, MAX_PATH, "%s\\my.ini", install_root);
+    if (GetFileAttributes(props->inifile) == INVALID_FILE_ATTRIBUTES)
+      sprintf_s(props->inifile, MAX_PATH, "%s\\my.cnf", install_root);
+    if (GetFileAttributes(props->inifile) != INVALID_FILE_ATTRIBUTES)
+    {
+      /* Ini file found, get datadir from there */
+      GetPrivateProfileString("mysqld", "datadir", NULL, props->datadir,
+        MAX_PATH, props->inifile);
+    }
+    else
+      props->inifile[0]= 0;         /* No ini file */
+
+    /* Try datadir in install directory.*/
+    if (props->datadir[0] == 0)
+    {
+      sprintf_s(props->datadir, MAX_PATH, "%s\\data", install_root);
+    }
+  }
+
+  if (props->datadir[0])
+  {
+    normalize_path(props->datadir, MAX_PATH);
+    /* Check if datadir really exists */
+    if (GetFileAttributes(props->datadir) == INVALID_FILE_ATTRIBUTES)
+      goto end;
+  }
+  else
+  {
+    /* There is no datadir in ini file,  bail out.*/
+    goto end;
+  }
+
+  /*
+    If version could not be determined so far, try mysql_upgrade_info in
+    database directory.
+  */
+  if(props->version_major == 0)
+  {
+    char buf[MAX_PATH];
+    FILE *mysql_upgrade_info;
+
+    sprintf_s(buf, MAX_PATH, "%s\\mysql_upgrade_info", props->datadir);
+    mysql_upgrade_info= fopen(buf, "r");
+    if(mysql_upgrade_info)
+    {
+      if (fgets(buf, MAX_PATH, mysql_upgrade_info))
+      {
+        int major,minor,patch;
+        if (sscanf(buf, "%d.%d.%d", &major, &minor, &patch) == 3)
+        {
+          props->version_major= major;
+          props->version_minor= minor;
+          props->version_patch= patch;
+        }
+      }
+      fclose(mysql_upgrade_info);   /* do not leak the stream */
+    }
+  }
+  retval = 0;
+end:
+  LocalFree((HLOCAL)args);
+  return retval;
+}
+\ No newline at end of file
diff --git a/sql/winservice.h b/sql/winservice.h
new file mode 100644
index 00000000000..8957413783f
--- /dev/null
+++ b/sql/winservice.h
@@ -0,0 +1,24 @@
+/* Extract properties of a windows service binary path */
+#ifndef WINSERVICE_INCLUDED
+#define WINSERVICE_INCLUDED
+#include <windows.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct mysqld_service_properties_st
+{
+  char mysqld_exe[MAX_PATH];  /* path of the service executable */
+  char inifile[MAX_PATH];     /* defaults file, empty string if none */
+  char datadir[MAX_PATH];     /* data directory */
+  int  version_major;         /* version_* are 0 when unknown */
+  int  version_minor;
+  int  version_patch;
+} mysqld_service_properties;
+
+extern int get_mysql_service_properties(const wchar_t *bin_path,
+  mysqld_service_properties *props);
+#ifdef __cplusplus
+}
+#endif
+#endif /* WINSERVICE_INCLUDED */
author	Sergei Golubchik <sergii@pisem.net>	2011-10-19 21:45:18 +0200
committer	Sergei Golubchik <sergii@pisem.net>	2011-10-19 21:45:18 +0200
commit	76f0b94bb0b2994d639353530c5b251d0f1a204b (patch)
tree	9ed50628aac34f89a37637bab2fc4915b86b5eb4 /sql
parent	4e46d8e5bff140f2549841167dc4b65a3c0a645d (diff)
parent	5dc1a2231f55bacc9aaf0e24816f3d9c2ee1f21d (diff)
download	mariadb-git-76f0b94bb0b2994d639353530c5b251d0f1a204b.tar.gz