diff options
Diffstat (limited to 'sql/opt_subselect.cc')
-rw-r--r-- | sql/opt_subselect.cc | 301 |
1 file changed, 194 insertions, 107 deletions
diff --git a/sql/opt_subselect.cc b/sql/opt_subselect.cc index 52d6a784e15..6f2de3796c4 100644 --- a/sql/opt_subselect.cc +++ b/sql/opt_subselect.cc @@ -448,7 +448,8 @@ static bool convert_subq_to_sj(JOIN *parent_join, Item_in_subselect *subq_pred); static bool convert_subq_to_jtbm(JOIN *parent_join, Item_in_subselect *subq_pred, bool *remove); static TABLE_LIST *alloc_join_nest(THD *thd); -static uint get_tmp_table_rec_length(Ref_ptr_array p_list, uint elements); +static uint get_tmp_table_rec_length(Ref_ptr_array p_list, uint elements, + bool *blobs_used); bool find_eq_ref_candidate(TABLE *table, table_map sj_inner_tables); static SJ_MATERIALIZATION_INFO * at_sjmat_pos(const JOIN *join, table_map remaining_tables, const JOIN_TAB *tab, @@ -1467,9 +1468,9 @@ void get_delayed_table_estimates(TABLE *table, /* Calculate cost of scanning the temptable */ double data_size= COST_MULT(item->jtbm_record_count, hash_sj_engine->tmp_table->s->reclength); - /* Do like in handler::scan_time() */ + /* Do like in handler::ha_scan_and_compare_time, but ignore the where cost */ *scan_time= ((data_size/table->file->stats.block_size+2) * - table->file->avg_io_cost()); + table->file->avg_io_cost()) + *out_rows * ROW_COPY_COST; } @@ -2564,27 +2565,28 @@ bool optimize_semijoin_nests(JOIN *join, table_map all_table_map) /* Calculate temporary table parameters and usage costs */ + bool blobs_used; uint rowlen= get_tmp_table_rec_length(subq_select->ref_pointer_array, - subq_select->item_list.elements); - double lookup_cost= get_tmp_table_lookup_cost(join->thd, - subjoin_out_rows, rowlen); - double write_cost= get_tmp_table_write_cost(join->thd, - subjoin_out_rows, rowlen); - + subq_select->item_list.elements, + &blobs_used); + TMPTABLE_COSTS cost= get_tmp_table_costs(join->thd, + subjoin_out_rows, rowlen, + blobs_used); /* Let materialization cost include the cost to write the data into the temporary table: */ - sjm->materialization_cost.add_io(subjoin_out_rows, write_cost); - + 
sjm->materialization_cost.add_io(subjoin_out_rows, cost.write); + sjm->materialization_cost.copy_cost+= cost.create; + /* Set the cost to do a full scan of the temptable (will need this to consider doing sjm-scan): */ sjm->scan_cost.reset(); - sjm->scan_cost.add_io(sjm->rows, lookup_cost); + sjm->scan_cost.add_io(sjm->rows, cost.lookup); - sjm->lookup_cost.convert_from_cost(lookup_cost); + sjm->lookup_cost.convert_from_cost(cost.lookup); sj_nest->sj_mat_info= sjm; DBUG_EXECUTE("opt", print_sjm(sjm);); } @@ -2612,11 +2614,13 @@ bool optimize_semijoin_nests(JOIN *join, table_map all_table_map) Length of the temptable record, in bytes */ -static uint get_tmp_table_rec_length(Ref_ptr_array p_items, uint elements) +static uint get_tmp_table_rec_length(Ref_ptr_array p_items, uint elements, + bool *blobs_used) { uint len= 0; Item *item; - //List_iterator<Item> it(items); + + *blobs_used= 0; for (uint i= 0; i < elements ; i++) { item = p_items[i]; @@ -2639,6 +2643,8 @@ static uint get_tmp_table_rec_length(Ref_ptr_array p_items, uint elements) len += 8; else len += item->max_length; + if (item->max_length > MAX_FIELD_VARCHARLENGTH) + *blobs_used= 1; break; case DECIMAL_RESULT: len += 10; @@ -2654,46 +2660,48 @@ static uint get_tmp_table_rec_length(Ref_ptr_array p_items, uint elements) /** - The cost of a lookup into a unique hash/btree index on a temporary table - with 'row_count' rows each of size 'row_size'. + The cost of a create, write and read into a unique hash/btree index on + a temporary table with 'row_count' rows each of size 'row_size'. @param thd current query context @param row_count number of rows in the temp table @param row_size average size in bytes of the rows - @return the cost of one lookup + @return The cost of using the temporary table + + TODO: + This is an optimistic estimate. We are not taking into account: + - That we first write into a memory and then overflow to disk. + - If binary trees would be used for heap tables. 
+ - The additional cost of writing a row to memory/disk and possible + index reorganization. */ -double -get_tmp_table_lookup_cost(THD *thd, double row_count, uint row_size) +TMPTABLE_COSTS +get_tmp_table_costs(THD *thd, double row_count, uint row_size, bool blobs_used) { - if (row_count > thd->variables.max_heap_table_size / (double) row_size) - return (double) DISK_TEMPTABLE_LOOKUP_COST; - else - return (double) HEAP_TEMPTABLE_LOOKUP_COST; -} + TMPTABLE_COSTS cost; -/** - The cost of writing a row into a temporary table with 'row_count' unique - rows each of size 'row_size'. + /* From heap_prepare_hp_create_info(), assuming one hash key used */ + row_size+= sizeof(char*)*2; + row_size= MY_ALIGN(MY_MAX(row_size, sizeof(char*)) + 1, sizeof(char*)); - @param thd current query context - @param row_count number of rows in the temp table - @param row_size average size in bytes of the rows - - @return the cost of writing one row -*/ - -double -get_tmp_table_write_cost(THD *thd, double row_count, uint row_size) -{ - double lookup_cost= get_tmp_table_lookup_cost(thd, row_count, row_size); - /* - TODO: - This is an optimistic estimate. Add additional costs resulting from - actually writing the row to memory/disk and possible index reorganization. - */ - return lookup_cost; + if (row_count > thd->variables.max_heap_table_size / (double) row_size || + blobs_used) + { + /* Disk based table */ + cost.lookup= (DISK_TEMPTABLE_LOOKUP_COST * + cache_hit_cost(thd->variables.optimizer_cache_hit_ratio)); + cost.write= cost.lookup; + cost.create= DISK_TEMPTABLE_CREATE_COST; + } + else + { + cost.lookup= HEAP_TEMPTABLE_LOOKUP_COST; + cost.write= cost.lookup; + cost.create= HEAP_TEMPTABLE_CREATE_COST; + } + return cost; } @@ -2938,9 +2946,16 @@ void optimize_semi_joins(JOIN *join, table_map remaining_tables, uint idx, 2. using strategy Z is cheaper, but it only removes fanout from semijoin X. 3. We have no clue what to do about fanout of semi-join Y. 
+ + For the first iteration read_time will always be bigger than + *current_read_time (as the 'strategy' is an addition to the + chosen plan). If a strategy was picked + (dups_producing_tables & handled_fanout is true), then + *current_read_time is updated and the cost for the next + strategy can be smaller than *current_read_time. */ if ((dups_producing_tables & handled_fanout) || - (read_time < *current_read_time && + (read_time + COST_EPS < *current_read_time && !(handled_fanout & pos->inner_tables_handled_with_other_sjs))) { DBUG_ASSERT(pos->sj_strategy != sj_strategy); @@ -3211,8 +3226,6 @@ bool Sj_materialization_picker::check_qep(JOIN *join, disable_jbuf, prefix_rec_count, &curpos, &dummy); prefix_rec_count= COST_MULT(prefix_rec_count, curpos.records_read); prefix_cost= COST_ADD(prefix_cost, curpos.read_time); - prefix_cost= COST_ADD(prefix_cost, - prefix_rec_count / TIME_FOR_COMPARE); //TODO: take into account join condition selectivity here } @@ -3237,8 +3250,9 @@ bool Sj_materialization_picker::check_qep(JOIN *join, *handled_fanout= mat_nest->sj_inner_tables; if (unlikely(trace.trace_started())) { - trace.add("records", *record_count); - trace.add("read_time", *read_time); + trace. + add("records", *record_count). + add("cost", *read_time); } return TRUE; } @@ -3339,7 +3353,7 @@ bool LooseScan_picker::check_qep(JOIN *join, { trace. add("records", *record_count). - add("read_time", *read_time); + add("cost", *read_time); } return TRUE; } @@ -3456,8 +3470,9 @@ bool Firstmatch_picker::check_qep(JOIN *join, *strategy= SJ_OPT_FIRST_MATCH; if (unlikely(trace.trace_started())) { - trace.add("records", *record_count); - trace.add("read_time", *read_time); + trace. + add("records", *record_count). 
+ add("cost", *read_time); } return TRUE; } @@ -3469,6 +3484,56 @@ bool Firstmatch_picker::check_qep(JOIN *join, } +/* + Duplicate_weedout strategy is described at + https://mariadb.com/kb/en/duplicateweedout-strategy/ + + The idea is that if one has a subquery of type: + + select * + from Country + where + Country.code IN (select City.Country + from City + where + ...) + + Before semi join optimization it was executed this way: + - Scan rows in Country + - For each accepted row, execute the sub query with + 'Country.code = City.Country' added to the WHERE clause and with + LIMIT 1 + + With semi join optimization it can be converted to the following semi join. + + select * from Country semi-join City + where Country.code = City.Country and ... + + This is executed as: + + - Scan rows in Country + - Scan rows in City with 'Country.code = City.Country' added to the + subquery WHERE clause. Stop scanning after the first match. + + or + + - Create temporary table to store City.Country (with a unique key) + - Scan rows in City (according to plan for City) and put them into the + temporary table + - Scan the temporary table + - Do index lookup in Country table with City.Country + +With Duplicate_weedout we would try to instead do: + + - Create temporary table to hold unique rowid's for the Country + - Scan rows in City (according to plan for City) + - Scan rows in Country (according to plan for Country) + - Write Country.id rowid to temporary table. If there was no + conflicting row in the temporary table, accept the row combination. 
+ - Delete temporary table +*/ + + void Duplicate_weedout_picker::set_from_prev(POSITION *prev) { if (prev->dups_weedout_picker.is_used) @@ -3537,8 +3602,6 @@ bool Duplicate_weedout_picker::check_qep(JOIN *join, double sj_inner_fanout= 1.0; double sj_outer_fanout= 1.0; uint temptable_rec_size; - Json_writer_object trace(join->thd); - trace.add("strategy", "DuplicateWeedout"); if (first_tab == join->const_tables) { @@ -3554,14 +3617,11 @@ bool Duplicate_weedout_picker::check_qep(JOIN *join, } table_map dups_removed_fanout= 0; - double current_fanout= prefix_rec_count; for (uint j= first_dupsweedout_table; j <= idx; j++) { POSITION *p= join->positions + j; - current_fanout= COST_MULT(current_fanout, p->records_read); - dups_cost= COST_ADD(dups_cost, - COST_ADD(p->read_time, - current_fanout / TIME_FOR_COMPARE)); + dups_cost= COST_ADD(dups_cost, p->read_time); + if (p->table->emb_sj_nest) { sj_inner_fanout= COST_MULT(sj_inner_fanout, p->records_read); @@ -3583,29 +3643,33 @@ bool Duplicate_weedout_picker::check_qep(JOIN *join, - sj_outer_fanout table writes - sj_inner_fanout*sj_outer_fanout lookups. 
*/ - double one_lookup_cost= get_tmp_table_lookup_cost(join->thd, - sj_outer_fanout, - temptable_rec_size); - double one_write_cost= get_tmp_table_write_cost(join->thd, - sj_outer_fanout, - temptable_rec_size); - - double write_cost= COST_MULT(join->positions[first_tab].prefix_record_count, - sj_outer_fanout * one_write_cost); + TMPTABLE_COSTS one_cost= get_tmp_table_costs(join->thd, + sj_outer_fanout, + temptable_rec_size, + 0); + double write_cost= + COST_ADD(one_cost.create, + COST_MULT(join->positions[first_tab].prefix_record_count, + sj_outer_fanout * one_cost.write)); double full_lookup_cost= - COST_MULT(join->positions[first_tab].prefix_record_count, - COST_MULT(sj_outer_fanout, - sj_inner_fanout * one_lookup_cost)); - dups_cost= COST_ADD(dups_cost, COST_ADD(write_cost, full_lookup_cost)); + COST_MULT(join->positions[first_tab].prefix_record_count, + COST_MULT(sj_outer_fanout, + sj_inner_fanout * one_cost.lookup)); + *read_time= COST_ADD(dups_cost, COST_ADD(write_cost, full_lookup_cost)); - *read_time= dups_cost; *record_count= prefix_rec_count * sj_outer_fanout; *handled_fanout= dups_removed_fanout; *strategy= SJ_OPT_DUPS_WEEDOUT; - if (unlikely(trace.trace_started())) + if (unlikely(join->thd->trace_started())) { - trace.add("records", *record_count); - trace.add("read_time", *read_time); + Json_writer_object trace(join->thd); + trace. + add("strategy", "DuplicateWeedout"). + add("records", *record_count). + add("dups_cost", dups_cost). + add("write_cost", write_cost). + add("full_lookup_cost", full_lookup_cost). 
+ add("total_cost", *read_time); } return TRUE; } @@ -3796,6 +3860,8 @@ at_sjmat_pos(const JOIN *join, table_map remaining_tables, const JOIN_TAB *tab, static void recalculate_prefix_record_count(JOIN *join, uint start, uint end) { + DBUG_ASSERT(start >= join->const_tables); + for (uint j= start; j < end ;j++) { double prefix_count; @@ -3967,7 +4033,7 @@ void fix_semijoin_strategies_for_picked_join_order(JOIN *join) join->best_positions[first].n_sj_tables= tablenr - first + 1; POSITION dummy; // For loose scan paths double record_count= (first== join->const_tables)? 1.0: - join->best_positions[tablenr - 1].prefix_record_count; + join->best_positions[first - 1].prefix_record_count; table_map rem_tables= remaining_tables; uint idx; @@ -4008,7 +4074,7 @@ void fix_semijoin_strategies_for_picked_join_order(JOIN *join) POSITION *first_pos= join->best_positions + first; POSITION loose_scan_pos; // For loose scan paths double record_count= (first== join->const_tables)? 1.0: - join->best_positions[tablenr - 1].prefix_record_count; + join->best_positions[first - 1].prefix_record_count; table_map rem_tables= remaining_tables; uint idx; @@ -6532,18 +6598,14 @@ bool JOIN::choose_subquery_plan(table_map join_tables) IN/ALL/ANY optimizations are not applicable for so called fake select (this select exists only to filter results of union if it is needed). */ - if (select_lex == select_lex->master_unit()->fake_select_lex) - return 0; - - if (is_in_subquery()) - { - in_subs= unit->item->get_IN_subquery(); - if (in_subs->create_in_to_exists_cond(this)) - return true; - } - else + if (select_lex == select_lex->master_unit()->fake_select_lex || + likely(!is_in_subquery())) return false; + in_subs= unit->item->get_IN_subquery(); + if (in_subs->create_in_to_exists_cond(this)) + return true; + /* A strategy must be chosen earlier. 
*/ DBUG_ASSERT(in_subs->has_strategy()); DBUG_ASSERT(in_to_exists_where || in_to_exists_having); @@ -6574,6 +6636,7 @@ bool JOIN::choose_subquery_plan(table_map join_tables) /* The cost of the IN->EXISTS strategy. */ double in_exists_strategy_cost; double dummy; + const char *strategy; /* A. Estimate the number of rows of the outer table that will be filtered @@ -6646,39 +6709,57 @@ bool JOIN::choose_subquery_plan(table_map join_tables) C. Compute execution costs. */ /* C.1 Compute the cost of the materialization strategy. */ - //uint rowlen= get_tmp_table_rec_length(unit->first_select()->item_list); - uint rowlen= get_tmp_table_rec_length(ref_ptrs, - select_lex->item_list.elements); - /* The cost of writing one row into the temporary table. */ - double write_cost= get_tmp_table_write_cost(thd, inner_record_count_1, - rowlen); - /* The cost of a lookup into the unique index of the materialized table. */ - double lookup_cost= get_tmp_table_lookup_cost(thd, inner_record_count_1, - rowlen); + bool blobs_used; + uint rowlen= get_tmp_table_rec_length(ref_ptrs, + select_lex->item_list.elements, + &blobs_used); + /* The cost of using the temp table */ + TMPTABLE_COSTS cost= get_tmp_table_costs(thd, inner_record_count_1, + rowlen, blobs_used); /* The cost of executing the subquery and storing its result in an indexed temporary table. */ - double materialization_cost= COST_ADD(inner_read_time_1, - COST_MULT(write_cost, - inner_record_count_1)); + double materialization_cost= + COST_ADD(cost.create, + COST_ADD(inner_read_time_1, + COST_MULT(cost.write, inner_record_count_1))); - materialize_strategy_cost= COST_ADD(materialization_cost, - COST_MULT(outer_lookup_keys, - lookup_cost)); + materialize_strategy_cost= + COST_ADD(materialization_cost, + COST_MULT(outer_lookup_keys, cost.lookup)); /* C.2 Compute the cost of the IN=>EXISTS strategy. 
*/ - in_exists_strategy_cost= COST_MULT(outer_lookup_keys, inner_read_time_2); + in_exists_strategy_cost= + COST_MULT(outer_lookup_keys, inner_read_time_2); /* C.3 Compare the costs and choose the cheaper strategy. */ if (materialize_strategy_cost >= in_exists_strategy_cost) + { in_subs->set_strategy(SUBS_IN_TO_EXISTS); + strategy= "in_to_exists"; + } else + { in_subs->set_strategy(SUBS_MATERIALIZATION); + strategy= "materialization"; + } + if (unlikely(thd->trace_started())) + { + Json_writer_object wrapper(thd); + Json_writer_object trace_subquery(thd, "subquery_plan"); + trace_subquery. + add("records", inner_record_count_1). + add("materialization_cost", materialize_strategy_cost). + add("in_exist_cost", in_exists_strategy_cost). + add("choosen", strategy); + } DBUG_PRINT("info", - ("mat_strategy_cost: %.2f, mat_cost: %.2f, write_cost: %.2f, lookup_cost: %.2f", - materialize_strategy_cost, materialization_cost, write_cost, lookup_cost)); + ("mat_strategy_cost: %.2f mat_cost: %.2f write_cost: %.2f " + "lookup_cost: %.2f", + materialize_strategy_cost, materialization_cost, cost.write, + cost.lookup)); DBUG_PRINT("info", ("inx_strategy_cost: %.2f, inner_read_time_2: %.2f", in_exists_strategy_cost, inner_read_time_2)); @@ -6704,6 +6785,12 @@ bool JOIN::choose_subquery_plan(table_map join_tables) implementation, fall back to IN-TO-EXISTS. */ in_subs->set_strategy(SUBS_IN_TO_EXISTS); + + if (unlikely(thd->trace_started())) + { + Json_writer_object trace_subquery(thd, "subquery_plan_revert"); + trace_subquery.add("choosen", "in_to_exists"); + } } if (in_subs->test_strategy(SUBS_MATERIALIZATION)) |