diff options
Diffstat (limited to 'innobase/row/row0sel.c')
-rw-r--r-- | innobase/row/row0sel.c | 2732 |
1 files changed, 2732 insertions, 0 deletions
diff --git a/innobase/row/row0sel.c b/innobase/row/row0sel.c new file mode 100644 index 00000000000..bd7af5743d8 --- /dev/null +++ b/innobase/row/row0sel.c @@ -0,0 +1,2732 @@ +/******************************************************* +Select + +(c) 1997 Innobase Oy + +Created 12/19/1997 Heikki Tuuri +*******************************************************/ + +#include "row0sel.h" + +#ifdef UNIV_NONINL +#include "row0sel.ic" +#endif + +#include "dict0dict.h" +#include "dict0boot.h" +#include "trx0undo.h" +#include "trx0trx.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "btr0sea.h" +#include "mach0data.h" +#include "que0que.h" +#include "row0upd.h" +#include "row0row.h" +#include "row0vers.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "eval0eval.h" +#include "pars0sym.h" +#include "pars0pars.h" +#include "row0mysql.h" + +/* Maximum number of rows to prefetch; MySQL interface has another parameter */ +#define SEL_MAX_N_PREFETCH 16 + +/* Number of rows fetched, after which to start prefetching; MySQL interface +has another parameter */ +#define SEL_PREFETCH_LIMIT 1 + +/* When a select has accessed about this many pages, it returns control back +to que_run_threads: this is to allow canceling runaway queries */ + +#define SEL_COST_LIMIT 100 + +/* Flags for search shortcut */ +#define SEL_FOUND 0 +#define SEL_EXHAUSTED 1 +#define SEL_RETRY 2 + +/************************************************************************* +Creates a select node struct. 
*/ + +sel_node_t* +sel_node_create( +/*============*/ + /* out, own: select node struct */ + mem_heap_t* heap) /* in: memory heap where created */ +{ + sel_node_t* node; + + node = mem_heap_alloc(heap, sizeof(sel_node_t)); + node->common.type = QUE_NODE_SELECT; + node->state = SEL_NODE_OPEN; + + node->select_will_do_update = FALSE; + node->latch_mode = BTR_SEARCH_LEAF; + + node->plans = NULL; + + return(node); +} + +/************************************************************************* +Frees the memory private to a select node when a query graph is freed, +does not free the heap where the node was originally created. */ + +void +sel_node_free_private( +/*==================*/ + sel_node_t* node) /* in: select node struct */ +{ + ulint i; + plan_t* plan; + + if (node->plans != NULL) { + for (i = 0; i < node->n_tables; i++) { + plan = sel_node_get_nth_plan(node, i); + + btr_pcur_close(&(plan->pcur)); + btr_pcur_close(&(plan->clust_pcur)); + + if (plan->old_vers_heap) { + mem_heap_free(plan->old_vers_heap); + } + } + } +} + +/************************************************************************* +Evaluates the values in a select list. If there are aggregate functions, +their argument value is added to the aggregate total. */ +UNIV_INLINE +void +sel_eval_select_list( +/*=================*/ + sel_node_t* node) /* in: select node */ +{ + que_node_t* exp; + + exp = node->select_list; + + while (exp) { + eval_exp(exp); + + exp = que_node_get_next(exp); + } +} + +/************************************************************************* +Assigns the values in the select list to the possible into-variables in +SELECT ... INTO ... 
*/ +UNIV_INLINE +void +sel_assign_into_var_values( +/*=======================*/ + sym_node_t* var, /* in: first variable in a list of variables */ + sel_node_t* node) /* in: select node */ +{ + que_node_t* exp; + + if (var == NULL) { + + return; + } + + exp = node->select_list; + + while (var) { + ut_ad(exp); + + eval_node_copy_val(var->alias, exp); + + exp = que_node_get_next(exp); + var = que_node_get_next(var); + } +} + +/************************************************************************* +Resets the aggregate value totals in the select list of an aggregate type +query. */ +UNIV_INLINE +void +sel_reset_aggregate_vals( +/*=====================*/ + sel_node_t* node) /* in: select node */ +{ + func_node_t* func_node; + + ut_ad(node->is_aggregate); + + func_node = node->select_list; + + while (func_node) { + eval_node_set_int_val(func_node, 0); + + func_node = que_node_get_next(func_node); + } + + node->aggregate_already_fetched = FALSE; +} + +/************************************************************************* +Copies the input variable values when an explicit cursor is opened. */ +UNIV_INLINE +void +row_sel_copy_input_variable_vals( +/*=============================*/ + sel_node_t* node) /* in: select node */ +{ + sym_node_t* var; + + var = UT_LIST_GET_FIRST(node->copy_variables); + + while (var) { + eval_node_copy_val(var, var->alias); + + var->indirection = NULL; + + var = UT_LIST_GET_NEXT(col_var_list, var); + } +} + +/************************************************************************* +Fetches the column values from a record. 
*/ +static +void +row_sel_fetch_columns( +/*==================*/ + dict_index_t* index, /* in: record index */ + rec_t* rec, /* in: record in a clustered or non-clustered + index */ + sym_node_t* column) /* in: first column in a column list, or + NULL */ +{ + dfield_t* val; + ulint index_type; + ulint field_no; + byte* data; + ulint len; + + if (index->type & DICT_CLUSTERED) { + index_type = SYM_CLUST_FIELD_NO; + } else { + index_type = SYM_SEC_FIELD_NO; + } + + while (column) { + field_no = column->field_nos[index_type]; + + if (field_no != ULINT_UNDEFINED) { + + data = rec_get_nth_field(rec, field_no, &len); + + if (column->copy_val) { + eval_node_copy_and_alloc_val(column, data, + len); + } else { + val = que_node_get_val(column); + dfield_set_data(val, data, len); + } + } + + column = UT_LIST_GET_NEXT(col_var_list, column); + } +} + +/************************************************************************* +Allocates a prefetch buffer for a column when prefetch is first time done. */ +static +void +sel_col_prefetch_buf_alloc( +/*=======================*/ + sym_node_t* column) /* in: symbol table node for a column */ +{ + sel_buf_t* sel_buf; + ulint i; + + ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL); + + column->prefetch_buf = mem_alloc(SEL_MAX_N_PREFETCH + * sizeof(sel_buf_t)); + for (i = 0; i < SEL_MAX_N_PREFETCH; i++) { + sel_buf = column->prefetch_buf + i; + + sel_buf->data = NULL; + + sel_buf->val_buf_size = 0; + } +} + +/************************************************************************* +Frees a prefetch buffer for a column, including the dynamically allocated +memory for data stored there. 
*/ + +void +sel_col_prefetch_buf_free( +/*======================*/ + sel_buf_t* prefetch_buf) /* in, own: prefetch buffer */ +{ + sel_buf_t* sel_buf; + ulint i; + + for (i = 0; i < SEL_MAX_N_PREFETCH; i++) { + sel_buf = prefetch_buf + i; + + if (sel_buf->val_buf_size > 0) { + + mem_free(sel_buf->data); + } + } +} + +/************************************************************************* +Pops the column values for a prefetched, cached row from the column prefetch +buffers and places them to the val fields in the column nodes. */ +static +void +sel_pop_prefetched_row( +/*===================*/ + plan_t* plan) /* in: plan node for a table */ +{ + sym_node_t* column; + sel_buf_t* sel_buf; + dfield_t* val; + byte* data; + ulint len; + ulint val_buf_size; + + ut_ad(plan->n_rows_prefetched > 0); + + column = UT_LIST_GET_FIRST(plan->columns); + + while (column) { + val = que_node_get_val(column); + + if (!column->copy_val) { + /* We did not really push any value for the + column */ + + ut_ad(!column->prefetch_buf); + ut_ad(que_node_get_val_buf_size(column) == 0); +#ifdef UNIV_DEBUG + dfield_set_data(val, NULL, 0); +#endif + goto next_col; + } + + ut_ad(column->prefetch_buf); + + sel_buf = column->prefetch_buf + plan->first_prefetched; + + data = sel_buf->data; + len = sel_buf->len; + val_buf_size = sel_buf->val_buf_size; + + /* We must keep track of the allocated memory for + column values to be able to free it later: therefore + we swap the values for sel_buf and val */ + + sel_buf->data = dfield_get_data(val); + sel_buf->len = dfield_get_len(val); + sel_buf->val_buf_size = que_node_get_val_buf_size(column); + + dfield_set_data(val, data, len); + que_node_set_val_buf_size(column, val_buf_size); +next_col: + column = UT_LIST_GET_NEXT(col_var_list, column); + } + + plan->n_rows_prefetched--; + + plan->first_prefetched++; +} + +/************************************************************************* +Pushes the column values for a prefetched, cached row to the column 
prefetch +buffers from the val fields in the column nodes. */ +UNIV_INLINE +void +sel_push_prefetched_row( +/*====================*/ + plan_t* plan) /* in: plan node for a table */ +{ + sym_node_t* column; + sel_buf_t* sel_buf; + dfield_t* val; + byte* data; + ulint len; + ulint pos; + ulint val_buf_size; + + if (plan->n_rows_prefetched == 0) { + pos = 0; + plan->first_prefetched = 0; + } else { + pos = plan->n_rows_prefetched; + + /* We have the convention that pushing new rows starts only + after the prefetch stack has been emptied: */ + + ut_ad(plan->first_prefetched == 0); + } + + plan->n_rows_prefetched++; + + ut_ad(pos < SEL_MAX_N_PREFETCH); + + column = UT_LIST_GET_FIRST(plan->columns); + + while (column) { + if (!column->copy_val) { + /* There is no sense to push pointers to database + page fields when we do not keep latch on the page! */ + + goto next_col; + } + + if (!column->prefetch_buf) { + /* Allocate a new prefetch buffer */ + + sel_col_prefetch_buf_alloc(column); + } + + sel_buf = column->prefetch_buf + pos; + + val = que_node_get_val(column); + + data = dfield_get_data(val); + len = dfield_get_len(val); + val_buf_size = que_node_get_val_buf_size(column); + + /* We must keep track of the allocated memory for + column values to be able to free it later: therefore + we swap the values for sel_buf and val */ + + dfield_set_data(val, sel_buf->data, sel_buf->len); + que_node_set_val_buf_size(column, sel_buf->val_buf_size); + + sel_buf->data = data; + sel_buf->len = len; + sel_buf->val_buf_size = val_buf_size; +next_col: + column = UT_LIST_GET_NEXT(col_var_list, column); + } +} + +/************************************************************************* +Builds a previous version of a clustered index record for a consistent read */ +static +ulint +row_sel_build_prev_vers( +/*====================*/ + /* out: DB_SUCCESS or error code */ + read_view_t* read_view, /* in: read view */ + plan_t* plan, /* in: plan node for table */ + rec_t* rec, /* in: record 
in a clustered index */ + rec_t** old_vers, /* out: old version, or NULL if the + record does not exist in the view: + i.e., it was freshly inserted + afterwards */ + mtr_t* mtr) /* in: mtr */ +{ + ulint err; + + if (plan->old_vers_heap) { + mem_heap_empty(plan->old_vers_heap); + } else { + plan->old_vers_heap = mem_heap_create(512); + } + + err = row_vers_build_for_consistent_read(rec, mtr, plan->index, + read_view, plan->old_vers_heap, + old_vers); + return(err); +} + +/************************************************************************* +Tests the conditions which determine when the index segment we are searching +through has been exhausted. */ +UNIV_INLINE +ibool +row_sel_test_end_conds( +/*===================*/ + /* out: TRUE if row passed the tests */ + plan_t* plan) /* in: plan for the table; the column values must + already have been retrieved and the right sides of + comparisons evaluated */ +{ + func_node_t* cond; + + /* All conditions in end_conds are comparisons of a column to an + expression */ + + cond = UT_LIST_GET_FIRST(plan->end_conds); + + while (cond) { + /* Evaluate the left side of the comparison, i.e., get the + column value if there is an indirection */ + + eval_sym(cond->args); + + /* Do the comparison */ + + if (!eval_cmp(cond)) { + + return(FALSE); + } + + cond = UT_LIST_GET_NEXT(cond_list, cond); + } + + return(TRUE); +} + +/************************************************************************* +Tests the other conditions. 
*/ +UNIV_INLINE +ibool +row_sel_test_other_conds( +/*=====================*/ + /* out: TRUE if row passed the tests */ + plan_t* plan) /* in: plan for the table; the column values must + already have been retrieved */ +{ + func_node_t* cond; + + cond = UT_LIST_GET_FIRST(plan->other_conds); + + while (cond) { + eval_exp(cond); + + if (!eval_node_get_ibool_val(cond)) { + + return(FALSE); + } + + cond = UT_LIST_GET_NEXT(cond_list, cond); + } + + return(TRUE); +} + +/************************************************************************* +Retrieves the clustered index record corresponding to a record in a +non-clustered index. Does the necessary locking. */ +static +ulint +row_sel_get_clust_rec( +/*==================*/ + /* out: DB_SUCCESS or error code */ + sel_node_t* node, /* in: select_node */ + plan_t* plan, /* in: plan node for table */ + rec_t* rec, /* in: record in a non-clustered index */ + que_thr_t* thr, /* in: query thread */ + rec_t** out_rec,/* out: clustered record or an old version of + it, NULL if the old version did not exist + in the read view, i.e., it was a fresh + inserted version */ + mtr_t* mtr) /* in: mtr used to get access to the + non-clustered record; the same mtr is used to + access the clustered index */ +{ + dict_index_t* index; + rec_t* clust_rec; + rec_t* old_vers; + ulint err; + + row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec); + + index = dict_table_get_first_index(plan->table); + + btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE, + node->latch_mode, &(plan->clust_pcur), + 0, mtr); + + clust_rec = btr_pcur_get_rec(&(plan->clust_pcur)); + + ut_ad(page_rec_is_user_rec(clust_rec)); + + if (!node->read_view) { + /* Try to place a lock on the index record */ + + err = lock_clust_rec_read_check_and_lock(0, clust_rec, index, + node->row_lock_mode, thr); + if (err != DB_SUCCESS) { + + return(err); + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record 
*/ + + if (!lock_clust_rec_cons_read_sees(clust_rec, index, + node->read_view)) { + + err = row_sel_build_prev_vers(node->read_view, plan, + clust_rec, &old_vers, mtr); + if (err != DB_SUCCESS) { + + return(err); + } + + clust_rec = old_vers; + + if (clust_rec == NULL) { + *out_rec = clust_rec; + + return(DB_SUCCESS); + } + } + } + + /* Fetch the columns needed in test conditions */ + + row_sel_fetch_columns(index, clust_rec, + UT_LIST_GET_FIRST(plan->columns)); + *out_rec = clust_rec; + + return(DB_SUCCESS); +} + +/************************************************************************* +Sets a lock on a record. */ +UNIV_INLINE +ulint +sel_set_rec_lock( +/*=============*/ + /* out: DB_SUCCESS or error code */ + rec_t* rec, /* in: record */ + dict_index_t* index, /* in: index */ + ulint mode, /* in: lock mode */ + que_thr_t* thr) /* in: query thread */ +{ + ulint err; + + if (index->type & DICT_CLUSTERED) { + err = lock_clust_rec_read_check_and_lock(0, rec, index, mode, + thr); + } else { + err = lock_sec_rec_read_check_and_lock(0, rec, index, mode, + thr); + } + + return(err); +} + +/************************************************************************* +Opens a pcur to a table index. 
*/
static
void
row_sel_open_pcur(
/*==============*/
	sel_node_t*	node,		/* in: select node */
	plan_t*		plan,		/* in: table plan */
	ibool		search_latch_locked,
					/* in: TRUE if the thread currently
					has the search latch locked in
					s-mode */
	mtr_t*		mtr)		/* in: mtr */
{
	dict_index_t*	index		= plan->index;
	func_node_t*	end_cond;
	ulint		n_fields;
	ulint		pos;
	/* RW_S_LATCH or 0 */
	ulint		has_search_latch = search_latch_locked
						? RW_S_LATCH : 0;

	/* Calculate the value of the search tuple: the exact match columns
	get their expressions evaluated when we evaluate the right sides of
	end_conds */

	for (end_cond = UT_LIST_GET_FIRST(plan->end_conds); end_cond;
			end_cond = UT_LIST_GET_NEXT(cond_list, end_cond)) {

		eval_exp(que_node_get_next(end_cond->args));
	}

	if (plan->tuple == NULL) {
		/* No search tuple: open the cursor to the start or the end
		of the index (FALSE: no init) */

		btr_pcur_open_at_index_side(plan->asc, index,
						node->latch_mode,
						&(plan->pcur), FALSE, mtr);
	} else {
		n_fields = dtuple_get_n_fields(plan->tuple);

		if (plan->n_exact_match < n_fields) {
			/* There is a non-exact match field which must be
			evaluated separately */

			eval_exp(plan->tuple_exps[n_fields - 1]);
		}

		/* Copy the evaluated expression values to the tuple
		fields */

		for (pos = 0; pos < n_fields; pos++) {

			dfield_copy_data(
				dtuple_get_nth_field(plan->tuple, pos),
				que_node_get_val(plan->tuple_exps[pos]));
		}

		/* Open pcur to the index */

		btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
						node->latch_mode,
						&(plan->pcur),
						has_search_latch, mtr);
	}

	ut_ad(plan->n_rows_prefetched == 0);
	ut_ad(plan->n_rows_fetched == 0);
	ut_ad(plan->cursor_at_end == FALSE);

	plan->pcur_is_open = TRUE;
}

/*************************************************************************
Restores a stored pcur position to a table index.
*/
UNIV_INLINE
ibool
row_sel_restore_pcur_pos(
/*=====================*/
				/* out: TRUE if the cursor should be moved to
				the next record after we return from this
				function (moved to the previous, in the case
				of a descending cursor) without processing
				again the current cursor record */
	sel_node_t*	node,	/* in: select node */
	plan_t*		plan,	/* in: table plan */
	mtr_t*		mtr)	/* in: mtr */
{
	ulint	stored_rel_pos;
	ibool	same_rec;

	ut_ad(!plan->cursor_at_end);

	/* Read the stored relative position first, then restore */

	stored_rel_pos = btr_pcur_get_rel_pos(&(plan->pcur));

	same_rec = btr_pcur_restore_position(node->latch_mode,
						&(plan->pcur), mtr);

	/* Case common to both directions: the position was stored ON a user
	record and btr_pcur_restore_position found that same record again.
	Whether we must step past it depends only on whether it had already
	been processed when the position was stored. */

	if (stored_rel_pos == BTR_PCUR_ON && same_rec) {

		return(plan->stored_cursor_rec_processed);
	}

	if (plan->asc) {
		/* The cursor is traveling upwards, and the stored relative
		position is

		(1) BTR_PCUR_BEFORE: this is not allowed, as we did not have
		a lock yet on the successor of the page infimum;
		(2) BTR_PCUR_ON, but the old record was not found: the
		cursor was placed on the last record LESS than the old user
		record, and we must move to the next record;
		(3) BTR_PCUR_AFTER: the cursor was placed on the first record
		GREATER than the predecessor of a page supremum; we have not
		yet processed the cursor record: no need to move the cursor
		to the next record. */

		if (stored_rel_pos == BTR_PCUR_ON) {

			return(TRUE);
		}

		ut_ad(stored_rel_pos == BTR_PCUR_AFTER);

		return(FALSE);
	}

	/* The cursor is traveling downwards, and the stored relative
	position is

	(1) BTR_PCUR_BEFORE: the cursor was placed on the last record LESS
	than the successor of a page infimum; we have not processed the
	cursor record: no need to move the cursor;
	(2) BTR_PCUR_ON, but the old record was not found: the cursor is now
	on a record less than the old user record, and we need not move to
	the previous record;
	(3) BTR_PCUR_AFTER: the cursor was placed on the first record GREATER
	than the predecessor of a page supremum; we have processed the cursor
	record: we should move the cursor to the previous record. */

	if (stored_rel_pos == BTR_PCUR_BEFORE
	    || stored_rel_pos == BTR_PCUR_ON) {

		return(FALSE);
	}

	ut_ad(stored_rel_pos == BTR_PCUR_AFTER);

	return(TRUE);
}

/*************************************************************************
Resets a plan cursor to a closed state.
*/ +UNIV_INLINE +void +plan_reset_cursor( +/*==============*/ + plan_t* plan) /* in: plan */ +{ + plan->pcur_is_open = FALSE; + plan->cursor_at_end = FALSE; + plan->n_rows_fetched = 0; + plan->n_rows_prefetched = 0; +} + +/************************************************************************* +Tries to do a shortcut to fetch a clustered index record with a unique key, +using the hash index if possible (not always). */ +static +ulint +row_sel_try_search_shortcut( +/*========================*/ + /* out: SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */ + sel_node_t* node, /* in: select node for a consistent read */ + plan_t* plan, /* in: plan for a unique search in clustered + index */ + mtr_t* mtr) /* in: mtr */ +{ + dict_index_t* index; + rec_t* rec; + + index = plan->index; + + ut_ad(node->read_view); + ut_ad(plan->unique_search); + ut_ad(!plan->must_get_clust); + ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED)); + + row_sel_open_pcur(node, plan, TRUE, mtr); + + rec = btr_pcur_get_rec(&(plan->pcur)); + + if (!page_rec_is_user_rec(rec)) { + + return(SEL_RETRY); + } + + ut_ad(plan->mode == PAGE_CUR_GE); + + /* As the cursor is now placed on a user record after a search with + the mode PAGE_CUR_GE, the up_match field in the cursor tells how many + fields in the user record matched to the search tuple */ + + if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) { + + return(SEL_EXHAUSTED); + } + + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + if (index->type & DICT_CLUSTERED) { + if (!lock_clust_rec_cons_read_sees(rec, index, + node->read_view)) { + return(SEL_RETRY); + } + } else if (!lock_sec_rec_cons_read_sees(rec, index, node->read_view)) { + + return(SEL_RETRY); + } + + /* Test deleted flag. Fetch the columns needed in test conditions. 
*/ + + row_sel_fetch_columns(index, rec, UT_LIST_GET_FIRST(plan->columns)); + + if (rec_get_deleted_flag(rec)) { + + return(SEL_EXHAUSTED); + } + + /* Test the rest of search conditions */ + + if (!row_sel_test_other_conds(plan)) { + + return(SEL_EXHAUSTED); + } + + ut_ad(plan->pcur.latch_mode == node->latch_mode); + + plan->n_rows_fetched++; + + return(SEL_FOUND); +} + +/************************************************************************* +Performs a select step. */ +static +ulint +row_sel( +/*====*/ + /* out: DB_SUCCESS or error code */ + sel_node_t* node, /* in: select node */ + que_thr_t* thr) /* in: query thread */ +{ + dict_index_t* index; + plan_t* plan; + mtr_t mtr; + ibool moved; + rec_t* rec; + rec_t* old_vers; + rec_t* clust_rec; + ibool search_latch_locked; + ibool consistent_read; + + /* The following flag becomes TRUE when we are doing a + consistent read from a non-clustered index and we must look + at the clustered index to find out the previous delete mark + state of the non-clustered record: */ + + ibool cons_read_requires_clust_rec = FALSE; + ulint cost_counter = 0; + ibool cursor_just_opened; + ibool must_go_to_next; + ibool leaf_contains_updates = FALSE; + /* TRUE if select_will_do_update is + TRUE and the current clustered index + leaf page has been updated during + the current mtr: mtr must be committed + at the same time as the leaf x-latch + is released */ + ibool mtr_has_extra_clust_latch = FALSE; + /* TRUE if the search was made using + a non-clustered index, and we had to + access the clustered record: now &mtr + contains a clustered index latch, and + &mtr must be committed before we move + to the next non-clustered record */ + ulint found_flag; + ulint err; + + ut_ad(thr->run_node == node); + + search_latch_locked = FALSE; + + if (node->read_view) { + /* In consistent reads, we try to do with the hash index and + not to use the buffer page get. This is to reduce memory bus + load resulting from semaphore operations. 
The search latch + will be s-locked when we access an index with a unique search + condition, but not locked when we access an index with a + less selective search condition. */ + + consistent_read = TRUE; + } else { + consistent_read = FALSE; + } + +table_loop: + /* TABLE LOOP + ---------- + This is the outer major loop in calculating a join. We come here when + node->fetch_table changes, and after adding a row to aggregate totals + and, of course, when this function is called. */ + + ut_ad(leaf_contains_updates == FALSE); + ut_ad(mtr_has_extra_clust_latch == FALSE); + + plan = sel_node_get_nth_plan(node, node->fetch_table); + index = plan->index; + + if (plan->n_rows_prefetched > 0) { + sel_pop_prefetched_row(plan); + + goto next_table_no_mtr; + } + + if (plan->cursor_at_end) { + /* The cursor has already reached the result set end: no more + rows to process for this table cursor, as also the prefetch + stack was empty */ + + ut_ad(plan->pcur_is_open); + + goto table_exhausted_no_mtr; + } + + /* Open a cursor to index, or restore an open cursor position */ + + mtr_start(&mtr); + + if (consistent_read && plan->unique_search && !plan->pcur_is_open + && !plan->must_get_clust) { + if (!search_latch_locked) { + rw_lock_s_lock(&btr_search_latch); + + search_latch_locked = TRUE; + } else if (btr_search_latch.writer_is_wait_ex) { + + /* There is an x-latch request waiting: release the + s-latch for a moment; as an s-latch here is often + kept for some 10 searches before being released, + a waiting x-latch request would block other threads + from acquiring an s-latch for a long time, lowering + performance significantly in multiprocessors. 
*/ + + rw_lock_s_unlock(&btr_search_latch); + rw_lock_s_lock(&btr_search_latch); + } + + found_flag = row_sel_try_search_shortcut(node, plan, &mtr); + + if (found_flag == SEL_FOUND) { + + goto next_table; + + } else if (found_flag == SEL_EXHAUSTED) { + + goto table_exhausted; + } + + ut_ad(found_flag == SEL_RETRY); + + plan_reset_cursor(plan); + + mtr_commit(&mtr); + mtr_start(&mtr); + } + + if (search_latch_locked) { + rw_lock_s_unlock(&btr_search_latch); + + search_latch_locked = FALSE; + } + + if (!plan->pcur_is_open) { + /* Evaluate the expressions to build the search tuple and + open the cursor */ + + row_sel_open_pcur(node, plan, search_latch_locked, &mtr); + + cursor_just_opened = TRUE; + + /* A new search was made: increment the cost counter */ + cost_counter++; + } else { + /* Restore pcur position to the index */ + + must_go_to_next = row_sel_restore_pcur_pos(node, plan, &mtr); + + cursor_just_opened = FALSE; + + if (must_go_to_next) { + /* We have already processed the cursor record: move + to the next */ + + goto next_rec; + } + } + +rec_loop: + /* RECORD LOOP + ----------- + In this loop we use pcur and try to fetch a qualifying row, and + also fill the prefetch buffer for this table if n_rows_fetched has + exceeded a threshold. While we are inside this loop, the following + holds: + (1) &mtr is started, + (2) pcur is positioned and open. + + NOTE that if cursor_just_opened is TRUE here, it means that we came + to this point right after row_sel_open_pcur. 
*/ + + ut_ad(mtr_has_extra_clust_latch == FALSE); + + rec = btr_pcur_get_rec(&(plan->pcur)); + + /* PHASE 1: Set a lock if specified */ + + if (!node->asc && cursor_just_opened + && (rec != page_get_supremum_rec(buf_frame_align(rec)))) { + + /* When we open a cursor for a descending search, we must set + a next-key lock on the successor record: otherwise it would + be possible to insert new records next to the cursor position, + and it might be that these new records should appear in the + search result set, resulting in the phantom problem. */ + + if (!consistent_read) { + err = sel_set_rec_lock(page_rec_get_next(rec), index, + node->row_lock_mode, thr); + if (err != DB_SUCCESS) { + /* Note that in this case we will store in pcur + the PREDECESSOR of the record we are waiting + the lock for */ + + goto lock_wait_or_error; + } + } + } + + if (rec == page_get_infimum_rec(buf_frame_align(rec))) { + + /* The infimum record on a page cannot be in the result set, + and neither can a record lock be placed on it: we skip such + a record. We also increment the cost counter as we may have + processed yet another page of index. 
*/ + + cost_counter++; + + goto next_rec; + } + + if (!consistent_read) { + /* Try to place a lock on the index record */ + + err = sel_set_rec_lock(rec, index, node->row_lock_mode, thr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } + + if (rec == page_get_supremum_rec(buf_frame_align(rec))) { + + /* A page supremum record cannot be in the result set: skip + it now when we have placed a possible lock on it */ + + goto next_rec; + } + + ut_ad(page_rec_is_user_rec(rec)); + + if (cost_counter > SEL_COST_LIMIT) { + + /* Now that we have placed the necessary locks, we can stop + for a while and store the cursor position; NOTE that if we + would store the cursor position BEFORE placing a record lock, + it might happen that the cursor would jump over some records + that another transaction could meanwhile insert adjacent to + the cursor: this would result in the phantom problem. */ + + goto stop_for_a_while; + } + + /* PHASE 2: Check a mixed index mix id if needed */ + + if (plan->unique_search && cursor_just_opened) { + + ut_ad(plan->mode == PAGE_CUR_GE); + + /* As the cursor is now placed on a user record after a search + with the mode PAGE_CUR_GE, the up_match field in the cursor + tells how many fields in the user record matched to the search + tuple */ + + if (btr_pcur_get_up_match(&(plan->pcur)) + < plan->n_exact_match) { + goto table_exhausted; + } + + /* Ok, no need to test end_conds or mix id */ + + } else if (plan->mixed_index) { + /* We have to check if the record in a mixed cluster belongs + to this table */ + + if (!dict_is_mixed_table_rec(plan->table, rec)) { + + goto next_rec; + } + } + + /* We are ready to look at a possible new index entry in the result + set: the cursor is now placed on a user record */ + + /* PHASE 3: Get previous version in a consistent read */ + + if (consistent_read) { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + if (index->type & DICT_CLUSTERED) { + + 
if (!lock_clust_rec_cons_read_sees(rec, index, + node->read_view)) { + + err = row_sel_build_prev_vers(node->read_view, + plan, rec, &old_vers, + &mtr); + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (old_vers == NULL) { + row_sel_fetch_columns(index, rec, + UT_LIST_GET_FIRST(plan->columns)); + + if (!row_sel_test_end_conds(plan)) { + + goto table_exhausted; + } + + goto next_rec; + } + + rec = old_vers; + } + } else if (!lock_sec_rec_cons_read_sees(rec, index, + node->read_view)) { + cons_read_requires_clust_rec = TRUE; + } + } + + /* PHASE 4: Test search end conditions and deleted flag */ + + /* Fetch the columns needed in test conditions */ + + row_sel_fetch_columns(index, rec, UT_LIST_GET_FIRST(plan->columns)); + + /* Test the selection end conditions: these can only contain columns + which already are found in the index, even though the index might be + non-clustered */ + + if (plan->unique_search && cursor_just_opened) { + + /* No test necessary: the test was already made above */ + + } else if (!row_sel_test_end_conds(plan)) { + + goto table_exhausted; + } + + if (rec_get_deleted_flag(rec) && !cons_read_requires_clust_rec) { + + /* The record is delete marked: we can skip it if this is + not a consistent read which might see an earlier version + of a non-clustered index record */ + + if (plan->unique_search) { + + goto table_exhausted; + } + + goto next_rec; + } + + /* PHASE 5: Get the clustered index record, if needed and if we did + not do the search using the clustered index */ + + if (plan->must_get_clust || cons_read_requires_clust_rec) { + + /* It was a non-clustered index and we must fetch also the + clustered index record */ + + err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec, + &mtr); + mtr_has_extra_clust_latch = TRUE; + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + /* Retrieving the clustered record required a search: + increment the cost counter */ + + cost_counter++; + + if (clust_rec == NULL) { 
+ /* The record did not exist in the read view */ + ut_ad(consistent_read); + + goto next_rec; + } + + if (rec_get_deleted_flag(clust_rec)) { + + /* The record is delete marked: we can skip it */ + + goto next_rec; + } + + if (node->can_get_updated) { + + btr_pcur_store_position(&(plan->clust_pcur), &mtr); + } + } + + /* PHASE 6: Test the rest of search conditions */ + + if (!row_sel_test_other_conds(plan)) { + + if (plan->unique_search) { + + goto table_exhausted; + } + + goto next_rec; + } + + /* PHASE 7: We found a new qualifying row for the current table; push + the row if prefetch is on, or move to the next table in the join */ + + plan->n_rows_fetched++; + + ut_ad(plan->pcur.latch_mode == node->latch_mode); + + if (node->select_will_do_update) { + /* This is a searched update and we can do the update in-place, + saving CPU time */ + + row_upd_in_place_in_select(node, thr, &mtr); + + leaf_contains_updates = TRUE; + + /* When the database is in the online backup mode, the number + of log records for a single mtr should be small: increment the + cost counter to ensure it */ + + cost_counter += 1 + (SEL_COST_LIMIT / 8); + + if (plan->unique_search) { + + goto table_exhausted; + } + + goto next_rec; + } + + if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT) + || plan->unique_search || plan->no_prefetch) { + + /* No prefetch in operation: go to the next table */ + + goto next_table; + } + + sel_push_prefetched_row(plan); + + if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) { + + /* The prefetch buffer is now full */ + + sel_pop_prefetched_row(plan); + + goto next_table; + } + +next_rec: + ut_ad(!search_latch_locked); + + if (mtr_has_extra_clust_latch) { + + /* We must commit &mtr if we are moving to the next + non-clustered index record, because we could break the + latching order if we would access a different clustered + index page right away without releasing the previous. 
*/ + + goto commit_mtr_for_a_while; + } + + if (leaf_contains_updates + && btr_pcur_is_after_last_on_page(&(plan->pcur), &mtr)) { + + /* We must commit &mtr if we are moving to a different page, + because we have done updates to the x-latched leaf page, and + the latch would be released in btr_pcur_move_to_next, without + &mtr getting committed there */ + + ut_ad(node->asc); + + goto commit_mtr_for_a_while; + } + + if (node->asc) { + moved = btr_pcur_move_to_next(&(plan->pcur), &mtr); + } else { + moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr); + } + + if (!moved) { + + goto table_exhausted; + } + + cursor_just_opened = FALSE; + + /* END OF RECORD LOOP + ------------------ */ + goto rec_loop; + +next_table: + /* We found a record which satisfies the conditions: we can move to + the next table or return a row in the result set */ + + ut_ad(btr_pcur_is_on_user_rec(&(plan->pcur), &mtr)); + + if (plan->unique_search && !node->can_get_updated) { + + plan->cursor_at_end = TRUE; + } else { + ut_ad(!search_latch_locked); + + plan->stored_cursor_rec_processed = TRUE; + + btr_pcur_store_position(&(plan->pcur), &mtr); + } + + mtr_commit(&mtr); + + leaf_contains_updates = FALSE; + mtr_has_extra_clust_latch = FALSE; + +next_table_no_mtr: + /* If we use 'goto' to this label, it means that the row was popped + from the prefetched rows stack, and &mtr is already committed */ + + if (node->fetch_table + 1 == node->n_tables) { + + sel_eval_select_list(node); + + if (node->is_aggregate) { + + goto table_loop; + } + + sel_assign_into_var_values(node->into_list, node); + + thr->run_node = que_node_get_parent(node); + + if (search_latch_locked) { + rw_lock_s_unlock(&btr_search_latch); + } + + return(DB_SUCCESS); + } + + node->fetch_table++; + + /* When we move to the next table, we first reset the plan cursor: + we do not care about resetting it when we backtrack from a table */ + + plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table)); + + goto table_loop; + 
+table_exhausted: + /* The table cursor pcur reached the result set end: backtrack to the + previous table in the join if we do not have cached prefetched rows */ + + plan->cursor_at_end = TRUE; + + mtr_commit(&mtr); + + leaf_contains_updates = FALSE; + mtr_has_extra_clust_latch = FALSE; + + if (plan->n_rows_prefetched > 0) { + /* The table became exhausted during a prefetch */ + + sel_pop_prefetched_row(plan); + + goto next_table_no_mtr; + } + +table_exhausted_no_mtr: + if (node->fetch_table == 0) { + + if (node->is_aggregate && !node->aggregate_already_fetched) { + + node->aggregate_already_fetched = TRUE; + + sel_assign_into_var_values(node->into_list, node); + + thr->run_node = que_node_get_parent(node); + + if (search_latch_locked) { + rw_lock_s_unlock(&btr_search_latch); + } + + return(DB_SUCCESS); + } + + node->state = SEL_NODE_NO_MORE_ROWS; + + thr->run_node = que_node_get_parent(node); + + if (search_latch_locked) { + rw_lock_s_unlock(&btr_search_latch); + } + + return(DB_SUCCESS); + } + + node->fetch_table--; + + goto table_loop; + +stop_for_a_while: + /* Return control for a while to que_run_threads, so that runaway + queries can be canceled. NOTE that when we come here, we must, in a + locking read, have placed the necessary (possibly waiting request) + record lock on the cursor record or its successor: when we reposition + the cursor, this record lock guarantees that nobody can meanwhile have + inserted new records which should have appeared in the result set, + which would result in the phantom problem. */ + + ut_ad(!search_latch_locked); + + plan->stored_cursor_rec_processed = FALSE; + btr_pcur_store_position(&(plan->pcur), &mtr); + + mtr_commit(&mtr); + + ut_ad(sync_thread_levels_empty_gen(TRUE)); + + return(DB_SUCCESS); + +commit_mtr_for_a_while: + /* Stores the cursor position and commits &mtr; this is used if + &mtr may contain latches which would break the latching order if + &mtr would not be committed and the latches released. 
*/ + + plan->stored_cursor_rec_processed = TRUE; + + ut_ad(!search_latch_locked); + btr_pcur_store_position(&(plan->pcur), &mtr); + + mtr_commit(&mtr); + + leaf_contains_updates = FALSE; + mtr_has_extra_clust_latch = FALSE; + + ut_ad(sync_thread_levels_empty_gen(TRUE)); + + goto table_loop; + +lock_wait_or_error: + /* See the note at stop_for_a_while: the same holds for this case */ + + ut_ad(!btr_pcur_is_before_first_on_page(&(plan->pcur), &mtr) + || !node->asc); + ut_ad(!search_latch_locked); + + plan->stored_cursor_rec_processed = FALSE; + btr_pcur_store_position(&(plan->pcur), &mtr); + + mtr_commit(&mtr); + + ut_ad(sync_thread_levels_empty_gen(TRUE)); + + return(err); +} + +/************************************************************************** +Performs a select step. This is a high-level function used in SQL execution +graphs. */ + +que_thr_t* +row_sel_step( +/*=========*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + ulint i_lock_mode; + sym_node_t* table_node; + sel_node_t* node; + ulint err; + + ut_ad(thr); + + node = thr->run_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_SELECT); + + /* If this is a new time this node is executed (or when execution + resumes after wait for a table intention lock), set intention locks + on the tables, or assign a read view */ + + if (node->into_list && (thr->prev_node == que_node_get_parent(node))) { + + node->state = SEL_NODE_OPEN; + } + + if (node->state == SEL_NODE_OPEN) { + + /* It may be that the current session has not yet started + its transaction, or it has been committed: */ + + trx_start_if_not_started(thr_get_trx(thr)); + + plan_reset_cursor(sel_node_get_nth_plan(node, 0)); + + if (node->consistent_read) { + /* Assign a read view for the query */ + node->read_view = trx_assign_read_view( + thr_get_trx(thr)); + } else { + if (node->set_x_locks) { + i_lock_mode = LOCK_IX; + } else { + i_lock_mode = LOCK_IS; + } + + table_node = node->table_list; + + while 
(table_node) { + err = lock_table(0, table_node->table, + i_lock_mode, thr); + if (err != DB_SUCCESS) { + + que_thr_handle_error(thr, DB_ERROR, + NULL, 0); + return(NULL); + } + + table_node = que_node_get_next(table_node); + } + } + + /* If this is an explicit cursor, copy stored procedure + variable values, so that the values cannot change between + fetches (currently, we copy them also for non-explicit + cursors) */ + + if (node->explicit_cursor && + UT_LIST_GET_FIRST(node->copy_variables)) { + + row_sel_copy_input_variable_vals(node); + } + + node->state = SEL_NODE_FETCH; + node->fetch_table = 0; + + if (node->is_aggregate) { + /* Reset the aggregate total values */ + sel_reset_aggregate_vals(node); + } + } + + err = row_sel(node, thr); + + /* NOTE! if queries are parallelized, the following assignment may + have problems; the assignment should be made only if thr is the + only top-level thr in the graph: */ + + thr->graph->last_sel_node = node; + + if (err == DB_SUCCESS) { + /* Ok: do nothing */ + + } else if (err == DB_LOCK_WAIT) { + + return(NULL); + } else { + /* SQL error detected */ + printf("SQL error %lu\n", err); + + que_thr_handle_error(thr, DB_ERROR, NULL, 0); + + return(NULL); + } + + return(thr); +} + +/************************************************************************** +Performs a fetch for a cursor. 
*/

/* fetch_step() is entered twice for one fetch. When control arrives from
the parent node, the cursor definition (select node) is temporarily made
a child of this fetch node and scheduled to run. When control comes back
from anywhere else, the into-variables are assigned, unless the select
node is in the SEL_NODE_NO_MORE_ROWS state, and control returns to the
parent. Fetching from a cursor in the SEL_NODE_CLOSED state is reported
as DB_ERROR. */

que_thr_t*
fetch_step(
/*=======*/
				/* out: query thread to run next, or NULL
				if the cursor was closed */
	que_thr_t*	thr)	/* in: query thread */
{
	sel_node_t*	sel_node;
	fetch_node_t*	node;

	ut_ad(thr);

	node = thr->run_node;
	sel_node = node->cursor_def;

	ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);

	if (thr->prev_node != que_node_get_parent(node)) {

		/* We did not come here from the parent: hand the values
		over to the into-variables unless the cursor ran out of
		rows, and go back up */

		if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {

			sel_assign_into_var_values(node->into_list, sel_node);
		}

		thr->run_node = que_node_get_parent(node);

		return(thr);
	}

	/* Make the fetch node the parent of the cursor definition for
	the time of the fetch, so that execution knows to return to this
	fetch node after a row has been selected or we know that there is
	no row left */

	sel_node->common.parent = node;

	if (sel_node->state == SEL_NODE_CLOSED) {
		/* SQL error detected. DB_ERROR is a bare constant, so it
		is cast explicitly: a variadic argument printed with %lu
		must have the type unsigned long */

		printf("SQL error %lu\n", (unsigned long) DB_ERROR);

		que_thr_handle_error(thr, DB_ERROR, NULL, 0);

		return(NULL);
	}

	thr->run_node = sel_node;

	return(thr);
}

/***************************************************************
Prints a row in a select result.
*/ + +que_thr_t* +row_printf_step( +/*============*/ + /* out: query thread to run next or NULL */ + que_thr_t* thr) /* in: query thread */ +{ + row_printf_node_t* node; + sel_node_t* sel_node; + que_node_t* arg; + + ut_ad(thr); + + node = thr->run_node; + + sel_node = node->sel_node; + + ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF); + + if (thr->prev_node == que_node_get_parent(node)) { + + /* Reset the cursor */ + sel_node->state = SEL_NODE_OPEN; + + /* Fetch next row to print */ + + thr->run_node = sel_node; + + return(thr); + } + + if (sel_node->state != SEL_NODE_FETCH) { + + ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS); + + /* No more rows to print */ + + thr->run_node = que_node_get_parent(node); + + return(thr); + } + + arg = sel_node->select_list; + + while (arg) { + dfield_print_also_hex(que_node_get_val(arg)); + + printf(" ::: "); + + arg = que_node_get_next(arg); + } + + printf("\n"); + + /* Fetch next row to print */ + + thr->run_node = sel_node; + + return(thr); +} + +/******************************************************************** +Converts a key value stored in MySQL format to an Innobase dtuple. +The last field of the key value may be just a prefix of a fixed length +field: hence the parameter key_len. */ + +void +row_sel_convert_mysql_key_to_innobase( +/*==================================*/ + dtuple_t* tuple, /* in: tuple where to build; + NOTE: we assume that the type info + in the tuple is already according + to index! 
*/ + byte* buf, /* in: buffer to use in field + conversions */ + dict_index_t* index, /* in: index of the key value */ + byte* key_ptr, /* in: MySQL key value */ + ulint key_len) /* in: MySQL key value length */ +{ + dfield_t* dfield; + ulint offset; + ulint len; + byte* key_end; + ulint n_fields = 0; + + key_end = key_ptr + key_len; + + /* Permit us to access any field in the tuple (ULINT_MAX): */ + + dtuple_set_n_fields(tuple, ULINT_MAX); + + dfield = dtuple_get_nth_field(tuple, 0); + + if (dfield_get_type(dfield)->mtype == DATA_SYS) { + /* A special case: we are looking for a position in a + generated clustered index: the first and the only + ordering column is ROW_ID */ + + ut_a(key_len == DATA_ROW_ID_LEN); + + dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN); + + dtuple_set_n_fields(tuple, 1); + + return; + } + + while (key_ptr < key_end) { + offset = 0; + len = dfield_get_type(dfield)->len; + + n_fields++; + + if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) { + /* The first byte in the field tells if this is + an SQL NULL value */ + + offset = 1; + + if (*key_ptr != 0) { + dfield_set_data(dfield, NULL, UNIV_SQL_NULL); + + goto next_part; + } + } + + row_mysql_store_col_in_innobase_format( + dfield, buf, key_ptr + offset, len, + dfield_get_type(dfield)->mtype, + dfield_get_type(dfield)->prtype + & DATA_UNSIGNED); + next_part: + key_ptr += (offset + len); + + if (key_ptr > key_end) { + /* The last field in key was not a complete + field but a prefix of it */ + + ut_ad(dfield_get_len(dfield) != UNIV_SQL_NULL); + + dfield_set_data(dfield, buf, + len - (ulint)(key_ptr - key_end)); + } + + buf += len; + + dfield++; + } + + /* We set the length of tuple to n_fields: we assume that + the memory area allocated for it is big enough (usually + bigger than n_fields). */ + + dtuple_set_n_fields(tuple, n_fields); +} + +/****************************************************************** +Stores the row id to the prebuilt struct. 
*/ +UNIV_INLINE +void +row_sel_store_row_id_to_prebuilt( +/*=============================*/ + row_prebuilt_t* prebuilt, /* in: prebuilt */ + rec_t* index_rec, /* in: record */ + dict_index_t* index) /* in: index of the record */ +{ + byte* data; + ulint len; + + data = rec_get_nth_field(index_rec, + dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len); + + ut_a(len == DATA_ROW_ID_LEN); + + ut_memcpy(prebuilt->row_id, data, len); +} + +/****************************************************************** +Stores a non-SQL-NULL field in the MySQL format. */ +UNIV_INLINE +void +row_sel_field_store_in_mysql_format( +/*================================*/ + byte* dest, /* in/out: buffer where to store; NOTE that BLOBs + are not in themselves stored here: the caller must + allocate and copy the BLOB into buffer before, and pass + the pointer to the BLOB in 'data' */ + ulint col_len,/* in: MySQL column length */ + byte* data, /* in: data to store */ + ulint len, /* in: length of the data */ + ulint type, /* in: data type */ + ulint is_unsigned)/* in: != 0 if an unsigned integer type */ +{ + byte* ptr; + + ut_ad(len != UNIV_SQL_NULL); + + if (type == DATA_INT) { + /* Convert integer data from Innobase to a little-endian + format, sign bit restored to normal */ + + ptr = dest + len; + + for (;;) { + ptr--; + *ptr = *data; + if (ptr == dest) { + break; + } + data++; + } + + if (!is_unsigned) { + dest[len - 1] = dest[len - 1] ^ 128; + } + + ut_ad(col_len == len); + } else if (type == DATA_VARCHAR || type == DATA_VARMYSQL + || type == DATA_BINARY) { + /* Store the length of the data to the first two bytes of + dest; does not do anything yet because MySQL has + no real vars! */ + + dest = row_mysql_store_var_len(dest, len); + ut_memcpy(dest, data, len); + + /* ut_ad(col_len >= len + 2); No real var implemented in + MySQL yet! 
*/ + + } else if (type == DATA_BLOB) { + /* Store a pointer to the BLOB buffer to dest: the BLOB was + already copied to the buffer in row_sel_store_mysql_rec */ + + row_mysql_store_blob_ref(dest, col_len, data, len); + } else { + ut_memcpy(dest, data, len); + ut_ad(col_len == len); + } +} + +/****************************************************************** +Convert a row in the Innobase format to a row in the MySQL format. +Note that the template in prebuilt may advise us to copy only a few +columns to mysql_rec, other columns are left blank. All columns may not +be needed in the query. */ +static +void +row_sel_store_mysql_rec( +/*====================*/ + byte* mysql_rec, /* out: row in the MySQL format */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct */ + rec_t* rec) /* in: Innobase record in the index + which was described in prebuilt's + template */ +{ + mysql_row_templ_t* templ; + byte* data; + ulint len; + byte* blob_buf; + ulint i; + + ut_ad(prebuilt->mysql_template); + + if (prebuilt->blob_heap != NULL) { + mem_heap_free(prebuilt->blob_heap); + prebuilt->blob_heap = NULL; + } + + /* Mark all columns as not SQL NULL */ + + memset(mysql_rec, '\0', prebuilt->null_bitmap_len); + + for (i = 0; i < prebuilt->n_template; i++) { + + templ = prebuilt->mysql_template + i; + + data = rec_get_nth_field(rec, templ->rec_field_no, &len); + + if (len != UNIV_SQL_NULL) { + if (templ->type == DATA_BLOB) { + + /* Copy the BLOB data to the BLOB + heap of prebuilt */ + + if (prebuilt->blob_heap == NULL) { + prebuilt->blob_heap = + mem_heap_create(len); + } + + blob_buf = mem_heap_alloc(prebuilt->blob_heap, + len); + ut_memcpy(blob_buf, data, len); + + data = blob_buf; + } + + row_sel_field_store_in_mysql_format( + mysql_rec + templ->mysql_col_offset, + templ->mysql_col_len, data, len, + templ->type, templ->is_unsigned); + } else { + mysql_rec[templ->mysql_null_byte_offset] |= + (byte) (templ->mysql_null_bit_mask); + } + } +} + 
+/************************************************************************* +Builds a previous version of a clustered index record for a consistent read */ +static +ulint +row_sel_build_prev_vers_for_mysql( +/*==============================*/ + /* out: DB_SUCCESS or error code */ + read_view_t* read_view, /* in: read view */ + dict_index_t* clust_index, /* in: clustered index */ + row_prebuilt_t* prebuilt, /* in: prebuilt struct */ + rec_t* rec, /* in: record in a clustered index */ + rec_t** old_vers, /* out: old version, or NULL if the + record does not exist in the view: + i.e., it was freshly inserted + afterwards */ + mtr_t* mtr) /* in: mtr */ +{ + ulint err; + + if (prebuilt->old_vers_heap) { + mem_heap_empty(prebuilt->old_vers_heap); + } else { + prebuilt->old_vers_heap = mem_heap_create(200); + } + + err = row_vers_build_for_consistent_read(rec, mtr, clust_index, + read_view, prebuilt->old_vers_heap, + old_vers); + return(err); +} + +/************************************************************************* +Retrieves the clustered index record corresponding to a record in a +non-clustered index. Does the necessary locking. Used in the MySQL +interface. 
*/ +static +ulint +row_sel_get_clust_rec_for_mysql( +/*============================*/ + /* out: DB_SUCCESS or error code */ + row_prebuilt_t* prebuilt,/* in: prebuilt struct in the handle */ + dict_index_t* sec_index,/* in: secondary index where rec resides */ + rec_t* rec, /* in: record in a non-clustered index */ + que_thr_t* thr, /* in: query thread */ + rec_t** out_rec,/* out: clustered record or an old version of + it, NULL if the old version did not exist + in the read view, i.e., it was a fresh + inserted version */ + mtr_t* mtr) /* in: mtr used to get access to the + non-clustered record; the same mtr is used to + access the clustered index */ +{ + dict_index_t* clust_index; + rec_t* clust_rec; + rec_t* old_vers; + ulint err; + trx_t* trx; + + *out_rec = NULL; + + row_build_row_ref_in_tuple(prebuilt->clust_ref, sec_index, rec); + + clust_index = dict_table_get_first_index(sec_index->table); + + btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref, + PAGE_CUR_LE, BTR_SEARCH_LEAF, + prebuilt->clust_pcur, 0, mtr); + + clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur); + + ut_ad(page_rec_is_user_rec(clust_rec)); + + if (prebuilt->select_lock_type != LOCK_NONE) { + /* Try to place a lock on the index record */ + + err = lock_clust_rec_read_check_and_lock(0, clust_rec, + clust_index, + prebuilt->select_lock_type, thr); + if (err != DB_SUCCESS) { + + return(err); + } + } else { + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + trx = thr_get_trx(thr); + + if (!lock_clust_rec_cons_read_sees(clust_rec, clust_index, + trx->read_view)) { + + err = row_sel_build_prev_vers_for_mysql( + trx->read_view, clust_index, + prebuilt, clust_rec, + &old_vers, mtr); + + if (err != DB_SUCCESS) { + + return(err); + } + + clust_rec = old_vers; + } + } + + *out_rec = clust_rec; + + if (prebuilt->select_lock_type == LOCK_X) { + /* We may use the cursor in update: store its position */ + + 
btr_pcur_store_position(prebuilt->clust_pcur, mtr); + } + + return(DB_SUCCESS); +} + +/************************************************************************ +Restores cursor position after it has been stored. We have to take into +account that the record cursor was positioned on can have been deleted. +Then we may have to move the cursor one step up or down. */ +static +ibool +sel_restore_position_for_mysql( +/*===========================*/ + /* out: TRUE if we may need to + process the record the cursor is + now positioned on (i.e. we should + not go to the next record yet) */ + ulint latch_mode, /* in: latch mode wished in + restoration */ + btr_pcur_t* pcur, /* in: cursor whose position + has been stored */ + ibool moves_up, /* in: TRUE if the cursor moves up + in the index */ + mtr_t* mtr) /* in: mtr; CAUTION: may commit + mtr temporarily! */ +{ + ibool success; + ulint relative_position; + + relative_position = pcur->rel_pos; + + success = btr_pcur_restore_position(latch_mode, pcur, mtr); + + if (relative_position == BTR_PCUR_ON) { + if (success) { + return(FALSE); + } + + if (moves_up) { + btr_pcur_move_to_next(pcur, mtr); + + return(TRUE); + } + + return(TRUE); + } + + if (relative_position == BTR_PCUR_AFTER) { + if (moves_up) { + return(TRUE); + } + + if (btr_pcur_is_on_user_rec(pcur, mtr)) { + btr_pcur_move_to_prev(pcur, mtr); + } + + return(TRUE); + } + + ut_ad(relative_position == BTR_PCUR_BEFORE); + + if (moves_up && btr_pcur_is_on_user_rec(pcur, mtr)) { + btr_pcur_move_to_next(pcur, mtr); + } + + return(TRUE); +} + +/************************************************************************ +Pops a cached row for MySQL from the fetch cache. 
*/ +UNIV_INLINE +void +row_sel_pop_cached_row_for_mysql( +/*=============================*/ + byte* buf, /* in/out: buffer where to copy the + row */ + row_prebuilt_t* prebuilt) /* in: prebuilt struct */ +{ + ut_ad(prebuilt->n_fetch_cached > 0); + + ut_memcpy(buf, prebuilt->fetch_cache[prebuilt->fetch_cache_first], + prebuilt->mysql_row_len); + prebuilt->n_fetch_cached--; + prebuilt->fetch_cache_first++; + + if (prebuilt->n_fetch_cached == 0) { + prebuilt->fetch_cache_first = 0; + } +} + +/************************************************************************ +Pushes a row for MySQL to the fetch cache. */ +UNIV_INLINE +void +row_sel_push_cache_row_for_mysql( +/*=============================*/ + row_prebuilt_t* prebuilt, /* in: prebuilt struct */ + rec_t* rec) /* in: record to push */ +{ + ulint i; + + ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE); + + if (prebuilt->fetch_cache[0] == NULL) { + /* Allocate memory for the fetch cache */ + + for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) { + prebuilt->fetch_cache[i] = mem_alloc( + prebuilt->mysql_row_len); + } + } + + ut_ad(prebuilt->fetch_cache_first == 0); + + row_sel_store_mysql_rec( + prebuilt->fetch_cache[prebuilt->n_fetch_cached], + prebuilt, rec); + + prebuilt->n_fetch_cached++; +} + +/************************************************************************ +Searches for rows in the database. This is used in the interface to +MySQL. This function opens a cursor, and also implements fetch next +and fetch prev. NOTE that if we do a search with a full key value +from a unique index (ROW_SEL_EXACT), then we will not store the cursor +position and fetch next or fetch prev must not be tried to the cursor! */ + +ulint +row_search_for_mysql( +/*=================*/ + /* out: DB_SUCCESS, + DB_RECORD_NOT_FOUND, + DB_END_OF_INDEX, or DB_DEADLOCK */ + byte* buf, /* in/out: buffer for the fetched + row in the MySQL format */ + ulint mode, /* in: search mode PAGE_CUR_L, ... 
*/ + row_prebuilt_t* prebuilt, /* in: prebuilt struct for the + table handle; this contains the info + of search_tuple, index; if search + tuple contains 0 fields then we + position the cursor at the start or + the end of the index, depending on + 'mode' */ + ulint match_mode, /* in: 0 or ROW_SEL_EXACT or + ROW_SEL_EXACT_PREFIX */ + ulint direction) /* in: 0 or ROW_SEL_NEXT or + ROW_SEL_PREV; NOTE: if this is != 0, + then prebuilt must have a pcur + with stored position! In opening of a + cursor 'direction' should be 0. */ +{ + dict_index_t* index = prebuilt->index; + dtuple_t* search_tuple = prebuilt->search_tuple; + btr_pcur_t* pcur = prebuilt->pcur; + trx_t* trx = prebuilt->trx; + dict_index_t* clust_index; + que_thr_t* thr; + rec_t* rec; + rec_t* index_rec; + rec_t* clust_rec; + rec_t* old_vers; + ulint err; + ibool moved; + ibool cons_read_requires_clust_rec; + ibool was_lock_wait; + ulint ret; + ibool unique_search_from_clust_index = FALSE; + ibool mtr_has_extra_clust_latch = FALSE; + ibool moves_up = FALSE; + mtr_t mtr; + + ut_ad(index && pcur && search_tuple); + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + ut_ad(sync_thread_levels_empty_gen(FALSE)); + + if (direction == 0) { + prebuilt->n_rows_fetched = 0; + prebuilt->n_fetch_cached = 0; + prebuilt->fetch_cache_first = 0; + + if (prebuilt->sel_graph == NULL) { + /* Build a dummy select query graph */ + row_prebuild_sel_graph(prebuilt); + } + } else { + if (prebuilt->n_rows_fetched == 0) { + prebuilt->fetch_direction = direction; + } + + if (direction != prebuilt->fetch_direction) { + if (prebuilt->n_fetch_cached > 0) { + ut_a(0); + /* TODO: scrollable cursor: restore cursor to + the place of the latest returned row, + or better: prevent caching for a scroll + cursor! 
*/ + } + + prebuilt->n_rows_fetched = 0; + prebuilt->n_fetch_cached = 0; + prebuilt->fetch_cache_first = 0; + + } else if (prebuilt->n_fetch_cached > 0) { + row_sel_pop_cached_row_for_mysql(buf, prebuilt); + + prebuilt->n_rows_fetched++; + + return(DB_SUCCESS); + } + + if (prebuilt->fetch_cache_first > 0 + && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) { + + /* The previous returned row was popped from the fetch + cache, but the cache was not full at the time of the + popping: no more rows can exist in the result set */ + + return(DB_RECORD_NOT_FOUND); + } + + prebuilt->n_rows_fetched++; + + if (prebuilt->n_rows_fetched > 1000000000) { + /* Prevent wrap-over */ + prebuilt->n_rows_fetched = 500000000; + } + + mode = pcur->search_mode; + } + + if (match_mode == ROW_SEL_EXACT && index->type & DICT_UNIQUE + && index->type & DICT_CLUSTERED + && dtuple_get_n_fields(search_tuple) + == dict_index_get_n_unique(index)) { + + if (direction == ROW_SEL_NEXT) { + /* MySQL sometimes seems to do fetch next even + if the search condition is unique; we do not store + pcur position in this case, so we cannot + restore cursor position, and must return + immediately */ + + return(DB_RECORD_NOT_FOUND); + } + + ut_a(direction == 0); /* We cannot do fetch prev, as we have + not stored the cursor position */ + mode = PAGE_CUR_GE; + + unique_search_from_clust_index = TRUE; + } + + /* Note that if the search mode was GE or G, then the cursor + naturally moves upward (in fetch next) in alphabetical order, + otherwise downward */ + + if (direction == 0) { + if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) { + moves_up = TRUE; + } + } else if (direction == ROW_SEL_NEXT) { + moves_up = TRUE; + } + + mtr_start(&mtr); + + thr = que_fork_get_first_thr(prebuilt->sel_graph); + + que_thr_move_to_run_state_for_mysql(thr, trx); + + clust_index = dict_table_get_first_index(index->table); + + if (direction != 0) { + moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur, + moves_up, &mtr); + 
if (!moved) { + goto next_rec; + } + + } else if (dtuple_get_n_fields(search_tuple) > 0) { + + btr_pcur_open_with_no_init(index, search_tuple, mode, + BTR_SEARCH_LEAF, + pcur, 0, &mtr); + } else { + if (mode == PAGE_CUR_G) { + btr_pcur_open_at_index_side(TRUE, index, + BTR_SEARCH_LEAF, pcur, FALSE, &mtr); + } else if (mode == PAGE_CUR_L) { + btr_pcur_open_at_index_side(FALSE, index, + BTR_SEARCH_LEAF, pcur, FALSE, &mtr); + } + } + + if (!prebuilt->sql_stat_start) { + /* No need to set an intention lock or assign a read view */ + + } else if (prebuilt->select_lock_type == LOCK_NONE) { + /* This is a consistent read */ + trx_start_if_not_started(trx); + + /* Assign a read view for the query */ + + trx_assign_read_view(trx); + prebuilt->sql_stat_start = FALSE; + } else { + trx_start_if_not_started(trx); + + if (prebuilt->select_lock_type == LOCK_S) { + err = lock_table(0, index->table, LOCK_IS, thr); + } else { + err = lock_table(0, index->table, LOCK_IX, thr); + } + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + prebuilt->sql_stat_start = FALSE; + } + + /*-------------------------------------------------------------*/ +rec_loop: + cons_read_requires_clust_rec = FALSE; + + rec = btr_pcur_get_rec(pcur); + + if (rec == page_get_infimum_rec(buf_frame_align(rec))) { + + /* The infimum record on a page cannot be in the result set, + and neither can a record lock be placed on it: we skip such + a record. 
*/ + + goto next_rec; + } + + if (prebuilt->select_lock_type != LOCK_NONE) { + /* Try to place a lock on the index record */ + + err = sel_set_rec_lock(rec, index, prebuilt->select_lock_type, + thr); + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } + + if (rec == page_get_supremum_rec(buf_frame_align(rec))) { + + /* A page supremum record cannot be in the result set: skip + it now when we have placed a possible lock on it */ + + goto next_rec; + } + + ut_ad(page_rec_is_user_rec(rec)); + + if (unique_search_from_clust_index && btr_pcur_get_up_match(pcur) + == dtuple_get_n_fields(search_tuple)) { + /* The record matches enough */ + + ut_ad(mode == PAGE_CUR_GE); + + } else if (match_mode == ROW_SEL_EXACT) { + /* Test if the index record matches completely to search_tuple + in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */ + + if (0 != cmp_dtuple_rec(search_tuple, rec)) { + + btr_pcur_store_position(pcur, &mtr); + + ret = DB_RECORD_NOT_FOUND; + + goto normal_return; + } + + } else if (match_mode == ROW_SEL_EXACT_PREFIX) { + + if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec)) { + + btr_pcur_store_position(pcur, &mtr); + + ret = DB_RECORD_NOT_FOUND; + + goto normal_return; + } + } + + /* We are ready to look at a possible new index entry in the result + set: the cursor is now placed on a user record */ + + /* Get the right version of the row in a consistent read */ + + if (prebuilt->select_lock_type == LOCK_NONE) { + + /* This is a non-locking consistent read: if necessary, fetch + a previous version of the record */ + + cons_read_requires_clust_rec = FALSE; + + if (index == clust_index) { + + if (!lock_clust_rec_cons_read_sees(rec, index, + trx->read_view)) { + + err = row_sel_build_prev_vers_for_mysql( + trx->read_view, clust_index, + prebuilt, rec, + &old_vers, &mtr); + + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (old_vers == NULL) { + /* The row did not exist yet in + the read view */ + + goto next_rec; + } + + 
rec = old_vers; + } + } else if (!lock_sec_rec_cons_read_sees(rec, index, + trx->read_view)) { + /* We are looking into a non-clustered index, + and to get the right version of the record we + have to look also into the clustered index: this + is necessary, because we can only get the undo + information via the clustered index record. */ + + cons_read_requires_clust_rec = TRUE; + } + } + + if (rec_get_deleted_flag(rec) && !cons_read_requires_clust_rec) { + + /* The record is delete marked: we can skip it if this is + not a consistent read which might see an earlier version + of a non-clustered index record */ + + goto next_rec; + } + + /* Get the clustered index record if needed and if we did + not do the search using the clustered index */ + + index_rec = rec; + + if (index != clust_index && (cons_read_requires_clust_rec + || prebuilt->need_to_access_clustered)) { + + /* It was a non-clustered index and we must fetch also the + clustered index record */ + + mtr_has_extra_clust_latch = TRUE; + + err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec, + thr, &clust_rec, &mtr); + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + + if (clust_rec == NULL) { + /* The record did not exist in the read view */ + ut_ad(prebuilt->select_lock_type == LOCK_NONE); + + goto next_rec; + } + + if (rec_get_deleted_flag(clust_rec)) { + + /* The record is delete marked: we can skip it */ + + goto next_rec; + } + + rec = clust_rec; + } + + /* We found a qualifying row */ + + if (prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD + && !prebuilt->templ_contains_blob + && prebuilt->select_lock_type == LOCK_NONE + && !prebuilt->clust_index_was_generated) { + + /* Inside an update, for example, we do not cache rows, + since we may use the cursor position to do the actual + update, that is why we require ...lock_type == LOCK_NONE */ + + row_sel_push_cache_row_for_mysql(prebuilt, rec); + + if (prebuilt->n_fetch_cached == MYSQL_FETCH_CACHE_SIZE) { + + goto got_row; + } + + 
goto next_rec; + } else { + row_sel_store_mysql_rec(buf, prebuilt, rec); + + if (prebuilt->clust_index_was_generated) { + row_sel_store_row_id_to_prebuilt(prebuilt, index_rec, + index); + } + } +got_row: + /* TODO: should we in every case store the cursor position, even + if this is just a join, for example? */ + + if (!unique_search_from_clust_index + || prebuilt->select_lock_type == LOCK_X) { + + /* Inside an update always store the cursor position */ + + btr_pcur_store_position(pcur, &mtr); + } + + ret = DB_SUCCESS; + + goto normal_return; + /*-------------------------------------------------------------*/ +next_rec: + if (mtr_has_extra_clust_latch) { + /* We must commit mtr if we are moving to the next + non-clustered index record, because we could break the + latching order if we would access a different clustered + index page right away without releasing the previous. */ + + btr_pcur_store_position(pcur, &mtr); + + mtr_commit(&mtr); + mtr_has_extra_clust_latch = FALSE; + + mtr_start(&mtr); + moved = sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur, + moves_up, &mtr); + if (moved) { + goto rec_loop; + } + } + + if (moves_up) { + moved = btr_pcur_move_to_next(pcur, &mtr); + } else { + moved = btr_pcur_move_to_prev(pcur, &mtr); + } + + if (!moved) { + btr_pcur_store_position(pcur, &mtr); + + if (match_mode != 0) { + ret = DB_RECORD_NOT_FOUND; + } else { + ret = DB_END_OF_INDEX; + } + + goto normal_return; + } + + goto rec_loop; + /*-------------------------------------------------------------*/ +lock_wait_or_error: + btr_pcur_store_position(pcur, &mtr); + + mtr_commit(&mtr); + mtr_has_extra_clust_latch = FALSE; + + trx->error_state = err; + + /* The following is a patch for MySQL */ + + que_thr_stop_for_mysql(thr); + + was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL); + + if (was_lock_wait) { + mtr_start(&mtr); + + sel_restore_position_for_mysql(BTR_SEARCH_LEAF, pcur, + moves_up, &mtr); + mode = pcur->search_mode; + + goto rec_loop; + } + + 
return(err); + +normal_return: + que_thr_stop_for_mysql_no_error(thr, trx); + + mtr_commit(&mtr); + + if (prebuilt->n_fetch_cached > 0) { + row_sel_pop_cached_row_for_mysql(buf, prebuilt); + + ret = DB_SUCCESS; + } + + return(ret); +} |