summaryrefslogtreecommitdiff
path: root/storage/innobase/row/row0merge.c
diff options
context:
space:
mode:
Diffstat (limited to 'storage/innobase/row/row0merge.c')
-rw-r--r--storage/innobase/row/row0merge.c250
1 files changed, 173 insertions, 77 deletions
diff --git a/storage/innobase/row/row0merge.c b/storage/innobase/row/row0merge.c
index 232211e5ce7..d9084bb4ffd 100644
--- a/storage/innobase/row/row0merge.c
+++ b/storage/innobase/row/row0merge.c
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2005, 2010, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -57,6 +57,11 @@ Completed by Sunny Bains and Marko Makela
#include "ut0sort.h"
#include "handler0alter.h"
+/* Ignore posix_fadvise() on those platforms where it does not exist */
+#if defined __WIN__
+# define posix_fadvise(fd, offset, len, advice) /* nothing */
+#endif /* __WIN__ */
+
#ifdef UNIV_DEBUG
/** Set these in order ot enable debug printout. */
/* @{ */
@@ -269,6 +274,7 @@ row_merge_buf_add(
const dict_index_t* index;
dfield_t* entry;
dfield_t* field;
+ const dict_field_t* ifield;
if (buf->n_tuples >= buf->max_tuples) {
return(FALSE);
@@ -287,14 +293,14 @@ row_merge_buf_add(
data_size = 0;
extra_size = UT_BITS_IN_BYTES(index->n_nullable);
- for (i = 0; i < n_fields; i++, field++) {
- const dict_field_t* ifield;
+ ifield = dict_index_get_nth_field(index, 0);
+
+ for (i = 0; i < n_fields; i++, field++, ifield++) {
const dict_col_t* col;
ulint col_no;
const dfield_t* row_field;
ulint len;
- ifield = dict_index_get_nth_field(index, i);
col = ifield->col;
col_no = dict_col_get_no(col);
row_field = dtuple_get_nth_field(row, col_no);
@@ -424,14 +430,13 @@ row_merge_dup_report(
row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */
const dfield_t* entry) /*!< in: duplicate index entry */
{
- mrec_buf_t buf;
+ mrec_buf_t* buf;
const dtuple_t* tuple;
dtuple_t tuple_store;
const rec_t* rec;
const dict_index_t* index = dup->index;
ulint n_fields= dict_index_get_n_fields(index);
- mem_heap_t* heap = NULL;
- ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ mem_heap_t* heap;
ulint* offsets;
ulint n_ext;
@@ -441,22 +446,22 @@ row_merge_dup_report(
return;
}
- rec_offs_init(offsets_);
-
/* Convert the tuple to a record and then to MySQL format. */
+ heap = mem_heap_create((1 + REC_OFFS_HEADER_SIZE + n_fields)
+ * sizeof *offsets
+ + sizeof *buf);
+
+ buf = mem_heap_alloc(heap, sizeof *buf);
tuple = dtuple_from_fields(&tuple_store, entry, n_fields);
n_ext = dict_index_is_clust(index) ? dtuple_get_n_ext(tuple) : 0;
- rec = rec_convert_dtuple_to_rec(buf, index, tuple, n_ext);
- offsets = rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED,
- &heap);
+ rec = rec_convert_dtuple_to_rec(*buf, index, tuple, n_ext);
+ offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
innobase_rec_to_mysql(dup->table, rec, index, offsets);
- if (UNIV_LIKELY_NULL(heap)) {
- mem_heap_free(heap);
- }
+ mem_heap_free(heap);
}
/*************************************************************//**
@@ -627,22 +632,26 @@ row_merge_buf_write(
}
/******************************************************//**
-Create a memory heap and allocate space for row_merge_rec_offsets().
+Create a memory heap and allocate space for row_merge_rec_offsets()
+and mrec_buf_t[3].
@return memory heap */
static
mem_heap_t*
row_merge_heap_create(
/*==================*/
const dict_index_t* index, /*!< in: record descriptor */
+ mrec_buf_t** buf, /*!< out: 3 buffers */
ulint** offsets1, /*!< out: offsets */
ulint** offsets2) /*!< out: offsets */
{
ulint i = 1 + REC_OFFS_HEADER_SIZE
+ dict_index_get_n_fields(index);
- mem_heap_t* heap = mem_heap_create(2 * i * sizeof *offsets1);
+ mem_heap_t* heap = mem_heap_create(2 * i * sizeof **offsets1
+ + 3 * sizeof **buf);
- *offsets1 = mem_heap_alloc(heap, i * sizeof *offsets1);
- *offsets2 = mem_heap_alloc(heap, i * sizeof *offsets2);
+ *buf = mem_heap_alloc(heap, 3 * sizeof **buf);
+ *offsets1 = mem_heap_alloc(heap, i * sizeof **offsets1);
+ *offsets2 = mem_heap_alloc(heap, i * sizeof **offsets2);
(*offsets1)[0] = (*offsets2)[0] = i;
(*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index);
@@ -687,7 +696,9 @@ ibool
row_merge_read(
/*===========*/
int fd, /*!< in: file descriptor */
- ulint offset, /*!< in: offset where to read */
+ ulint offset, /*!< in: offset where to read
+ in number of row_merge_block_t
+ elements */
row_merge_block_t* buf) /*!< out: data */
{
ib_uint64_t ofs = ((ib_uint64_t) offset) * sizeof *buf;
@@ -704,6 +715,11 @@ row_merge_read(
(ulint) (ofs & 0xFFFFFFFF),
(ulint) (ofs >> 32),
sizeof *buf);
+#ifdef POSIX_FADV_DONTNEED
+ /* Each block is read exactly once. Free up the file cache. */
+ posix_fadvise(fd, ofs, sizeof *buf, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
if (UNIV_UNLIKELY(!success)) {
ut_print_timestamp(stderr);
fprintf(stderr,
@@ -714,18 +730,25 @@ row_merge_read(
}
/********************************************************************//**
-Read a merge block from the file system.
+Write a merge block to the file system.
@return TRUE if request was successful, FALSE if fail */
static
ibool
row_merge_write(
/*============*/
int fd, /*!< in: file descriptor */
- ulint offset, /*!< in: offset where to write */
+ ulint offset, /*!< in: offset where to write,
+ in number of row_merge_block_t elements */
const void* buf) /*!< in: data */
{
- ib_uint64_t ofs = ((ib_uint64_t) offset)
- * sizeof(row_merge_block_t);
+ size_t buf_len = sizeof(row_merge_block_t);
+ ib_uint64_t ofs = buf_len * (ib_uint64_t) offset;
+ ibool ret;
+
+ ret = os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
+ (ulint) (ofs & 0xFFFFFFFF),
+ (ulint) (ofs >> 32),
+ buf_len);
#ifdef UNIV_DEBUG
if (row_merge_print_block_write) {
@@ -734,10 +757,13 @@ row_merge_write(
}
#endif /* UNIV_DEBUG */
- return(UNIV_LIKELY(os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
- (ulint) (ofs & 0xFFFFFFFF),
- (ulint) (ofs >> 32),
- sizeof(row_merge_block_t))));
+#ifdef POSIX_FADV_DONTNEED
+ /* The block will be needed on the next merge pass,
+ but it can be evicted from the file cache meanwhile. */
+ posix_fadvise(fd, ofs, buf_len, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
+ return(UNIV_LIKELY(ret));
}
/********************************************************************//**
@@ -1072,11 +1098,14 @@ row_merge_cmp(
record to be compared */
const ulint* offsets1, /*!< in: first record offsets */
const ulint* offsets2, /*!< in: second record offsets */
- const dict_index_t* index) /*!< in: index */
+ const dict_index_t* index, /*!< in: index */
+ ibool* null_eq) /*!< out: set to TRUE if
+ found matching null values */
{
int cmp;
- cmp = cmp_rec_rec_simple(mrec1, mrec2, offsets1, offsets2, index);
+ cmp = cmp_rec_rec_simple(mrec1, mrec2, offsets1, offsets2, index,
+ null_eq);
#ifdef UNIV_DEBUG
if (row_merge_print_cmp) {
@@ -1394,7 +1423,8 @@ row_merge_blocks(
{
mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */
- mrec_buf_t buf[3]; /*!< buffer for handling split mrec in block[] */
+ mrec_buf_t* buf; /*!< buffer for handling
+ split mrec in block[] */
const byte* b0; /*!< pointer to block[0] */
const byte* b1; /*!< pointer to block[1] */
byte* b2; /*!< pointer to block[2] */
@@ -1414,7 +1444,7 @@ row_merge_blocks(
}
#endif /* UNIV_DEBUG */
- heap = row_merge_heap_create(index, &offsets0, &offsets1);
+ heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
/* Write a record and read the next record. Split the output
file in two halves, which can be merged on the following pass. */
@@ -1441,11 +1471,13 @@ corrupt:
}
while (mrec0 && mrec1) {
+ ibool null_eq = FALSE;
switch (row_merge_cmp(mrec0, mrec1,
- offsets0, offsets1, index)) {
+ offsets0, offsets1, index,
+ &null_eq)) {
case 0:
if (UNIV_UNLIKELY
- (dict_index_is_unique(index))) {
+ (dict_index_is_unique(index) && !null_eq)) {
innobase_rec_to_mysql(table, mrec0,
index, offsets0);
mem_heap_free(heap);
@@ -1500,7 +1532,7 @@ row_merge_blocks_copy(
{
mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */
- mrec_buf_t buf[3]; /*!< buffer for handling
+ mrec_buf_t* buf; /*!< buffer for handling
split mrec in block[] */
const byte* b0; /*!< pointer to block[0] */
byte* b2; /*!< pointer to block[2] */
@@ -1518,7 +1550,7 @@ row_merge_blocks_copy(
}
#endif /* UNIV_DEBUG */
- heap = row_merge_heap_create(index, &offsets0, &offsets1);
+ heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
/* Write a record and read the next record. Split the output
file in two halves, which can be merged on the following pass. */
@@ -1589,6 +1621,14 @@ row_merge(
of.offset = 0;
of.n_rec = 0;
+#ifdef POSIX_FADV_SEQUENTIAL
+ /* The input file will be read sequentially, starting from the
+ beginning and the middle. In Linux, the POSIX_FADV_SEQUENTIAL
+ affects the entire file. Each block will be read exactly once. */
+ posix_fadvise(file->fd, 0, 0,
+ POSIX_FADV_SEQUENTIAL | POSIX_FADV_NOREUSE);
+#endif /* POSIX_FADV_SEQUENTIAL */
+
/* Merge blocks to the output file. */
ohalf = 0;
foffs0 = 0;
@@ -1760,7 +1800,6 @@ row_merge_insert_index_tuples(
int fd, /*!< in: file descriptor */
row_merge_block_t* block) /*!< in/out: file buffer */
{
- mrec_buf_t buf;
const byte* b;
que_thr_t* thr;
ins_node_t* node;
@@ -1779,7 +1818,7 @@ row_merge_insert_index_tuples(
trx->op_info = "inserting index entries";
- graph_heap = mem_heap_create(500);
+ graph_heap = mem_heap_create(500 + sizeof(mrec_buf_t));
node = ins_node_create(INS_DIRECT, table, graph_heap);
thr = pars_complete_graph_for_exec(node, trx, graph_heap);
@@ -1801,12 +1840,14 @@ row_merge_insert_index_tuples(
if (!row_merge_read(fd, foffs, block)) {
error = DB_CORRUPTION;
} else {
+ mrec_buf_t* buf = mem_heap_alloc(graph_heap, sizeof *buf);
+
for (;;) {
const mrec_t* mrec;
dtuple_t* dtuple;
ulint n_ext;
- b = row_merge_read_rec(block, &buf, b, index,
+ b = row_merge_read_rec(block, buf, b, index,
fd, &foffs, &mrec, offsets);
if (UNIV_UNLIKELY(!b)) {
/* End of list, or I/O error */
@@ -1977,14 +2018,12 @@ row_merge_drop_index(
/* Drop the field definitions of the index. */
"DELETE FROM SYS_FIELDS WHERE INDEX_ID = :indexid;\n"
/* Drop the index definition and the B-tree. */
- "DELETE FROM SYS_INDEXES WHERE ID = :indexid\n"
- " AND TABLE_ID = :tableid;\n"
+ "DELETE FROM SYS_INDEXES WHERE ID = :indexid;\n"
"END;\n";
ut_ad(index && table && trx);
pars_info_add_dulint_literal(info, "indexid", index->id);
- pars_info_add_dulint_literal(info, "tableid", table->id);
trx_start_if_not_started(trx);
trx->op_info = "dropping index";
@@ -2033,47 +2072,82 @@ row_merge_drop_temp_indexes(void)
/*=============================*/
{
trx_t* trx;
- ulint err;
-
- /* We use the private SQL parser of Innobase to generate the
- query graphs needed in deleting the dictionary data from system
- tables in Innobase. Deleting a row from SYS_INDEXES table also
- frees the file segments of the B-tree associated with the index. */
- static const char drop_temp_indexes[] =
- "PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n"
- "indexid CHAR;\n"
- "DECLARE CURSOR c IS SELECT ID FROM SYS_INDEXES\n"
- "WHERE SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "';\n"
- "BEGIN\n"
- "\tOPEN c;\n"
- "\tWHILE 1=1 LOOP\n"
- "\t\tFETCH c INTO indexid;\n"
- "\t\tIF (SQL % NOTFOUND) THEN\n"
- "\t\t\tEXIT;\n"
- "\t\tEND IF;\n"
- "\t\tDELETE FROM SYS_FIELDS WHERE INDEX_ID = indexid;\n"
- "\t\tDELETE FROM SYS_INDEXES WHERE ID = indexid;\n"
- "\tEND LOOP;\n"
- "\tCLOSE c;\n"
- "\tCOMMIT WORK;\n"
- "END;\n";
+ btr_pcur_t pcur;
+ mtr_t mtr;
+ /* Load the table definitions that contain partially defined
+ indexes, so that the data dictionary information can be checked
+ when accessing the tablename.ibd files. */
trx = trx_allocate_for_background();
trx->op_info = "dropping partially created indexes";
row_mysql_lock_data_dictionary(trx);
- /* Incomplete transactions may be holding some locks on the
- data dictionary tables. However, they should never have been
- able to lock the records corresponding to the partially
- created indexes that we are attempting to delete, because the
- table was locked when the indexes were being created. We will
- drop the partially created indexes before the rollback of
- incomplete transactions is initiated. Thus, this should not
- interfere with the incomplete transactions. */
- trx->isolation_level = TRX_ISO_READ_UNCOMMITTED;
- err = que_eval_sql(NULL, drop_temp_indexes, FALSE, trx);
- ut_a(err == DB_SUCCESS);
+ mtr_start(&mtr);
+
+ btr_pcur_open_at_index_side(
+ TRUE,
+ dict_table_get_first_index(dict_sys->sys_indexes),
+ BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+
+ for (;;) {
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ dulint table_id;
+ dict_table_t* table;
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ break;
+ }
+
+ rec = btr_pcur_get_rec(&pcur);
+ field = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_NAME_FIELD,
+ &len);
+ if (len == UNIV_SQL_NULL || len == 0
+ || (char) *field != TEMP_INDEX_PREFIX) {
+ continue;
+ }
+
+ /* This is a temporary index. */
+
+ field = rec_get_nth_field_old(rec, 0/*TABLE_ID*/, &len);
+ if (len != 8) {
+ /* Corrupted TABLE_ID */
+ continue;
+ }
+
+ table_id = mach_read_from_8(field);
+ btr_pcur_store_position(&pcur, &mtr);
+ btr_pcur_commit_specify_mtr(&pcur, &mtr);
+
+ table = dict_table_get_on_id_low(table_id);
+
+ if (table) {
+ dict_index_t* index;
+ dict_index_t* next_index;
+
+ for (index = dict_table_get_first_index(table);
+ index; index = next_index) {
+
+ next_index = dict_table_get_next_index(index);
+
+ if (*index->name == TEMP_INDEX_PREFIX) {
+ row_merge_drop_index(index, table, trx);
+ trx_commit_for_mysql(trx);
+ }
+ }
+ }
+
+ mtr_start(&mtr);
+ btr_pcur_restore_position(BTR_SEARCH_LEAF,
+ &pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
row_mysql_unlock_data_dictionary(trx);
trx_free_for_background(trx);
}
@@ -2086,9 +2160,22 @@ row_merge_file_create(
/*==================*/
merge_file_t* merge_file) /*!< out: merge file structure */
{
+#ifdef UNIV_PFS_IO
+ /* This temp file open does not go through normal
+ file APIs, add instrumentation to register with
+ performance schema */
+ struct PSI_file_locker* locker = NULL;
+ register_pfs_file_open_begin(locker, innodb_file_temp_key,
+ PSI_FILE_OPEN,
+ "Innodb Merge Temp File",
+ __FILE__, __LINE__);
+#endif
merge_file->fd = innobase_mysql_tmpfile();
merge_file->offset = 0;
merge_file->n_rec = 0;
+#ifdef UNIV_PFS_IO
+ register_pfs_file_open_end(locker, merge_file->fd);
+#endif
}
/*********************************************************************//**
@@ -2099,10 +2186,19 @@ row_merge_file_destroy(
/*===================*/
merge_file_t* merge_file) /*!< out: merge file structure */
{
+#ifdef UNIV_PFS_IO
+ struct PSI_file_locker* locker = NULL;
+ register_pfs_file_io_begin(locker, merge_file->fd, 0, PSI_FILE_CLOSE,
+ __FILE__, __LINE__);
+#endif
if (merge_file->fd != -1) {
close(merge_file->fd);
merge_file->fd = -1;
}
+
+#ifdef UNIV_PFS_IO
+ register_pfs_file_io_end(locker, 0);
+#endif
}
/*********************************************************************//**