Diffstat (limited to 'storage/innobase/row/row0merge.c')
-rw-r--r-- | storage/innobase/row/row0merge.c | 250
1 file changed, 173 insertions, 77 deletions
diff --git a/storage/innobase/row/row0merge.c b/storage/innobase/row/row0merge.c
index 232211e5ce7..d9084bb4ffd 100644
--- a/storage/innobase/row/row0merge.c
+++ b/storage/innobase/row/row0merge.c
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2005, 2010, Innobase Oy. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -57,6 +57,11 @@ Completed by Sunny Bains and Marko Makela
 #include "ut0sort.h"
 #include "handler0alter.h"
 
+/* Ignore posix_fadvise() on those platforms where it does not exist */
+#if defined __WIN__
+# define posix_fadvise(fd, offset, len, advice) /* nothing */
+#endif /* __WIN__ */
+
 #ifdef UNIV_DEBUG
 /** Set these in order to enable debug printout. */
 /* @{ */
@@ -269,6 +274,7 @@ row_merge_buf_add(
 	const dict_index_t*	index;
 	dfield_t*		entry;
 	dfield_t*		field;
+	const dict_field_t*	ifield;
 
 	if (buf->n_tuples >= buf->max_tuples) {
 		return(FALSE);
@@ -287,14 +293,14 @@ row_merge_buf_add(
 	data_size = 0;
 	extra_size = UT_BITS_IN_BYTES(index->n_nullable);
 
-	for (i = 0; i < n_fields; i++, field++) {
-		const dict_field_t*	ifield;
+	ifield = dict_index_get_nth_field(index, 0);
+
+	for (i = 0; i < n_fields; i++, field++, ifield++) {
 		const dict_col_t*	col;
 		ulint			col_no;
 		const dfield_t*		row_field;
 		ulint			len;
 
-		ifield = dict_index_get_nth_field(index, i);
 		col = ifield->col;
 		col_no = dict_col_get_no(col);
 		row_field = dtuple_get_nth_field(row, col_no);
@@ -424,14 +430,13 @@ row_merge_dup_report(
 	row_merge_dup_t*	dup,	/*!< in/out: for reporting duplicates */
 	const dfield_t*		entry)	/*!< in: duplicate index entry */
 {
-	mrec_buf_t		buf;
+	mrec_buf_t*		buf;
 	const dtuple_t*		tuple;
 	dtuple_t		tuple_store;
 	const rec_t*		rec;
 	const dict_index_t*	index	= dup->index;
 	ulint			n_fields= dict_index_get_n_fields(index);
-	mem_heap_t*		heap	= NULL;
-	ulint			offsets_[REC_OFFS_NORMAL_SIZE];
+	mem_heap_t*		heap;
 	ulint*			offsets;
 	ulint			n_ext;
 
@@ -441,22 +446,22 @@ row_merge_dup_report(
 		return;
 	}
 
-	rec_offs_init(offsets_);
-
 	/* Convert the tuple to a record and then to MySQL format. */
+	heap = mem_heap_create((1 + REC_OFFS_HEADER_SIZE + n_fields)
+			       * sizeof *offsets
+			       + sizeof *buf);
+
+	buf = mem_heap_alloc(heap, sizeof *buf);
 
 	tuple = dtuple_from_fields(&tuple_store, entry, n_fields);
 	n_ext = dict_index_is_clust(index) ? dtuple_get_n_ext(tuple) : 0;
 
-	rec = rec_convert_dtuple_to_rec(buf, index, tuple, n_ext);
-	offsets = rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED,
-				  &heap);
+	rec = rec_convert_dtuple_to_rec(*buf, index, tuple, n_ext);
+	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
 
 	innobase_rec_to_mysql(dup->table, rec, index, offsets);
 
-	if (UNIV_LIKELY_NULL(heap)) {
-		mem_heap_free(heap);
-	}
+	mem_heap_free(heap);
 }
 
 /*************************************************************//**
@@ -627,22 +632,26 @@ row_merge_buf_write(
 }
 
 /******************************************************//**
-Create a memory heap and allocate space for row_merge_rec_offsets().
+Create a memory heap and allocate space for row_merge_rec_offsets()
+and mrec_buf_t[3].
 @return	memory heap */
 static
 mem_heap_t*
 row_merge_heap_create(
 /*==================*/
 	const dict_index_t*	index,		/*!< in: record descriptor */
+	mrec_buf_t**		buf,		/*!< out: 3 buffers */
 	ulint**			offsets1,	/*!< out: offsets */
 	ulint**			offsets2)	/*!< out: offsets */
 {
 	ulint	i	= 1 + REC_OFFS_HEADER_SIZE
 		+ dict_index_get_n_fields(index);
-	mem_heap_t*	heap	= mem_heap_create(2 * i * sizeof *offsets1);
+	mem_heap_t*	heap	= mem_heap_create(2 * i * sizeof **offsets1
+						  + 3 * sizeof **buf);
 
-	*offsets1 = mem_heap_alloc(heap, i * sizeof *offsets1);
-	*offsets2 = mem_heap_alloc(heap, i * sizeof *offsets2);
+	*buf = mem_heap_alloc(heap, 3 * sizeof **buf);
+	*offsets1 = mem_heap_alloc(heap, i * sizeof **offsets1);
+	*offsets2 = mem_heap_alloc(heap, i * sizeof **offsets2);
 
 	(*offsets1)[0] = (*offsets2)[0] = i;
 	(*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index);
@@ -687,7 +696,9 @@ ibool
 row_merge_read(
 /*===========*/
 	int			fd,	/*!< in: file descriptor */
-	ulint			offset,	/*!< in: offset where to read */
+	ulint			offset,	/*!< in: offset where to read
+					in number of row_merge_block_t
+					elements */
 	row_merge_block_t*	buf)	/*!< out: data */
 {
 	ib_uint64_t	ofs = ((ib_uint64_t) offset) * sizeof *buf;
@@ -704,6 +715,11 @@ row_merge_read(
 				 (ulint) (ofs & 0xFFFFFFFF),
 				 (ulint) (ofs >> 32),
 				 sizeof *buf);
+#ifdef POSIX_FADV_DONTNEED
+	/* Each block is read exactly once.  Free up the file cache. */
+	posix_fadvise(fd, ofs, sizeof *buf, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
 	if (UNIV_UNLIKELY(!success)) {
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
@@ -714,18 +730,25 @@ row_merge_read(
 }
 
 /********************************************************************//**
-Read a merge block from the file system.
+Write a merge block to the file system.
 @return	TRUE if request was successful, FALSE if fail */
 static
 ibool
 row_merge_write(
 /*============*/
 	int		fd,	/*!< in: file descriptor */
-	ulint		offset,	/*!< in: offset where to write */
+	ulint		offset,	/*!< in: offset where to write,
+				in number of row_merge_block_t elements */
 	const void*	buf)	/*!< in: data */
 {
-	ib_uint64_t	ofs = ((ib_uint64_t) offset)
-		* sizeof(row_merge_block_t);
+	size_t		buf_len = sizeof(row_merge_block_t);
+	ib_uint64_t	ofs = buf_len * (ib_uint64_t) offset;
+	ibool		ret;
+
+	ret = os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
+			    (ulint) (ofs & 0xFFFFFFFF),
+			    (ulint) (ofs >> 32),
+			    buf_len);
 
 #ifdef UNIV_DEBUG
 	if (row_merge_print_block_write) {
@@ -734,10 +757,13 @@ row_merge_write(
 	}
 #endif /* UNIV_DEBUG */
 
-	return(UNIV_LIKELY(os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
-					 (ulint) (ofs & 0xFFFFFFFF),
-					 (ulint) (ofs >> 32),
-					 sizeof(row_merge_block_t))));
+#ifdef POSIX_FADV_DONTNEED
+	/* The block will be needed on the next merge pass,
+	but it can be evicted from the file cache meanwhile. */
+	posix_fadvise(fd, ofs, buf_len, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
+	return(UNIV_LIKELY(ret));
 }
 
 /********************************************************************//**
@@ -1072,11 +1098,14 @@ row_merge_cmp(
 					record to be compared */
 	const ulint*		offsets1,	/*!< in: first record offsets */
 	const ulint*		offsets2,	/*!< in: second record offsets */
-	const dict_index_t*	index)		/*!< in: index */
+	const dict_index_t*	index,		/*!< in: index */
+	ibool*			null_eq)	/*!< out: set to TRUE if
+						found matching null values */
 {
 	int	cmp;
 
-	cmp = cmp_rec_rec_simple(mrec1, mrec2, offsets1, offsets2, index);
+	cmp = cmp_rec_rec_simple(mrec1, mrec2, offsets1, offsets2, index,
+				 null_eq);
 
 #ifdef UNIV_DEBUG
 	if (row_merge_print_cmp) {
@@ -1394,7 +1423,8 @@ row_merge_blocks(
 {
 	mem_heap_t*	heap;	/*!< memory heap for offsets0, offsets1 */
 
-	mrec_buf_t	buf[3];	/*!< buffer for handling split mrec in block[] */
+	mrec_buf_t*	buf;	/*!< buffer for handling
+				split mrec in block[] */
 	const byte*	b0;	/*!< pointer to block[0] */
 	const byte*	b1;	/*!< pointer to block[1] */
 	byte*		b2;	/*!< pointer to block[2] */
@@ -1414,7 +1444,7 @@ row_merge_blocks(
 	}
 #endif /* UNIV_DEBUG */
 
-	heap = row_merge_heap_create(index, &offsets0, &offsets1);
+	heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
 
 	/* Write a record and read the next record.  Split the output
 	file in two halves, which can be merged on the following pass. */
@@ -1441,11 +1471,13 @@ corrupt:
 	}
 
 	while (mrec0 && mrec1) {
+		ibool	null_eq = FALSE;
 		switch (row_merge_cmp(mrec0, mrec1,
-				      offsets0, offsets1, index)) {
+				      offsets0, offsets1, index,
+				      &null_eq)) {
 		case 0:
 			if (UNIV_UNLIKELY
-			    (dict_index_is_unique(index))) {
+			    (dict_index_is_unique(index) && !null_eq)) {
 				innobase_rec_to_mysql(table, mrec0,
 						      index, offsets0);
 				mem_heap_free(heap);
@@ -1500,7 +1532,7 @@ row_merge_blocks_copy(
 {
 	mem_heap_t*	heap;	/*!< memory heap for offsets0, offsets1 */
 
-	mrec_buf_t	buf[3];	/*!< buffer for handling
+	mrec_buf_t*	buf;	/*!< buffer for handling
 				split mrec in block[] */
 	const byte*	b0;	/*!< pointer to block[0] */
 	byte*		b2;	/*!< pointer to block[2] */
@@ -1518,7 +1550,7 @@ row_merge_blocks_copy(
 	}
 #endif /* UNIV_DEBUG */
 
-	heap = row_merge_heap_create(index, &offsets0, &offsets1);
+	heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
 
 	/* Write a record and read the next record.  Split the output
 	file in two halves, which can be merged on the following pass. */
@@ -1589,6 +1621,14 @@ row_merge(
 	of.offset = 0;
 	of.n_rec = 0;
 
+#ifdef POSIX_FADV_SEQUENTIAL
+	/* The input file will be read sequentially, starting from the
+	beginning and the middle.  In Linux, the POSIX_FADV_SEQUENTIAL
+	affects the entire file.  Each block will be read exactly once. */
+	posix_fadvise(file->fd, 0, 0,
+		      POSIX_FADV_SEQUENTIAL | POSIX_FADV_NOREUSE);
+#endif /* POSIX_FADV_SEQUENTIAL */
+
 	/* Merge blocks to the output file. */
 	ohalf = 0;
 	foffs0 = 0;
@@ -1760,7 +1800,6 @@ row_merge_insert_index_tuples(
 	int			fd,	/*!< in: file descriptor */
 	row_merge_block_t*	block)	/*!< in/out: file buffer */
 {
-	mrec_buf_t	buf;
 	const byte*	b;
 	que_thr_t*	thr;
 	ins_node_t*	node;
@@ -1779,7 +1818,7 @@ row_merge_insert_index_tuples(
 
 	trx->op_info = "inserting index entries";
 
-	graph_heap = mem_heap_create(500);
+	graph_heap = mem_heap_create(500 + sizeof(mrec_buf_t));
 	node = ins_node_create(INS_DIRECT, table, graph_heap);
 
 	thr = pars_complete_graph_for_exec(node, trx, graph_heap);
@@ -1801,12 +1840,14 @@ row_merge_insert_index_tuples(
 	if (!row_merge_read(fd, foffs, block)) {
 		error = DB_CORRUPTION;
 	} else {
+		mrec_buf_t*	buf = mem_heap_alloc(graph_heap, sizeof *buf);
+
 		for (;;) {
 			const mrec_t*	mrec;
 			dtuple_t*	dtuple;
 			ulint		n_ext;
 
-			b = row_merge_read_rec(block, &buf, b, index,
+			b = row_merge_read_rec(block, buf, b, index,
 					       fd, &foffs, &mrec, offsets);
 			if (UNIV_UNLIKELY(!b)) {
 				/* End of list, or I/O error */
@@ -1977,14 +2018,12 @@ row_merge_drop_index(
 		/* Drop the field definitions of the index. */
 		"DELETE FROM SYS_FIELDS WHERE INDEX_ID = :indexid;\n"
 		/* Drop the index definition and the B-tree. */
-		"DELETE FROM SYS_INDEXES WHERE ID = :indexid\n"
-		" AND TABLE_ID = :tableid;\n"
+		"DELETE FROM SYS_INDEXES WHERE ID = :indexid;\n"
 		"END;\n";
 
 	ut_ad(index && table && trx);
 
 	pars_info_add_dulint_literal(info, "indexid", index->id);
-	pars_info_add_dulint_literal(info, "tableid", table->id);
 
 	trx_start_if_not_started(trx);
 	trx->op_info = "dropping index";
@@ -2033,47 +2072,82 @@ row_merge_drop_temp_indexes(void)
 /*=============================*/
 {
 	trx_t*		trx;
-	ulint		err;
-
-	/* We use the private SQL parser of Innobase to generate the
-	query graphs needed in deleting the dictionary data from system
-	tables in Innobase.  Deleting a row from SYS_INDEXES table also
-	frees the file segments of the B-tree associated with the index. */
-	static const char drop_temp_indexes[] =
-		"PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n"
-		"indexid CHAR;\n"
-		"DECLARE CURSOR c IS SELECT ID FROM SYS_INDEXES\n"
-		"WHERE SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "';\n"
-		"BEGIN\n"
-		"\tOPEN c;\n"
-		"\tWHILE 1=1 LOOP\n"
-		"\t\tFETCH c INTO indexid;\n"
-		"\t\tIF (SQL % NOTFOUND) THEN\n"
-		"\t\t\tEXIT;\n"
-		"\t\tEND IF;\n"
-		"\t\tDELETE FROM SYS_FIELDS WHERE INDEX_ID = indexid;\n"
-		"\t\tDELETE FROM SYS_INDEXES WHERE ID = indexid;\n"
-		"\tEND LOOP;\n"
-		"\tCLOSE c;\n"
-		"\tCOMMIT WORK;\n"
-		"END;\n";
+	btr_pcur_t	pcur;
+	mtr_t		mtr;
 
+	/* Load the table definitions that contain partially defined
+	indexes, so that the data dictionary information can be checked
+	when accessing the tablename.ibd files. */
 	trx = trx_allocate_for_background();
 	trx->op_info = "dropping partially created indexes";
 	row_mysql_lock_data_dictionary(trx);
 
-	/* Incomplete transactions may be holding some locks on the
-	data dictionary tables.  However, they should never have been
-	able to lock the records corresponding to the partially
-	created indexes that we are attempting to delete, because the
-	table was locked when the indexes were being created.  We will
-	drop the partially created indexes before the rollback of
-	incomplete transactions is initiated.  Thus, this should not
-	interfere with the incomplete transactions. */
-	trx->isolation_level = TRX_ISO_READ_UNCOMMITTED;
-	err = que_eval_sql(NULL, drop_temp_indexes, FALSE, trx);
-	ut_a(err == DB_SUCCESS);
+	mtr_start(&mtr);
+
+	btr_pcur_open_at_index_side(
+		TRUE,
+		dict_table_get_first_index(dict_sys->sys_indexes),
+		BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+
+	for (;;) {
+		const rec_t*	rec;
+		const byte*	field;
+		ulint		len;
+		dulint		table_id;
+		dict_table_t*	table;
+
+		btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+		if (!btr_pcur_is_on_user_rec(&pcur)) {
+			break;
+		}
+
+		rec = btr_pcur_get_rec(&pcur);
+		field = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_NAME_FIELD,
+					      &len);
+		if (len == UNIV_SQL_NULL || len == 0
+		    || (char) *field != TEMP_INDEX_PREFIX) {
+			continue;
+		}
+
+		/* This is a temporary index. */
+
+		field = rec_get_nth_field_old(rec, 0/*TABLE_ID*/, &len);
+		if (len != 8) {
+			/* Corrupted TABLE_ID */
+			continue;
+		}
+
+		table_id = mach_read_from_8(field);
+
+		btr_pcur_store_position(&pcur, &mtr);
+		btr_pcur_commit_specify_mtr(&pcur, &mtr);
+
+		table = dict_table_get_on_id_low(table_id);
+
+		if (table) {
+			dict_index_t*	index;
+			dict_index_t*	next_index;
+
+			for (index = dict_table_get_first_index(table);
+			     index; index = next_index) {
+
+				next_index = dict_table_get_next_index(index);
+
+				if (*index->name == TEMP_INDEX_PREFIX) {
+					row_merge_drop_index(index, table, trx);
+					trx_commit_for_mysql(trx);
+				}
+			}
+		}
+
+		mtr_start(&mtr);
+		btr_pcur_restore_position(BTR_SEARCH_LEAF,
+					  &pcur, &mtr);
+	}
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
 	row_mysql_unlock_data_dictionary(trx);
 	trx_free_for_background(trx);
 }
@@ -2086,9 +2160,22 @@ row_merge_file_create(
 /*==================*/
 	merge_file_t*	merge_file)	/*!< out: merge file structure */
 {
+#ifdef UNIV_PFS_IO
+	/* This temp file open does not go through normal
+	file APIs, add instrumentation to register with
+	performance schema */
+	struct PSI_file_locker*	locker = NULL;
+	register_pfs_file_open_begin(locker, innodb_file_temp_key,
+				     PSI_FILE_OPEN,
+				     "Innodb Merge Temp File",
+				     __FILE__, __LINE__);
+#endif
	merge_file->fd = innobase_mysql_tmpfile();
 	merge_file->offset = 0;
 	merge_file->n_rec = 0;
+#ifdef UNIV_PFS_IO
+	register_pfs_file_open_end(locker, merge_file->fd);
+#endif
 }
 
 /*********************************************************************//**
@@ -2099,10 +2186,19 @@ row_merge_file_destroy(
 /*===================*/
 	merge_file_t*	merge_file)	/*!< out: merge file structure */
 {
+#ifdef UNIV_PFS_IO
+	struct PSI_file_locker*	locker = NULL;
+	register_pfs_file_io_begin(locker, merge_file->fd, 0, PSI_FILE_CLOSE,
+				   __FILE__, __LINE__);
+#endif
 	if (merge_file->fd != -1) {
 		close(merge_file->fd);
 		merge_file->fd = -1;
 	}
+
+#ifdef UNIV_PFS_IO
+	register_pfs_file_io_end(locker, 0);
+#endif
 }
 
 /*********************************************************************//**
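
The row_merge_buf_add() hunks replace a per-iteration call to dict_index_get_nth_field() with one lookup before the loop and pointer stepping inside it. This is only valid because an index's field descriptors live in one contiguous array. A minimal standalone sketch of the pattern; the struct and accessor are hypothetical stand-ins, not InnoDB code:

#include <stdio.h>

struct field_def { int col_no; };

/* Stand-in for dict_index_get_nth_field(). */
static const struct field_def*
get_nth_field(const struct field_def* fields, int i)
{
	return(&fields[i]);
}

int
main(void)
{
	struct field_def	fields[3] = {{10}, {20}, {30}};
	const struct field_def*	f = get_nth_field(fields, 0);
	int			i;

	/* One accessor call before the loop, then pointer stepping:
	safe only because the descriptors are stored contiguously,
	which is what the InnoDB change relies on. */
	for (i = 0; i < 3; i++, f++) {
		printf("col %d\n", f->col_no);
	}
	return(0);
}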
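Several hunks move large mrec_buf_t buffers off the stack: row_merge_dup_report(), row_merge_blocks(), and row_merge_blocks_copy() previously kept up to three of these page-sized arrays in automatic storage, a real stack-overflow risk for server threads. The patch instead carves them out of the memory heap that is already created for the offset arrays. A sketch of the same single-arena idea in portable C, with malloc() standing in for mem_heap_create()/mem_heap_alloc() and an illustrative buffer size:

#include <stdlib.h>

typedef unsigned char mrec_buf_t[16384];	/* illustrative size */

struct merge_heap {
	unsigned long*	offsets1;
	unsigned long*	offsets2;
	mrec_buf_t*	buf;		/* three record buffers */
};

/* One allocation backs all the objects, so a single free() releases
them together, just as one mem_heap_free(heap) does in the patch. */
static void*
merge_heap_create(struct merge_heap* h, unsigned long n_offs)
{
	char*	arena = malloc(2 * n_offs * sizeof *h->offsets1
			       + 3 * sizeof *h->buf);

	if (arena == NULL) {
		return(NULL);
	}

	h->offsets1 = (unsigned long*) arena;
	h->offsets2 = h->offsets1 + n_offs;
	h->buf = (mrec_buf_t*) (h->offsets2 + n_offs);
	return(arena);
}

int
main(void)
{
	struct merge_heap	h;
	void*			arena = merge_heap_create(&h, 100);

	if (arena != NULL) {
		h.offsets1[0] = 100;	/* use the carved-out pieces... */
		free(arena);		/* ...and release them together */
	}
	return(0);
}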
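The posix_fadvise() calls added above all serve one idea: merge blocks are streamed sequentially and each block is touched once per pass, so their pages should not be allowed to crowd more valuable data out of the OS file cache. The stub #define compiles the calls away on Windows, and the #ifdef POSIX_FADV_* guards cover POSIX systems that lack the hints. A self-contained POSIX sketch of the write side; block_write() and BLOCK_SIZE are hypothetical, and InnoDB actually goes through os_file_write() rather than pwrite():

#define _XOPEN_SOURCE 600

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

#define BLOCK_SIZE	1048576	/* stand-in for sizeof(row_merge_block_t) */

static int
block_write(int fd, unsigned long block_no, const void* buf)
{
	/* Offsets are expressed in whole blocks; widening to 64 bits
	before multiplying keeps files beyond 4 GiB addressable, the
	same reason the patch computes ofs as an ib_uint64_t. */
	uint64_t	ofs = (uint64_t) block_no * BLOCK_SIZE;

	if (pwrite(fd, buf, BLOCK_SIZE, (off_t) ofs) != BLOCK_SIZE) {
		return(-1);
	}

#ifdef POSIX_FADV_DONTNEED
	/* The block is only re-read on the next merge pass; let the
	kernel evict these pages instead of caching them. */
	posix_fadvise(fd, (off_t) ofs, BLOCK_SIZE, POSIX_FADV_DONTNEED);
#endif
	return(0);
}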
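The null_eq flag threaded through row_merge_cmp() addresses a subtlety of SQL UNIQUE semantics: NULL keys must collate together so the merge sort stays ordered, yet two NULLs are not duplicates of each other. Without the flag, a unique index build could raise a spurious duplicate-key error on rows whose unique columns are all NULL. A toy comparator illustrating the contract; the field struct and main() are illustrative, not cmp_rec_rec_simple():

#include <stdbool.h>
#include <stdio.h>

struct field { bool is_null; int value; };

static int
cmp_fields(const struct field* a, const struct field* b, bool* null_eq)
{
	if (a->is_null || b->is_null) {
		if (a->is_null && b->is_null) {
			/* The keys collate as equal, but only because
			both are NULL; flag it so the caller does not
			treat the match as a duplicate. */
			*null_eq = true;
			return(0);
		}
		return(a->is_null ? -1 : 1);	/* NULL collates first */
	}
	return((a->value > b->value) - (a->value < b->value));
}

int
main(void)
{
	struct field	f0 = { true, 0 };
	struct field	f1 = { true, 0 };
	bool		null_eq = false;
	bool		index_is_unique = true;

	/* Mirrors the case 0 branch in row_merge_blocks(): equal keys
	are a duplicate-key error only if no NULLs were involved. */
	if (cmp_fields(&f0, &f1, &null_eq) == 0
	    && index_is_unique && !null_eq) {
		puts("duplicate key");
	} else {
		puts("not a duplicate");
	}
	return(0);
}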