Diffstat (limited to 'storage/innobase/row/row0merge.c')
-rw-r--r-- | storage/innobase/row/row0merge.c | 250
1 file changed, 173 insertions, 77 deletions
diff --git a/storage/innobase/row/row0merge.c b/storage/innobase/row/row0merge.c
index 232211e5ce7..d9084bb4ffd 100644
--- a/storage/innobase/row/row0merge.c
+++ b/storage/innobase/row/row0merge.c
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2005, 2010, Innobase Oy. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -57,6 +57,11 @@ Completed by Sunny Bains and Marko Makela
 #include "ut0sort.h"
 #include "handler0alter.h"
 
+/* Ignore posix_fadvise() on those platforms where it does not exist */
+#if defined __WIN__
+# define posix_fadvise(fd, offset, len, advice) /* nothing */
+#endif /* __WIN__ */
+
 #ifdef UNIV_DEBUG
 /** Set these in order to enable debug printout. */
 /* @{ */
@@ -269,6 +274,7 @@ row_merge_buf_add(
 	const dict_index_t*	index;
 	dfield_t*		entry;
 	dfield_t*		field;
+	const dict_field_t*	ifield;
 
 	if (buf->n_tuples >= buf->max_tuples) {
 		return(FALSE);
@@ -287,14 +293,14 @@ row_merge_buf_add(
 	data_size = 0;
 	extra_size = UT_BITS_IN_BYTES(index->n_nullable);
 
-	for (i = 0; i < n_fields; i++, field++) {
-		const dict_field_t*	ifield;
+	ifield = dict_index_get_nth_field(index, 0);
+
+	for (i = 0; i < n_fields; i++, field++, ifield++) {
 		const dict_col_t*	col;
 		ulint			col_no;
 		const dfield_t*		row_field;
 		ulint			len;
 
-		ifield = dict_index_get_nth_field(index, i);
 		col = ifield->col;
 		col_no = dict_col_get_no(col);
 		row_field = dtuple_get_nth_field(row, col_no);
@@ -424,14 +430,13 @@ row_merge_dup_report(
 	row_merge_dup_t*	dup,	/*!< in/out: for reporting duplicates */
 	const dfield_t*		entry)	/*!< in: duplicate index entry */
 {
-	mrec_buf_t		buf;
+	mrec_buf_t*		buf;
 	const dtuple_t*		tuple;
 	dtuple_t		tuple_store;
 	const rec_t*		rec;
 	const dict_index_t*	index	= dup->index;
 	ulint			n_fields= dict_index_get_n_fields(index);
-	mem_heap_t*		heap	= NULL;
-	ulint			offsets_[REC_OFFS_NORMAL_SIZE];
+	mem_heap_t*		heap;
 	ulint*			offsets;
 	ulint			n_ext;
 
@@ -441,22 +446,22 @@ row_merge_dup_report(
 		return;
 	}
 
-	rec_offs_init(offsets_);
-
 	/* Convert the tuple to a record and then to MySQL format. */
+	heap = mem_heap_create((1 + REC_OFFS_HEADER_SIZE + n_fields)
+			       * sizeof *offsets
+			       + sizeof *buf);
+
+	buf = mem_heap_alloc(heap, sizeof *buf);
 
 	tuple = dtuple_from_fields(&tuple_store, entry, n_fields);
 	n_ext = dict_index_is_clust(index) ? dtuple_get_n_ext(tuple) : 0;
 
-	rec = rec_convert_dtuple_to_rec(buf, index, tuple, n_ext);
-	offsets = rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED,
-				  &heap);
+	rec = rec_convert_dtuple_to_rec(*buf, index, tuple, n_ext);
+	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
 
 	innobase_rec_to_mysql(dup->table, rec, index, offsets);
 
-	if (UNIV_LIKELY_NULL(heap)) {
-		mem_heap_free(heap);
-	}
+	mem_heap_free(heap);
 }
 
 /*************************************************************//**
@@ -627,22 +632,26 @@ row_merge_buf_write(
 }
 
 /******************************************************//**
-Create a memory heap and allocate space for row_merge_rec_offsets().
+Create a memory heap and allocate space for row_merge_rec_offsets()
+and mrec_buf_t[3].
 @return	memory heap */
 static
 mem_heap_t*
 row_merge_heap_create(
 /*==================*/
 	const dict_index_t*	index,		/*!< in: record descriptor */
+	mrec_buf_t**		buf,		/*!< out: 3 buffers */
 	ulint**			offsets1,	/*!< out: offsets */
 	ulint**			offsets2)	/*!< out: offsets */
 {
 	ulint	i	= 1 + REC_OFFS_HEADER_SIZE
 		+ dict_index_get_n_fields(index);
-	mem_heap_t*	heap	= mem_heap_create(2 * i * sizeof *offsets1);
+	mem_heap_t*	heap	= mem_heap_create(2 * i * sizeof **offsets1
+						  + 3 * sizeof **buf);
 
-	*offsets1 = mem_heap_alloc(heap, i * sizeof *offsets1);
-	*offsets2 = mem_heap_alloc(heap, i * sizeof *offsets2);
+	*buf = mem_heap_alloc(heap, 3 * sizeof **buf);
+	*offsets1 = mem_heap_alloc(heap, i * sizeof **offsets1);
+	*offsets2 = mem_heap_alloc(heap, i * sizeof **offsets2);
 
 	(*offsets1)[0] = (*offsets2)[0] = i;
 	(*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index);
@@ -687,7 +696,9 @@ ibool
 row_merge_read(
 /*===========*/
 	int			fd,	/*!< in: file descriptor */
-	ulint			offset,	/*!< in: offset where to read */
+	ulint			offset,	/*!< in: offset where to read
+					in number of row_merge_block_t
+					elements */
 	row_merge_block_t*	buf)	/*!< out: data */
 {
 	ib_uint64_t	ofs = ((ib_uint64_t) offset) * sizeof *buf;
@@ -704,6 +715,11 @@ row_merge_read(
 				 (ulint) (ofs & 0xFFFFFFFF),
 				 (ulint) (ofs >> 32),
 				 sizeof *buf);
+#ifdef POSIX_FADV_DONTNEED
+	/* Each block is read exactly once.  Free up the file cache. */
+	posix_fadvise(fd, ofs, sizeof *buf, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
 	if (UNIV_UNLIKELY(!success)) {
 		ut_print_timestamp(stderr);
 		fprintf(stderr,
@@ -714,18 +730,25 @@ row_merge_read(
 }
 
 /********************************************************************//**
-Read a merge block from the file system.
+Write a merge block to the file system.
 @return	TRUE if request was successful, FALSE if fail */
 static
 ibool
 row_merge_write(
 /*============*/
 	int		fd,	/*!< in: file descriptor */
-	ulint		offset,	/*!< in: offset where to write */
+	ulint		offset,	/*!< in: offset where to write,
+				in number of row_merge_block_t elements */
 	const void*	buf)	/*!< in: data */
 {
-	ib_uint64_t	ofs = ((ib_uint64_t) offset)
-		* sizeof(row_merge_block_t);
+	size_t		buf_len = sizeof(row_merge_block_t);
+	ib_uint64_t	ofs = buf_len * (ib_uint64_t) offset;
+	ibool		ret;
+
+	ret = os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
+			    (ulint) (ofs & 0xFFFFFFFF),
+			    (ulint) (ofs >> 32),
+			    buf_len);
 
 #ifdef UNIV_DEBUG
 	if (row_merge_print_block_write) {
@@ -734,10 +757,13 @@ row_merge_write(
 	}
 #endif /* UNIV_DEBUG */
 
-	return(UNIV_LIKELY(os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
-					 (ulint) (ofs & 0xFFFFFFFF),
-					 (ulint) (ofs >> 32),
-					 sizeof(row_merge_block_t))));
+#ifdef POSIX_FADV_DONTNEED
+	/* The block will be needed on the next merge pass,
+	but it can be evicted from the file cache meanwhile. */
+	posix_fadvise(fd, ofs, buf_len, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
+	return(UNIV_LIKELY(ret));
 }
 
 /********************************************************************//**
@@ -1072,11 +1098,14 @@ row_merge_cmp(
 					record to be compared */
 	const ulint*		offsets1,	/*!< in: first record offsets */
 	const ulint*		offsets2,	/*!< in: second record offsets */
-	const dict_index_t*	index)		/*!< in: index */
+	const dict_index_t*	index,		/*!< in: index */
+	ibool*			null_eq)	/*!< out: set to TRUE if
+						found matching null values */
 {
 	int	cmp;
 
-	cmp = cmp_rec_rec_simple(mrec1, mrec2, offsets1, offsets2, index);
+	cmp = cmp_rec_rec_simple(mrec1, mrec2, offsets1, offsets2, index,
+				 null_eq);
 
 #ifdef UNIV_DEBUG
 	if (row_merge_print_cmp) {
@@ -1394,7 +1423,8 @@ row_merge_blocks(
 {
 	mem_heap_t*	heap;	/*!< memory heap for offsets0, offsets1 */
 
-	mrec_buf_t	buf[3];	/*!< buffer for handling split mrec in block[] */
+	mrec_buf_t*	buf;	/*!< buffer for handling
+				split mrec in block[] */
 	const byte*	b0;	/*!< pointer to block[0] */
 	const byte*	b1;	/*!< pointer to block[1] */
 	byte*		b2;	/*!< pointer to block[2] */
@@ -1414,7 +1444,7 @@ row_merge_blocks(
 	}
 #endif /* UNIV_DEBUG */
 
-	heap = row_merge_heap_create(index, &offsets0, &offsets1);
+	heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
 
 	/* Write a record and read the next record.  Split the output
 	file in two halves, which can be merged on the following pass. */
@@ -1441,11 +1471,13 @@ corrupt:
 	}
 
 	while (mrec0 && mrec1) {
+		ibool	null_eq = FALSE;
 		switch (row_merge_cmp(mrec0, mrec1,
-				      offsets0, offsets1, index)) {
+				      offsets0, offsets1, index,
+				      &null_eq)) {
 		case 0:
 			if (UNIV_UNLIKELY
-			    (dict_index_is_unique(index))) {
+			    (dict_index_is_unique(index) && !null_eq)) {
 				innobase_rec_to_mysql(table, mrec0,
 						      index, offsets0);
 				mem_heap_free(heap);
@@ -1500,7 +1532,7 @@ row_merge_blocks_copy(
 {
 	mem_heap_t*	heap;	/*!< memory heap for offsets0, offsets1 */
 
-	mrec_buf_t	buf[3];	/*!< buffer for handling
+	mrec_buf_t*	buf;	/*!< buffer for handling
 				split mrec in block[] */
 	const byte*	b0;	/*!< pointer to block[0] */
 	byte*		b2;	/*!< pointer to block[2] */
@@ -1518,7 +1550,7 @@ row_merge_blocks_copy(
 	}
 #endif /* UNIV_DEBUG */
 
-	heap = row_merge_heap_create(index, &offsets0, &offsets1);
+	heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
 
 	/* Write a record and read the next record.  Split the output
 	file in two halves, which can be merged on the following pass. */
@@ -1589,6 +1621,14 @@ row_merge(
 	of.offset = 0;
 	of.n_rec = 0;
 
+#ifdef POSIX_FADV_SEQUENTIAL
+	/* The input file will be read sequentially, starting from the
+	beginning and the middle.  In Linux, the POSIX_FADV_SEQUENTIAL
+	affects the entire file.  Each block will be read exactly once. */
+	posix_fadvise(file->fd, 0, 0,
+		      POSIX_FADV_SEQUENTIAL | POSIX_FADV_NOREUSE);
+#endif /* POSIX_FADV_SEQUENTIAL */
+
 	/* Merge blocks to the output file. */
 	ohalf = 0;
 	foffs0 = 0;
@@ -1760,7 +1800,6 @@ row_merge_insert_index_tuples(
 	int			fd,	/*!< in: file descriptor */
 	row_merge_block_t*	block)	/*!< in/out: file buffer */
 {
-	mrec_buf_t	buf;
 	const byte*	b;
 	que_thr_t*	thr;
 	ins_node_t*	node;
@@ -1779,7 +1818,7 @@ row_merge_insert_index_tuples(
 
 	trx->op_info = "inserting index entries";
 
-	graph_heap = mem_heap_create(500);
+	graph_heap = mem_heap_create(500 + sizeof(mrec_buf_t));
 	node = ins_node_create(INS_DIRECT, table, graph_heap);
 
 	thr = pars_complete_graph_for_exec(node, trx, graph_heap);
@@ -1801,12 +1840,14 @@ row_merge_insert_index_tuples(
 	if (!row_merge_read(fd, foffs, block)) {
 		error = DB_CORRUPTION;
 	} else {
+		mrec_buf_t*	buf = mem_heap_alloc(graph_heap, sizeof *buf);
+
 		for (;;) {
 			const mrec_t*	mrec;
 			dtuple_t*	dtuple;
 			ulint		n_ext;
 
-			b = row_merge_read_rec(block, &buf, b, index,
+			b = row_merge_read_rec(block, buf, b, index,
 					       fd, &foffs, &mrec, offsets);
 			if (UNIV_UNLIKELY(!b)) {
 				/* End of list, or I/O error */
@@ -1977,14 +2018,12 @@ row_merge_drop_index(
 		/* Drop the field definitions of the index. */
 		"DELETE FROM SYS_FIELDS WHERE INDEX_ID = :indexid;\n"
 		/* Drop the index definition and the B-tree. */
-		"DELETE FROM SYS_INDEXES WHERE ID = :indexid\n"
-		" AND TABLE_ID = :tableid;\n"
+		"DELETE FROM SYS_INDEXES WHERE ID = :indexid;\n"
 		"END;\n";
 
 	ut_ad(index && table && trx);
 
 	pars_info_add_dulint_literal(info, "indexid", index->id);
-	pars_info_add_dulint_literal(info, "tableid", table->id);
 
 	trx_start_if_not_started(trx);
 	trx->op_info = "dropping index";
@@ -2033,47 +2072,82 @@ row_merge_drop_temp_indexes(void)
 /*=============================*/
 {
 	trx_t*		trx;
-	ulint		err;
-
-	/* We use the private SQL parser of Innobase to generate the
-	query graphs needed in deleting the dictionary data from system
-	tables in Innobase.  Deleting a row from SYS_INDEXES table also
-	frees the file segments of the B-tree associated with the index. */
-	static const char drop_temp_indexes[] =
-		"PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n"
-		"indexid CHAR;\n"
-		"DECLARE CURSOR c IS SELECT ID FROM SYS_INDEXES\n"
-		"WHERE SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "';\n"
-		"BEGIN\n"
-		"\tOPEN c;\n"
-		"\tWHILE 1=1 LOOP\n"
-		"\t\tFETCH c INTO indexid;\n"
-		"\t\tIF (SQL % NOTFOUND) THEN\n"
-		"\t\t\tEXIT;\n"
-		"\t\tEND IF;\n"
-		"\t\tDELETE FROM SYS_FIELDS WHERE INDEX_ID = indexid;\n"
-		"\t\tDELETE FROM SYS_INDEXES WHERE ID = indexid;\n"
-		"\tEND LOOP;\n"
-		"\tCLOSE c;\n"
-		"\tCOMMIT WORK;\n"
-		"END;\n";
+	btr_pcur_t	pcur;
+	mtr_t		mtr;
 
+	/* Load the table definitions that contain partially defined
+	indexes, so that the data dictionary information can be checked
+	when accessing the tablename.ibd files. */
 	trx = trx_allocate_for_background();
 	trx->op_info = "dropping partially created indexes";
 	row_mysql_lock_data_dictionary(trx);
 
-	/* Incomplete transactions may be holding some locks on the
-	data dictionary tables.  However, they should never have been
-	able to lock the records corresponding to the partially
-	created indexes that we are attempting to delete, because the
-	table was locked when the indexes were being created.  We will
-	drop the partially created indexes before the rollback of
-	incomplete transactions is initiated.  Thus, this should not
-	interfere with the incomplete transactions. */
-	trx->isolation_level = TRX_ISO_READ_UNCOMMITTED;
-	err = que_eval_sql(NULL, drop_temp_indexes, FALSE, trx);
-	ut_a(err == DB_SUCCESS);
+	mtr_start(&mtr);
+
+	btr_pcur_open_at_index_side(
+		TRUE,
+		dict_table_get_first_index(dict_sys->sys_indexes),
+		BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+
+	for (;;) {
+		const rec_t*	rec;
+		const byte*	field;
+		ulint		len;
+		dulint		table_id;
+		dict_table_t*	table;
+
+		btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+		if (!btr_pcur_is_on_user_rec(&pcur)) {
+			break;
+		}
+
+		rec = btr_pcur_get_rec(&pcur);
+		field = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_NAME_FIELD,
+					      &len);
+		if (len == UNIV_SQL_NULL || len == 0
+		    || (char) *field != TEMP_INDEX_PREFIX) {
+			continue;
+		}
+
+		/* This is a temporary index. */
+
+		field = rec_get_nth_field_old(rec, 0/*TABLE_ID*/, &len);
+		if (len != 8) {
+			/* Corrupted TABLE_ID */
+			continue;
+		}
+
+		table_id = mach_read_from_8(field);
+
+		btr_pcur_store_position(&pcur, &mtr);
+		btr_pcur_commit_specify_mtr(&pcur, &mtr);
+
+		table = dict_table_get_on_id_low(table_id);
+
+		if (table) {
+			dict_index_t*	index;
+			dict_index_t*	next_index;
+
+			for (index = dict_table_get_first_index(table);
+			     index; index = next_index) {
+
+				next_index = dict_table_get_next_index(index);
+
+				if (*index->name == TEMP_INDEX_PREFIX) {
+					row_merge_drop_index(index, table, trx);
+					trx_commit_for_mysql(trx);
+				}
+			}
+		}
+
+		mtr_start(&mtr);
+		btr_pcur_restore_position(BTR_SEARCH_LEAF,
+					  &pcur, &mtr);
+	}
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
 	row_mysql_unlock_data_dictionary(trx);
 	trx_free_for_background(trx);
 }
@@ -2086,9 +2160,22 @@ row_merge_file_create(
 /*==================*/
 	merge_file_t*	merge_file)	/*!< out: merge file structure */
 {
+#ifdef UNIV_PFS_IO
+	/* This temp file open does not go through normal
+	file APIs, add instrumentation to register with
+	performance schema */
+	struct PSI_file_locker*	locker = NULL;
+	register_pfs_file_open_begin(locker, innodb_file_temp_key,
+				     PSI_FILE_OPEN,
+				     "Innodb Merge Temp File",
+				     __FILE__, __LINE__);
+#endif
	merge_file->fd = innobase_mysql_tmpfile();
 	merge_file->offset = 0;
 	merge_file->n_rec = 0;
+#ifdef UNIV_PFS_IO
+	register_pfs_file_open_end(locker, merge_file->fd);
+#endif
 }
 
 /*********************************************************************//**
@@ -2099,10 +2186,19 @@ row_merge_file_destroy(
 /*===================*/
 	merge_file_t*	merge_file)	/*!< out: merge file structure */
 {
+#ifdef UNIV_PFS_IO
+	struct PSI_file_locker*	locker = NULL;
+	register_pfs_file_io_begin(locker, merge_file->fd, 0, PSI_FILE_CLOSE,
+				   __FILE__, __LINE__);
+#endif
 	if (merge_file->fd != -1) {
 		close(merge_file->fd);
 		merge_file->fd = -1;
 	}
+
+#ifdef UNIV_PFS_IO
+	register_pfs_file_io_end(locker, 0);
+#endif
 }
 
 /*********************************************************************//**
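
The row_merge_buf_add() hunks replace a per-iteration call to dict_index_get_nth_field() with one lookup before the loop and pointer stepping inside it. This is only valid because an index's field descriptors live in one contiguous array. A minimal standalone sketch of the pattern; the struct and accessor are hypothetical stand-ins, not InnoDB code:

#include <stdio.h>

struct field_def { int col_no; };

/* Stand-in for dict_index_get_nth_field(). */
static const struct field_def*
get_nth_field(const struct field_def* fields, int i)
{
	return(&fields[i]);
}

int
main(void)
{
	struct field_def	fields[3] = {{10}, {20}, {30}};
	const struct field_def*	f = get_nth_field(fields, 0);
	int			i;

	/* One accessor call before the loop, then pointer stepping:
	safe only because the descriptors are stored contiguously,
	which is what the InnoDB change relies on. */
	for (i = 0; i < 3; i++, f++) {
		printf("col %d\n", f->col_no);
	}
	return(0);
}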
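Several hunks move large mrec_buf_t buffers off the stack: row_merge_dup_report(), row_merge_blocks(), and row_merge_blocks_copy() previously kept up to three of these page-sized arrays in automatic storage, a real stack-overflow risk for server threads. The patch instead carves them out of the memory heap that is already created for the offset arrays. A sketch of the same single-arena idea in portable C, with malloc() standing in for mem_heap_create()/mem_heap_alloc() and an illustrative buffer size:

#include <stdlib.h>

typedef unsigned char mrec_buf_t[16384];	/* illustrative size */

struct merge_heap {
	unsigned long*	offsets1;
	unsigned long*	offsets2;
	mrec_buf_t*	buf;		/* three record buffers */
};

/* One allocation backs all the objects, so a single free() releases
them together, just as one mem_heap_free(heap) does in the patch. */
static void*
merge_heap_create(struct merge_heap* h, unsigned long n_offs)
{
	char*	arena = malloc(2 * n_offs * sizeof *h->offsets1
			       + 3 * sizeof *h->buf);

	if (arena == NULL) {
		return(NULL);
	}

	h->offsets1 = (unsigned long*) arena;
	h->offsets2 = h->offsets1 + n_offs;
	h->buf = (mrec_buf_t*) (h->offsets2 + n_offs);
	return(arena);
}

int
main(void)
{
	struct merge_heap	h;
	void*			arena = merge_heap_create(&h, 100);

	if (arena != NULL) {
		h.offsets1[0] = 100;	/* use the carved-out pieces... */
		free(arena);		/* ...and release them together */
	}
	return(0);
}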
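The posix_fadvise() calls added above all serve one idea: merge blocks are streamed sequentially and each block is touched once per pass, so their pages should not be allowed to crowd more valuable data out of the OS file cache. The stub #define compiles the calls away on Windows, and the #ifdef POSIX_FADV_* guards cover POSIX systems that lack the hints. A self-contained POSIX sketch of the write side; block_write() and BLOCK_SIZE are hypothetical, and InnoDB actually goes through os_file_write() rather than pwrite():

#define _XOPEN_SOURCE 600

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

#define BLOCK_SIZE	1048576	/* stand-in for sizeof(row_merge_block_t) */

static int
block_write(int fd, unsigned long block_no, const void* buf)
{
	/* Offsets are expressed in whole blocks; widening to 64 bits
	before multiplying keeps files beyond 4 GiB addressable, the
	same reason the patch computes ofs as an ib_uint64_t. */
	uint64_t	ofs = (uint64_t) block_no * BLOCK_SIZE;

	if (pwrite(fd, buf, BLOCK_SIZE, (off_t) ofs) != BLOCK_SIZE) {
		return(-1);
	}

#ifdef POSIX_FADV_DONTNEED
	/* The block is only re-read on the next merge pass; let the
	kernel evict these pages instead of caching them. */
	posix_fadvise(fd, (off_t) ofs, BLOCK_SIZE, POSIX_FADV_DONTNEED);
#endif
	return(0);
}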
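The null_eq flag threaded through row_merge_cmp() addresses a subtlety of SQL UNIQUE semantics: NULL keys must collate together so the merge sort stays ordered, yet two NULLs are not duplicates of each other. Without the flag, a unique index build could raise a spurious duplicate-key error on rows whose unique columns are all NULL. A toy comparator illustrating the contract; the field struct and main() are illustrative, not cmp_rec_rec_simple():

#include <stdbool.h>
#include <stdio.h>

struct field { bool is_null; int value; };

static int
cmp_fields(const struct field* a, const struct field* b, bool* null_eq)
{
	if (a->is_null || b->is_null) {
		if (a->is_null && b->is_null) {
			/* The keys collate as equal, but only because
			both are NULL; flag it so the caller does not
			treat the match as a duplicate. */
			*null_eq = true;
			return(0);
		}
		return(a->is_null ? -1 : 1);	/* NULL collates first */
	}
	return((a->value > b->value) - (a->value < b->value));
}

int
main(void)
{
	struct field	f0 = { true, 0 };
	struct field	f1 = { true, 0 };
	bool		null_eq = false;
	bool		index_is_unique = true;

	/* Mirrors the case 0 branch in row_merge_blocks(): equal keys
	are a duplicate-key error only if no NULLs were involved. */
	if (cmp_fields(&f0, &f1, &null_eq) == 0
	    && index_is_unique && !null_eq) {
		puts("duplicate key");
	} else {
		puts("not a duplicate");
	}
	return(0);
}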