author | unknown <heikki@hundin.mysql.fi> | 2002-10-29 23:16:46 +0200 |
---|---|---|
committer | unknown <heikki@hundin.mysql.fi> | 2002-10-29 23:16:46 +0200 |
commit | 3cb98f0d66c8030a3532b67ff74e7211cca4c079 (patch) | |
tree | e57bf300e559932ce45e0f749d7349577e7e0479 /innobase | |
parent | 2d9a473bb67eb5d46ef3facf9384e2b9a621b79e (diff) | |
download | mariadb-git-3cb98f0d66c8030a3532b67ff74e7211cca4c079.tar.gz | |
Many files:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
mysqld.cc:
Change MySQL default isolation level to REPEATABLE READ; note that InnoDB has always had that default, and BDB and MyISAM always run at SERIALIZABLE level anyway
sql/mysqld.cc:
Change MySQL default isolation level to REPEATABLE READ; note that InnoDB has always had that default, and BDB and MyISAM always run at SERIALIZABLE level anyway
sql/ha_innodb.cc:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
sql/ha_innodb.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/buf0buf.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/dict0dict.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/fil0fil.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/lock0lock.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/os0file.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/os0proc.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/os0thread.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/page0cur.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/page0page.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/read0read.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/rem0rec.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/srv0srv.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/sync0rw.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/sync0sync.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/trx0purge.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/trx0trx.h:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/include/rem0rec.ic:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/btr/btr0btr.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/btr/btr0cur.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/btr/btr0pcur.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/buf/buf0buf.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/buf/buf0flu.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/dict/dict0dict.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/fil/fil0fil.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/fsp/fsp0fsp.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/ibuf/ibuf0ibuf.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/lock/lock0lock.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/mem/mem0dbg.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/os/os0file.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/os/os0proc.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/page/page0cur.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/page/page0page.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/pars/lexyy.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/pars/pars0grm.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/read/read0read.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0ins.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0mysql.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0purge.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0sel.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0uins.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0undo.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/row/row0upd.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/srv/srv0srv.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/srv/srv0start.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/sync/sync0rw.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/sync/sync0sync.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/trx/trx0purge.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
innobase/trx/trx0trx.c:
Merge InnoDB-4.0.5: new isolation levels READ COMMITTED and READ UNCOMMITTED now supported, selective deadlock resolution
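The file notes above describe the new per-transaction isolation levels. Before the diff, here is a small, self-contained C sketch of how a transaction's isolation_level could pick between a full next-key lock and a record-only (LOCK_REC_NOT_GAP) lock for a plain locking read. The TRX_ISO_* and LOCK_* values mirror the constants this patch adds in innobase/include/trx0trx.h and innobase/include/lock0lock.h; the toy_trx_t struct and the toy_* names are invented for the illustration and are not InnoDB code, and the real decision (in row0sel.c and the handler layer) covers more cases. Note also that, per the read0read.h change, a trx at or below READ COMMITTED closes its consistent read view at statement end, so each statement sees a fresh snapshot.

```c
#include <stdio.h>

/* Constants mirroring the values added by this patch in
   innobase/include/trx0trx.h and innobase/include/lock0lock.h. */
#define TRX_ISO_READ_UNCOMMITTED	1
#define TRX_ISO_READ_COMMITTED		2
#define TRX_ISO_REPEATABLE_READ		3
#define TRX_ISO_SERIALIZABLE		4

#define LOCK_ORDINARY			0	/* ordinary next-key lock */
#define LOCK_GAP			512	/* lock only the gap before the record */
#define LOCK_REC_NOT_GAP		1024	/* lock only the index record itself */

/* Simplified stand-in for trx_t: only the field this example needs. */
typedef struct toy_trx_struct {
	unsigned long	isolation_level;	/* TRX_ISO_... */
} toy_trx_t;

/*************************************************************************
Chooses the precise gap mode for a plain locking read. Under READ
COMMITTED (and READ UNCOMMITTED) the lock covers only the record, so
inserts into the gap before it remain possible; under REPEATABLE READ
and SERIALIZABLE a full next-key lock also blocks phantom inserts. */
static unsigned long
toy_locking_read_gap_mode(const toy_trx_t* trx)
{
	if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) {

		return(LOCK_REC_NOT_GAP);
	}

	return(LOCK_ORDINARY);
}

int
main(void)
{
	toy_trx_t	read_committed	= {TRX_ISO_READ_COMMITTED};
	toy_trx_t	repeatable_read	= {TRX_ISO_REPEATABLE_READ};

	printf("READ COMMITTED  -> gap mode %lu\n",
		toy_locking_read_gap_mode(&read_committed));	/* 1024 */
	printf("REPEATABLE READ -> gap mode %lu\n",
		toy_locking_read_gap_mode(&repeatable_read));	/* 0 */

	return(0);
}
```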
Diffstat (limited to 'innobase')
48 files changed, 1505 insertions(+), 557 deletions(-)
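The largest behavioural change in the diff below is in lock0lock.c: lock_rec_has_to_wait() now receives the full precise type_mode (basic mode ORed with LOCK_GAP, LOCK_REC_NOT_GAP or LOCK_INSERT_INTENTION) instead of separate mode/gap/insert-intention arguments, and applies three exceptions: a record-only request does not wait for a gap lock, a gap request does not wait for a record-only lock, and no request waits for an insert-intention lock. The following compilable sketch reproduces just that decision under simplifying assumptions. The 0xF/512/1024/2048 flag values are the ones added in innobase/include/lock0lock.h; the LOCK_S/LOCK_X values and all toy_* names are illustrative stand-ins, and the real function also handles supremum records, intention modes and AUTO-INC locks.

```c
#include <stdio.h>

/* LOCK_MODE_MASK, LOCK_GAP, LOCK_REC_NOT_GAP and LOCK_INSERT_INTENTION
   use the values added in innobase/include/lock0lock.h; the basic mode
   values for LOCK_S/LOCK_X are illustrative (any distinct values below
   0xF would do for this sketch). */
#define LOCK_S			4
#define LOCK_X			5
#define LOCK_MODE_MASK		0xF
#define LOCK_GAP		512
#define LOCK_REC_NOT_GAP	1024
#define LOCK_INSERT_INTENTION	2048

/* Reduced compatibility: S is compatible with S, every other combination
   of the basic modes conflicts (intention and AUTO-INC modes omitted). */
static int
toy_mode_compatible(unsigned long mode1, unsigned long mode2)
{
	return(mode1 == LOCK_S && mode2 == LOCK_S);
}

/*************************************************************************
Toy version of the reworked wait decision: returns nonzero if a new
request with precise mode 'req' has to wait for an existing lock with
precise mode 'held' on the same record, assuming the two locks belong
to different transactions. */
static int
toy_rec_has_to_wait(unsigned long req, unsigned long held)
{
	if (toy_mode_compatible(req & LOCK_MODE_MASK,
				held & LOCK_MODE_MASK)) {
		return(0);
	}

	if ((req & LOCK_REC_NOT_GAP) && (held & LOCK_GAP)) {
		/* A lock on just the record does not wait for a gap lock */
		return(0);
	}

	if ((req & LOCK_GAP) && (held & LOCK_REC_NOT_GAP)) {
		/* A gap lock does not wait for a record-only lock */
		return(0);
	}

	if (held & LOCK_INSERT_INTENTION) {
		/* No request waits for an insert intention lock */
		return(0);
	}

	return(1);
}

int
main(void)
{
	printf("X rec-only vs held X gap               : wait=%d\n",
		toy_rec_has_to_wait(LOCK_X | LOCK_REC_NOT_GAP,
				    LOCK_X | LOCK_GAP));		/* 0 */
	printf("X next-key vs held X rec-only          : wait=%d\n",
		toy_rec_has_to_wait(LOCK_X,
				    LOCK_X | LOCK_REC_NOT_GAP));	/* 1 */
	printf("S next-key vs held X insert-intention  : wait=%d\n",
		toy_rec_has_to_wait(LOCK_S,
				    LOCK_X | LOCK_INSERT_INTENTION));	/* 0 */

	return(0);
}
```

These relaxed rules are what makes the READ COMMITTED emulation and the insert-intention handling in the patch avoid the spurious deadlocks described in the lock0lock.c comments.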
diff --git a/innobase/btr/btr0btr.c b/innobase/btr/btr0btr.c index 7a7678b2dcf..62a86d342a2 100644 --- a/innobase/btr/btr0btr.c +++ b/innobase/btr/btr0btr.c @@ -274,6 +274,7 @@ btr_page_create( ut_ad(mtr_memo_contains(mtr, buf_block_align(page), MTR_MEMO_PAGE_X_FIX)); page_create(page, mtr); + buf_block_align(page)->check_index_page_at_flush = TRUE; btr_page_set_index_id(page, tree->id, mtr); } @@ -713,6 +714,7 @@ btr_create( /* Create a new index page on the the allocated segment page */ page = page_create(frame, mtr); + buf_block_align(page)->check_index_page_at_flush = TRUE; /* Set the index id of the page */ btr_page_set_index_id(page, index_id, mtr); @@ -847,6 +849,7 @@ btr_page_reorganize_low( segment headers, next page-field, etc.) is preserved intact */ page_create(page, mtr); + buf_block_align(page)->check_index_page_at_flush = TRUE; /* Copy the records from the temporary space to the recreated page; do not copy the lock bits yet */ @@ -919,6 +922,7 @@ btr_page_empty( segment headers, next page-field, etc.) is preserved intact */ page_create(page, mtr); + buf_block_align(page)->check_index_page_at_flush = TRUE; } /***************************************************************** diff --git a/innobase/btr/btr0cur.c b/innobase/btr/btr0cur.c index 3d6b63def6c..24f0447d55d 100644 --- a/innobase/btr/btr0cur.c +++ b/innobase/btr/btr0cur.c @@ -121,16 +121,19 @@ btr_cur_latch_leaves( { ulint left_page_no; ulint right_page_no; - + page_t* get_page; + ut_ad(tree && page && mtr); if (latch_mode == BTR_SEARCH_LEAF) { - btr_page_get(space, page_no, RW_S_LATCH, mtr); + get_page = btr_page_get(space, page_no, RW_S_LATCH, mtr); + buf_block_align(get_page)->check_index_page_at_flush = TRUE; } else if (latch_mode == BTR_MODIFY_LEAF) { - btr_page_get(space, page_no, RW_X_LATCH, mtr); + get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr); + buf_block_align(get_page)->check_index_page_at_flush = TRUE; } else if (latch_mode == BTR_MODIFY_TREE) { @@ -138,15 +141,22 @@ btr_cur_latch_leaves( left_page_no = btr_page_get_prev(page, mtr); if (left_page_no != FIL_NULL) { - btr_page_get(space, left_page_no, RW_X_LATCH, mtr); + get_page = btr_page_get(space, left_page_no, + RW_X_LATCH, mtr); + buf_block_align(get_page)->check_index_page_at_flush = + TRUE; } - btr_page_get(space, page_no, RW_X_LATCH, mtr); + get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr); + buf_block_align(get_page)->check_index_page_at_flush = TRUE; right_page_no = btr_page_get_next(page, mtr); if (right_page_no != FIL_NULL) { - btr_page_get(space, right_page_no, RW_X_LATCH, mtr); + get_page = btr_page_get(space, right_page_no, + RW_X_LATCH, mtr); + buf_block_align(get_page)->check_index_page_at_flush = + TRUE; } } else if (latch_mode == BTR_SEARCH_PREV) { @@ -157,9 +167,12 @@ btr_cur_latch_leaves( if (left_page_no != FIL_NULL) { cursor->left_page = btr_page_get(space, left_page_no, RW_S_LATCH, mtr); + buf_block_align( + cursor->left_page)->check_index_page_at_flush = TRUE; } - btr_page_get(space, page_no, RW_S_LATCH, mtr); + get_page = btr_page_get(space, page_no, RW_S_LATCH, mtr); + buf_block_align(get_page)->check_index_page_at_flush = TRUE; } else if (latch_mode == BTR_MODIFY_PREV) { @@ -169,9 +182,12 @@ btr_cur_latch_leaves( if (left_page_no != FIL_NULL) { cursor->left_page = btr_page_get(space, left_page_no, RW_X_LATCH, mtr); + buf_block_align( + cursor->left_page)->check_index_page_at_flush = TRUE; } - btr_page_get(space, page_no, RW_X_LATCH, mtr); + get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr); + 
buf_block_align(get_page)->check_index_page_at_flush = TRUE; } else { ut_error; } @@ -274,6 +290,7 @@ btr_cur_search_to_nth_level( if (btr_search_latch.writer == RW_LOCK_NOT_LOCKED && latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ && !estimate + && mode != PAGE_CUR_LE_OR_EXTENDS && btr_search_guess_on_hash(index, info, tuple, mode, latch_mode, cursor, has_search_latch, mtr)) { @@ -334,12 +351,18 @@ btr_cur_search_to_nth_level( rw_latch = RW_NO_LATCH; buf_mode = BUF_GET; + /* We use these modified search modes on non-leaf levels of the + B-tree. These let us end up in the right B-tree leaf. In that leaf + we use the original search mode. */ + if (mode == PAGE_CUR_GE) { page_mode = PAGE_CUR_L; } else if (mode == PAGE_CUR_G) { page_mode = PAGE_CUR_LE; } else if (mode == PAGE_CUR_LE) { page_mode = PAGE_CUR_LE; + } else if (mode == PAGE_CUR_LE_OR_EXTENDS) { + page_mode = PAGE_CUR_LE_OR_EXTENDS; } else { ut_ad(mode == PAGE_CUR_L); page_mode = PAGE_CUR_L; @@ -390,6 +413,8 @@ retry_page_get: goto retry_page_get; } + + buf_block_align(page)->check_index_page_at_flush = TRUE; #ifdef UNIV_SYNC_DEBUG if (rw_latch != RW_NO_LATCH) { @@ -543,6 +568,8 @@ btr_cur_open_at_index_side( ut_ad(0 == ut_dulint_cmp(tree->id, btr_page_get_index_id(page))); + buf_block_align(page)->check_index_page_at_flush = TRUE; + if (height == ULINT_UNDEFINED) { /* We are in the root node */ diff --git a/innobase/btr/btr0pcur.c b/innobase/btr/btr0pcur.c index 8ca3d41f7f9..b2115dfdd6c 100644 --- a/innobase/btr/btr0pcur.c +++ b/innobase/btr/btr0pcur.c @@ -354,6 +354,7 @@ btr_pcur_move_to_next_page( ut_ad(next_page_no != FIL_NULL); next_page = btr_page_get(space, next_page_no, cursor->latch_mode, mtr); + buf_block_align(next_page)->check_index_page_at_flush = TRUE; btr_leaf_page_release(page, cursor->latch_mode, mtr); diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c index ee8e8b91f8d..4524fa1a4f9 100644 --- a/innobase/buf/buf0buf.c +++ b/innobase/buf/buf0buf.c @@ -331,6 +331,11 @@ buf_page_print( index->table_name, index->name); } + } else if (fil_page_get_type(read_buf) == FIL_PAGE_INODE) { + fprintf(stderr, "InnoDB: Page may be an 'inode' page\n"); + } else if (fil_page_get_type(read_buf) == FIL_PAGE_IBUF_FREE_LIST) { + fprintf(stderr, + "InnoDB: Page may be an insert buffer free list page\n"); } } @@ -351,6 +356,8 @@ buf_block_init( block->file_page_was_freed = FALSE; + block->check_index_page_at_flush = FALSE; + rw_lock_create(&(block->lock)); ut_ad(rw_lock_validate(&(block->lock))); @@ -617,6 +624,29 @@ buf_page_peek_block( } /************************************************************************ +Resets the check_index_page_at_flush field of a page if found in the buffer +pool. */ + +void +buf_reset_check_index_page_at_flush( +/*================================*/ + ulint space, /* in: space id */ + ulint offset) /* in: page number */ +{ + buf_block_t* block; + + mutex_enter_fast(&(buf_pool->mutex)); + + block = buf_page_hash_get(space, offset); + + if (block) { + block->check_index_page_at_flush = FALSE; + } + + mutex_exit(&(buf_pool->mutex)); +} + +/************************************************************************ Returns the current state of is_hashed of a page. FALSE if the page is not in the pool. NOTE that this operation does not fix the page in the pool if it is found there. 
*/ @@ -1185,6 +1215,8 @@ buf_page_init( block->space = space; block->offset = offset; + block->check_index_page_at_flush = FALSE; + block->lock_hash_val = lock_rec_hash(space, offset); block->lock_mutex = NULL; diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c index 4c6850af078..78bde60c9b2 100644 --- a/innobase/buf/buf0flu.c +++ b/innobase/buf/buf0flu.c @@ -15,6 +15,7 @@ Created 11/11/1995 Heikki Tuuri #include "ut0byte.h" #include "ut0lst.h" +#include "page0page.h" #include "fil0fil.h" #include "buf0buf.h" #include "buf0lru.h" @@ -225,6 +226,24 @@ buf_flush_buffered_writes(void) return; } + for (i = 0; i < trx_doublewrite->first_free; i++) { + block = trx_doublewrite->buf_block_arr[i]; + + if (block->check_index_page_at_flush + && !page_simple_validate(block->frame)) { + + buf_page_print(block->frame); + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Apparent corruption of an index page\n" + "InnoDB: to be written to data file. We intentionally crash server\n" + "InnoDB: to prevent corrupt data from ending up in data\n" + "InnoDB: files.\n"); + ut_a(0); + } + } + if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE; } else { diff --git a/innobase/dict/dict0dict.c b/innobase/dict/dict0dict.c index 095c27f1c5f..2ee2c9d18a9 100644 --- a/innobase/dict/dict0dict.c +++ b/innobase/dict/dict0dict.c @@ -29,7 +29,14 @@ Created 1/8/1996 Heikki Tuuri dict_sys_t* dict_sys = NULL; /* the dictionary system */ -rw_lock_t dict_foreign_key_check_lock; +rw_lock_t dict_operation_lock; /* table create, drop, etc. reserve + this in X-mode, implicit or backround + operations purge, rollback, foreign + key checks reserve this in S-mode; we + cannot trust that MySQL protects + implicit or background operations + from dropping a table: this is our + mechanism */ #define DICT_HEAP_SIZE 100 /* initial memory heap size when creating a table or index object */ @@ -509,9 +516,8 @@ dict_init(void) UT_LIST_INIT(dict_sys->table_LRU); - rw_lock_create(&dict_foreign_key_check_lock); - rw_lock_set_level(&dict_foreign_key_check_lock, - SYNC_FOREIGN_KEY_CHECK); + rw_lock_create(&dict_operation_lock); + rw_lock_set_level(&dict_operation_lock, SYNC_DICT_OPERATION); } /************************************************************************** @@ -1851,14 +1857,14 @@ loop: /************************************************************************* Accepts a specified string. Comparisons are case-insensitive. 
*/ -static + char* dict_accept( /*========*/ /* out: if string was accepted, the pointer is moved after that, else ptr is returned */ char* ptr, /* in: scan from this */ - const char* string, /* in: accept only this string as the next + const char* string,/* in: accept only this string as the next non-whitespace string */ ibool* success)/* out: TRUE if accepted */ { diff --git a/innobase/fil/fil0fil.c b/innobase/fil/fil0fil.c index 3e0f21395ef..98980f6c337 100644 --- a/innobase/fil/fil0fil.c +++ b/innobase/fil/fil0fil.c @@ -967,6 +967,7 @@ fil_extend_last_data_file( fil_node_t* node; fil_space_t* space; fil_system_t* system = fil_system; + byte* buf2; byte* buf; ibool success; ulint i; @@ -981,19 +982,23 @@ fil_extend_last_data_file( fil_node_prepare_for_io(node, system, space); - buf = mem_alloc(1024 * 1024); + buf2 = mem_alloc(1024 * 1024 + UNIV_PAGE_SIZE); + buf = ut_align(buf2, UNIV_PAGE_SIZE); memset(buf, '\0', 1024 * 1024); for (i = 0; i < size_increase / ((1024 * 1024) / UNIV_PAGE_SIZE); i++) { - success = os_file_write(node->name, node->handle, buf, + /* If we use native Windows aio, then also this write is + done using it */ + + success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, + node->name, node->handle, buf, (node->size << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFF, node->size >> (32 - UNIV_PAGE_SIZE_SHIFT), - 1024 * 1024); + 1024 * 1024, NULL, NULL); if (!success) { - break; } @@ -1003,7 +1008,7 @@ fil_extend_last_data_file( os_has_said_disk_full = FALSE; } - mem_free(buf); + mem_free(buf2); fil_node_complete_io(node, system, OS_FILE_WRITE); @@ -1528,7 +1533,6 @@ fil_page_set_type( ulint type) /* in: type */ { ut_ad(page); - ut_ad((type == FIL_PAGE_INDEX) || (type == FIL_PAGE_UNDO_LOG)); mach_write_to_2(page + FIL_PAGE_TYPE, type); } diff --git a/innobase/fsp/fsp0fsp.c b/innobase/fsp/fsp0fsp.c index 1abb043fdc2..ff586819d4a 100644 --- a/innobase/fsp/fsp0fsp.c +++ b/innobase/fsp/fsp0fsp.c @@ -769,6 +769,8 @@ fsp_init_file_page_low( #endif page = buf_frame_align(ptr); + buf_block_align(page)->check_index_page_at_flush = FALSE; + #ifdef UNIV_BASIC_LOG_DEBUG /* printf("In log debug version: Erase the contents of the file page\n"); */ @@ -1097,7 +1099,7 @@ fsp_fill_free_list( /* Initialize the ibuf page in a separate mini-transaction because it is low in the latching - order, and we must be able to release the its latch + order, and we must be able to release its latch before returning from the fsp routine */ mtr_start(&ibuf_mtr); @@ -1264,7 +1266,12 @@ fsp_alloc_free_page( free = xdes_find_bit(descr, XDES_FREE_BIT, TRUE, hint % FSP_EXTENT_SIZE, mtr); - ut_a(free != ULINT_UNDEFINED); + if (free == ULINT_UNDEFINED) { + + ut_print_buf(((byte*)descr) - 500, 1000); + + ut_a(0); + } xdes_set_bit(descr, XDES_FREE_BIT, free, FALSE, mtr); @@ -1412,7 +1419,12 @@ fsp_free_extent( descr = xdes_get_descriptor_with_space_hdr(header, space, page, mtr); - ut_a(xdes_get_state(descr, mtr) != XDES_FREE); + if (xdes_get_state(descr, mtr) == XDES_FREE) { + + ut_print_buf(((byte*)descr) - 500, 1000); + + ut_a(0); + } xdes_init(descr, mtr); @@ -1523,6 +1535,10 @@ fsp_alloc_seg_inode_page( page = buf_page_get(space, page_no, RW_X_LATCH, mtr); + buf_block_align(page)->check_index_page_at_flush = FALSE; + + fil_page_set_type(page, FIL_PAGE_INODE); + buf_page_dbg_add_level(page, SYNC_FSP_PAGE); for (i = 0; i < FSP_SEG_INODES_PER_PAGE; i++) { @@ -2298,6 +2314,8 @@ fseg_alloc_free_page_low( fseg_mark_page_used(seg_inode, space, ret_page, mtr); } + buf_reset_check_index_page_at_flush(space, ret_page); + return(ret_page); } 
diff --git a/innobase/ibuf/ibuf0ibuf.c b/innobase/ibuf/ibuf0ibuf.c index b7d691485cc..143b3bfa584 100644 --- a/innobase/ibuf/ibuf0ibuf.c +++ b/innobase/ibuf/ibuf0ibuf.c @@ -1295,6 +1295,8 @@ ibuf_add_free_page( flst_add_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr); + fil_page_set_type(page, FIL_PAGE_IBUF_FREE_LIST); + ibuf_data->seg_size++; ibuf_data->free_list_len++; @@ -1305,6 +1307,7 @@ ibuf_add_free_page( ibuf_bitmap_page_set_bits(bitmap_page, page_no, IBUF_BITMAP_IBUF, TRUE, &mtr); + mtr_commit(&mtr); mutex_exit(&ibuf_mutex); diff --git a/innobase/include/buf0buf.h b/innobase/include/buf0buf.h index 591c0ec54ab..f76c437bd1d 100644 --- a/innobase/include/buf0buf.h +++ b/innobase/include/buf0buf.h @@ -274,6 +274,15 @@ buf_page_peek_block( ulint space, /* in: space id */ ulint offset);/* in: page number */ /************************************************************************ +Resets the check_index_page_at_flush field of a page if found in the buffer +pool. */ + +void +buf_reset_check_index_page_at_flush( +/*================================*/ + ulint space, /* in: space id */ + ulint offset);/* in: page number */ +/************************************************************************ Sets file_page_was_freed TRUE if the page is found in the buffer pool. This function should be called when we free a file page and want the debug version to check that it is not accessed any more unless @@ -648,6 +657,14 @@ struct buf_block_struct{ then it can wait for this rw-lock */ buf_block_t* hash; /* node used in chaining to the page hash table */ + ibool check_index_page_at_flush; + /* TRUE if we know that this is + an index page, and want the database + to check its consistency before flush; + note that there may be pages in the + buffer pool which are index pages, + but this flag is not set because + we do not keep track of all pages */ /* 2. Page flushing fields */ UT_LIST_NODE_T(buf_block_t) flush_list; diff --git a/innobase/include/dict0dict.h b/innobase/include/dict0dict.h index dd92c5aa467..b5e6e04a1de 100644 --- a/innobase/include/dict0dict.h +++ b/innobase/include/dict0dict.h @@ -26,6 +26,18 @@ Created 1/8/1996 Heikki Tuuri #include "ut0byte.h" #include "trx0types.h" +/************************************************************************* +Accepts a specified string. Comparisons are case-insensitive. */ + +char* +dict_accept( +/*========*/ + /* out: if string was accepted, the pointer + is moved after that, else ptr is returned */ + char* ptr, /* in: scan from this */ + const char* string,/* in: accept only this string as the next + non-whitespace string */ + ibool* success);/* out: TRUE if accepted */ /************************************************************************ Decrements the count of open MySQL handles to a table. 
*/ @@ -798,7 +810,7 @@ dict_mutex_exit_for_mysql(void); extern dict_sys_t* dict_sys; /* the dictionary system */ -extern rw_lock_t dict_foreign_key_check_lock; +extern rw_lock_t dict_operation_lock; /* Dictionary system struct */ struct dict_sys_struct{ diff --git a/innobase/include/fil0fil.h b/innobase/include/fil0fil.h index 63e20221c16..23ef0304b2d 100644 --- a/innobase/include/fil0fil.h +++ b/innobase/include/fil0fil.h @@ -73,6 +73,8 @@ extern fil_addr_t fil_addr_null; /* File page types */ #define FIL_PAGE_INDEX 17855 #define FIL_PAGE_UNDO_LOG 2 +#define FIL_PAGE_INODE 3 +#define FIL_PAGE_IBUF_FREE_LIST 4 /* Space types */ #define FIL_TABLESPACE 501 diff --git a/innobase/include/lock0lock.h b/innobase/include/lock0lock.h index 288356d3270..d3b3d55d015 100644 --- a/innobase/include/lock0lock.h +++ b/innobase/include/lock0lock.h @@ -292,16 +292,12 @@ lock_sec_rec_modify_check_and_lock( dict_index_t* index, /* in: secondary index */ que_thr_t* thr); /* in: query thread */ /************************************************************************* -Checks if locks of other transactions prevent an immediate read, or passing -over by a read cursor, of a clustered index record. If they do, first tests -if the query thread should anyway be suspended for some reason; if not, then -puts the transaction and the query thread to the lock wait state and inserts a -waiting request for a record lock to the lock queue. Sets the requested mode -lock on the record. */ +Like the counterpart for a clustered index below, but now we read a +secondary index record. */ ulint -lock_clust_rec_read_check_and_lock( -/*===============================*/ +lock_sec_rec_read_check_and_lock( +/*=============================*/ /* out: DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set, @@ -309,18 +305,24 @@ lock_clust_rec_read_check_and_lock( rec_t* rec, /* in: user record or page supremum record which should be read or passed over by a read cursor */ - dict_index_t* index, /* in: clustered index */ + dict_index_t* index, /* in: secondary index */ ulint mode, /* in: mode of the lock which the read cursor should set on records: LOCK_S or LOCK_X; the latter is possible in SELECT FOR UPDATE */ + ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ que_thr_t* thr); /* in: query thread */ /************************************************************************* -Like the counterpart for a clustered index above, but now we read a -secondary index record. */ +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. 
*/ ulint -lock_sec_rec_read_check_and_lock( -/*=============================*/ +lock_clust_rec_read_check_and_lock( +/*===============================*/ /* out: DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */ ulint flags, /* in: if BTR_NO_LOCKING_FLAG bit is set, @@ -328,10 +330,12 @@ lock_sec_rec_read_check_and_lock( rec_t* rec, /* in: user record or page supremum record which should be read or passed over by a read cursor */ - dict_index_t* index, /* in: secondary index */ + dict_index_t* index, /* in: clustered index */ ulint mode, /* in: mode of the lock which the read cursor should set on records: LOCK_S or LOCK_X; the latter is possible in SELECT FOR UPDATE */ + ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ que_thr_t* thr); /* in: query thread */ /************************************************************************* Checks that a record is seen in a consistent read. */ @@ -509,6 +513,7 @@ lock_validate(void); extern lock_sys_t* lock_sys; /* Lock modes and types */ +/* Basic modes */ #define LOCK_NONE 0 /* this flag is used elsewhere to note consistent read */ #define LOCK_IS 2 /* intention shared */ @@ -519,15 +524,20 @@ extern lock_sys_t* lock_sys; in an exclusive mode */ #define LOCK_MODE_MASK 0xF /* mask used to extract mode from the type_mode field in a lock */ +/* Lock types */ #define LOCK_TABLE 16 /* these type values should be so high that */ #define LOCK_REC 32 /* they can be ORed to the lock mode */ #define LOCK_TYPE_MASK 0xF0 /* mask used to extract lock type from the type_mode field in a lock */ +/* Waiting lock flag */ #define LOCK_WAIT 256 /* this wait bit should be so high that it can be ORed to the lock mode and type; when this bit is set, it means that the lock has not yet been granted, it is just waiting for its turn in the wait queue */ +/* Precise modes */ +#define LOCK_ORDINARY 0 /* this flag denotes an ordinary next-key lock + in contrast to LOCK_GAP or LOCK_REC_NOT_GAP */ #define LOCK_GAP 512 /* this gap bit should be so high that it can be ORed to the other flags; when this bit is set, it means that the @@ -537,7 +547,15 @@ extern lock_sys_t* lock_sys; the bit is set; locks of this type are created when records are removed from the index chain of records */ -#define LOCK_INSERT_INTENTION 1024 /* this bit is set when we place a waiting +#define LOCK_REC_NOT_GAP 1024 /* this bit means that the lock is only on + the index record and does NOT block inserts + to the gap before the index record; this is + used in the case when we retrieve a record + with a unique key, and is also used in + locking plain SELECTs (not part of UPDATE + or DELETE) when the user has set the READ + COMMITTED isolation level */ +#define LOCK_INSERT_INTENTION 2048 /* this bit is set when we place a waiting gap type record lock request in order to let an insert of an index record to wait until there are no conflicting locks by other diff --git a/innobase/include/os0file.h b/innobase/include/os0file.h index d65c7fd47e3..a7624a90d5e 100644 --- a/innobase/include/os0file.h +++ b/innobase/include/os0file.h @@ -111,6 +111,7 @@ log. */ #define OS_WIN31 1 #define OS_WIN95 2 #define OS_WINNT 3 +#define OS_WIN2000 4 extern ulint os_n_file_reads; extern ulint os_n_file_writes; @@ -122,7 +123,7 @@ Gets the operating system version. Currently works only on Windows. 
*/ ulint os_get_os_version(void); /*===================*/ - /* out: OS_WIN95, OS_WIN31, OS_WINNT (2000 == NT) */ + /* out: OS_WIN95, OS_WIN31, OS_WINNT, or OS_WIN2000 */ /******************************************************************** Creates the seek mutexes used in positioned reads and writes. */ diff --git a/innobase/include/os0proc.h b/innobase/include/os0proc.h index 9da1f33e070..79750e5c1f7 100644 --- a/innobase/include/os0proc.h +++ b/innobase/include/os0proc.h @@ -16,6 +16,15 @@ typedef void* os_process_t; typedef unsigned long int os_process_id_t; /******************************************************************** +Converts the current process id to a number. It is not guaranteed that the +number is unique. In Linux returns the 'process number' of the current +thread. That number is the same as one sees in 'top', for example. In Linux +the thread id is not the same as one sees in 'top'. */ + +ulint +os_proc_get_number(void); +/*====================*/ +/******************************************************************** Allocates non-cacheable memory. */ void* diff --git a/innobase/include/os0thread.h b/innobase/include/os0thread.h index 8355afa46e9..efc8651e06d 100644 --- a/innobase/include/os0thread.h +++ b/innobase/include/os0thread.h @@ -16,11 +16,8 @@ Created 9/8/1995 Heikki Tuuri this is also the size of the wait slot array for MySQL threads which can wait inside InnoDB */ #ifdef __WIN__ -/* Windows 95/98/ME seemed to have difficulties creating the all -the event semaphores for the wait array slots. If the computer had -<= 64 MB memory, InnoDB startup could take minutes or even crash. -That is why we set this to only 1000 in Windows. */ - +/* Create less event semaphores because Win 98/ME had difficult creating +40000 event semaphores */ #define OS_THREAD_MAX_N 1000 #else #define OS_THREAD_MAX_N 10000 diff --git a/innobase/include/page0cur.h b/innobase/include/page0cur.h index 144e0e02b21..c3f0decdb4b 100644 --- a/innobase/include/page0cur.h +++ b/innobase/include/page0cur.h @@ -26,7 +26,12 @@ Created 10/4/1994 Heikki Tuuri #define PAGE_CUR_GE 2 #define PAGE_CUR_L 3 #define PAGE_CUR_LE 4 -#define PAGE_CUR_DBG 5 +#define PAGE_CUR_LE_OR_EXTENDS 5 /* This is a search mode used in + "column LIKE 'abc%' ORDER BY column DESC"; + we have to find strings which are <= 'abc' or + which extend it */ +#define PAGE_CUR_DBG 6 + extern ulint page_cur_short_succ; diff --git a/innobase/include/page0page.h b/innobase/include/page0page.h index 2f77127466f..b5e33af5bc0 100644 --- a/innobase/include/page0page.h +++ b/innobase/include/page0page.h @@ -666,6 +666,16 @@ page_rec_validate( /* out: TRUE if ok */ rec_t* rec); /* in: record on the page */ /******************************************************************* +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. */ + +ibool +page_simple_validate( +/*=================*/ + /* out: TRUE if ok */ + page_t* page); /* in: index page */ +/******************************************************************* This function checks the consistency of an index page. 
*/ ibool diff --git a/innobase/include/read0read.h b/innobase/include/read0read.h index cebb2d6701c..db6bf888095 100644 --- a/innobase/include/read0read.h +++ b/innobase/include/read0read.h @@ -45,6 +45,14 @@ read_view_close( /*============*/ read_view_t* view); /* in: read view */ /************************************************************************* +Closes a consistent read view for MySQL. This function is called at an SQL +statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */ + +void +read_view_close_for_mysql( +/*======================*/ + trx_t* trx); /* in: trx which has a read view */ +/************************************************************************* Checks if a read view sees the specified transaction. */ UNIV_INLINE ibool diff --git a/innobase/include/rem0rec.h b/innobase/include/rem0rec.h index 12e3a8b39d6..b28f39925c1 100644 --- a/innobase/include/rem0rec.h +++ b/innobase/include/rem0rec.h @@ -148,12 +148,22 @@ data field in the record. */ byte* rec_get_nth_field( /*==============*/ - /* out: pointer to the field, NULL if SQL null */ + /* out: pointer to the field */ rec_t* rec, /* in: record */ ulint n, /* in: index of the field */ ulint* len); /* out: length of the field; UNIV_SQL_NULL if SQL null */ /**************************************************************** +Return field length or UNIV_SQL_NULL. */ +UNIV_INLINE +ulint +rec_get_nth_field_len( +/*==================*/ + /* out: length of the field; UNIV_SQL_NULL if SQL + null */ + rec_t* rec, /* in: record */ + ulint n); /* in: index of the field */ +/**************************************************************** Gets the physical size of a field. Also an SQL null may have a field of size > 0, if the data type is of a fixed size. */ UNIV_INLINE diff --git a/innobase/include/rem0rec.ic b/innobase/include/rem0rec.ic index aaa3c58a003..9dfd4faeec8 100644 --- a/innobase/include/rem0rec.ic +++ b/innobase/include/rem0rec.ic @@ -65,6 +65,24 @@ a field stored to another page: */ #define REC_2BYTE_EXTERN_MASK 0x4000 +/**************************************************************** +Return field length or UNIV_SQL_NULL. */ +UNIV_INLINE +ulint +rec_get_nth_field_len( +/*==================*/ + /* out: length of the field; UNIV_SQL_NULL if SQL + null */ + rec_t* rec, /* in: record */ + ulint n) /* in: index of the field */ +{ + ulint len; + + rec_get_nth_field(rec, n, &len); + + return(len); +} + /*************************************************************** Sets the value of the ith field SQL null bit. 
*/ diff --git a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h index 9de5e9ccfc9..4d2768cf109 100644 --- a/innobase/include/srv0srv.h +++ b/innobase/include/srv0srv.h @@ -57,8 +57,6 @@ extern ulint srv_flush_log_at_trx_commit; extern byte srv_latin1_ordering[256];/* The sort order table of the latin1 character set */ -extern ibool srv_use_native_aio; - extern ulint srv_pool_size; extern ulint srv_mem_pool_size; extern ulint srv_lock_table_size; @@ -70,8 +68,9 @@ extern dulint srv_archive_recovery_limit_lsn; extern ulint srv_lock_wait_timeout; -extern char* srv_unix_file_flush_method_str; +extern char* srv_file_flush_method_str; extern ulint srv_unix_file_flush_method; +extern ulint srv_win_file_flush_method; extern ulint srv_force_recovery; extern ulint srv_thread_concurrency; @@ -154,13 +153,19 @@ typedef struct srv_sys_struct srv_sys_t; /* The server system */ extern srv_sys_t* srv_sys; -/* Alternatives for the field flush option in Unix; see the InnoDB manual about +/* Alternatives for the file flush option in Unix; see the InnoDB manual about what these mean */ -#define SRV_UNIX_FDATASYNC 1 +#define SRV_UNIX_FDATASYNC 1 /* This is the default; it is currently mapped + to a call of fsync() because fdatasync() + seemed to corrupt files in Linux and Solaris */ #define SRV_UNIX_O_DSYNC 2 #define SRV_UNIX_LITTLESYNC 3 #define SRV_UNIX_NOSYNC 4 +/* Alternatives for file i/o in Windows */ +#define SRV_WIN_IO_NORMAL 1 +#define SRV_WIN_IO_UNBUFFERED 2 /* This is the default */ + /* Alternatives for srv_force_recovery. Non-zero values are intended to help the user get a damaged database up so that he can dump intact tables and rows with SELECT INTO OUTFILE. The database must not otherwise @@ -311,15 +316,17 @@ srv_conc_exit_innodb( trx_t* trx); /* in: transaction object associated with the thread */ /******************************************************************* -Puts a MySQL OS thread to wait for a lock to be released. */ +Puts a MySQL OS thread to wait for a lock to be released. If an error +occurs during the wait trx->error_state associated with thr is +!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK +are possible errors. DB_DEADLOCK is returned if selective deadlock +resolution chose this transaction as a victim. */ -ibool +void srv_suspend_mysql_thread( /*=====================*/ - /* out: TRUE if the lock wait timeout was - exceeded */ - que_thr_t* thr); /* in: query thread associated with - the MySQL OS thread */ + que_thr_t* thr); /* in: query thread associated with the MySQL + OS thread */ /************************************************************************ Releases a MySQL OS thread waiting for a lock to be released, if the thread is already suspended. */ @@ -407,3 +414,4 @@ struct srv_sys_struct{ extern ulint srv_n_threads_active[]; #endif + diff --git a/innobase/include/sync0rw.h b/innobase/include/sync0rw.h index 7ad38f5bc7f..5aa3dcdffc3 100644 --- a/innobase/include/sync0rw.h +++ b/innobase/include/sync0rw.h @@ -335,7 +335,8 @@ ibool rw_lock_own( /*========*/ rw_lock_t* lock, /* in: rw-lock */ - ulint lock_type); /* in: lock type */ + ulint lock_type); /* in: lock type: RW_LOCK_SHARED, + RW_LOCK_EX */ /********************************************************************** Checks if somebody has locked the rw-lock in the specified mode. 
*/ diff --git a/innobase/include/sync0sync.h b/innobase/include/sync0sync.h index 5bfa0bc2d48..320f8faf12d 100644 --- a/innobase/include/sync0sync.h +++ b/innobase/include/sync0sync.h @@ -371,10 +371,12 @@ or row lock! */ #define SYNC_NO_ORDER_CHECK 3000 /* this can be used to suppress latching order checking */ #define SYNC_LEVEL_NONE 2000 /* default: level not defined */ -#define SYNC_FOREIGN_KEY_CHECK 1001 +#define SYNC_DICT_OPERATION 1001 /* table create, drop, etc. reserve + this in X-mode, implicit or backround + operations purge, rollback, foreign + key checks reserve this in S-mode */ #define SYNC_DICT 1000 #define SYNC_DICT_AUTOINC_MUTEX 999 -#define SYNC_PURGE_IS_RUNNING 997 #define SYNC_DICT_HEADER 995 #define SYNC_IBUF_HEADER 914 #define SYNC_IBUF_PESS_INSERT_MUTEX 912 diff --git a/innobase/include/trx0purge.h b/innobase/include/trx0purge.h index 087be2f060e..049c79aec9b 100644 --- a/innobase/include/trx0purge.h +++ b/innobase/include/trx0purge.h @@ -111,9 +111,6 @@ struct trx_purge_struct{ of the trx system and it never ends */ que_t* query; /* The query graph which will do the parallelized purge operation */ - rw_lock_t purge_is_running;/* Purge operation set an x-latch here - while it is accessing a table: this - prevents dropping of the table */ rw_lock_t latch; /* The latch protecting the purge view. A purge operation must acquire an x-latch here for the instant at which diff --git a/innobase/include/trx0trx.h b/innobase/include/trx0trx.h index 9b29c481b6d..874b126e47c 100644 --- a/innobase/include/trx0trx.h +++ b/innobase/include/trx0trx.h @@ -327,6 +327,7 @@ struct trx_struct{ time_t start_time; /* time the trx object was created or the state last time became TRX_ACTIVE */ + ulint isolation_level;/* TRX_ISO_REPEATABLE_READ, ... */ ibool check_foreigns; /* normally TRUE, but if the user wants to suppress foreign key checks, (in table imports, for example) we @@ -350,6 +351,9 @@ struct trx_struct{ /*------------------------------*/ void* mysql_thd; /* MySQL thread handle corresponding to this trx, or NULL */ + char** mysql_query_str;/* pointer to the field in mysqld_thd + which contains the pointer to the + current SQL query string */ char* mysql_log_file_name; /* if MySQL binlog is used, this field contains a pointer to the latest file @@ -371,6 +375,9 @@ struct trx_struct{ replication has processed */ os_thread_id_t mysql_thread_id;/* id of the MySQL thread associated with this transaction object */ + ulint mysql_process_no;/* since in Linux, 'top' reports + process id's and not thread id's, we + store the process number too */ /*------------------------------*/ ulint n_mysql_tables_in_use; /* number of Innobase tables used in the processing of the current @@ -379,9 +386,9 @@ struct trx_struct{ /* how many tables the current SQL statement uses, except those in consistent read */ - ibool has_dict_foreign_key_check_lock; + ibool has_dict_operation_lock; /* TRUE if the trx currently holds - an s-lock on dict_foreign_... 
*/ + an s-lock on dict_operation_lock */ ibool has_search_latch; /* TRUE if this trx has latched the search system latch in S-mode */ @@ -523,6 +530,41 @@ struct trx_struct{ #define TRX_QUE_ROLLING_BACK 3 /* transaction is rolling back */ #define TRX_QUE_COMMITTING 4 /* transaction is committing */ +/* Transaction isolation levels */ +#define TRX_ISO_READ_UNCOMMITTED 1 /* dirty read: non-locking + SELECTs are performed so that + we do not look at a possible + earlier version of a record; + thus they are not 'consistent' + reads under this isolation + level; otherwise like level + 2 */ + +#define TRX_ISO_READ_COMMITTED 2 /* somewhat Oracle-like + isolation, except that in + range UPDATE and DELETE we + must block phantom rows + with next-key locks; + SELECT ... FOR UPDATE and ... + LOCK IN SHARE MODE only lock + the index records, NOT the + gaps before them, and thus + allow free inserting; + each consistent read reads its + own snapshot */ + +#define TRX_ISO_REPEATABLE_READ 3 /* this is the default; + all consistent reads in the + same trx read the same + snapshot; + full next-key locking used + in locking reads to block + insertions into gaps */ + +#define TRX_ISO_SERIALIZABLE 4 /* all plain SELECTs are + converted to LOCK IN SHARE + MODE reads */ + /* Types of a trx signal */ #define TRX_SIG_NO_SIGNAL 100 #define TRX_SIG_TOTAL_ROLLBACK 1 diff --git a/innobase/lock/lock0lock.c b/innobase/lock/lock0lock.c index 866fe556af9..92ee5ee6cbe 100644 --- a/innobase/lock/lock0lock.c +++ b/innobase/lock/lock0lock.c @@ -70,6 +70,11 @@ A waiting record lock can also be of the gap type. A waiting lock request can be granted when there is no conflicting mode lock request by another transaction ahead of it in the explicit lock queue. +In version 4.0.5 we added yet another explicit lock type: LOCK_REC_NOT_GAP. +It only locks the record it is placed on, not the gap before the record. +This lock type is necessary to emulate an Oracle-like READ COMMITTED isolation +level. + ------------------------------------------------------------------------- RULE 1: If there is an implicit x-lock on a record, and there are non-gap ------- @@ -294,7 +299,9 @@ struct lock_struct{ UT_LIST_NODE_T(lock_t) trx_locks; /* list of the locks of the transaction */ - ulint type_mode; /* lock type, mode, gap flag, and + ulint type_mode; /* lock type, mode, LOCK_GAP or + LOCK_REC_NOT_GAP, + LOCK_INSERT_INTENTION, wait flag, ORed */ hash_node_t hash; /* hash chain node for a record lock */ dict_index_t* index; /* index for a record lock */ @@ -309,6 +316,10 @@ Monitor will then fetch it and print */ ibool lock_deadlock_found = FALSE; char* lock_latest_err_buf; /* We allocate 5000 bytes for this */ +/* Flags for recursive deadlock search */ +#define LOCK_VICTIM_IS_START 1 +#define LOCK_VICTIM_IS_OTHER 2 + /************************************************************************ Checks if a lock request results in a deadlock. */ static @@ -700,23 +711,23 @@ lock_rec_get_gap( } /************************************************************************* -Sets the gap flag of a record lock. */ +Gets the LOCK_REC_NOT_GAP flag of a record lock. 
*/ UNIV_INLINE -void -lock_rec_set_gap( -/*=============*/ - lock_t* lock, /* in: record lock */ - ibool val) /* in: value to set: TRUE or FALSE */ +ibool +lock_rec_get_rec_not_gap( +/*=====================*/ + /* out: TRUE if LOCK_REC_NOT_GAP flag set */ + lock_t* lock) /* in: record lock */ { ut_ad(lock); - ut_ad((val == TRUE) || (val == FALSE)); ut_ad(lock_get_type(lock) == LOCK_REC); - if (val) { - lock->type_mode = lock->type_mode | LOCK_GAP; - } else { - lock->type_mode = lock->type_mode & ~LOCK_GAP; + if (lock->type_mode & LOCK_REC_NOT_GAP) { + + return(TRUE); } + + return(FALSE); } /************************************************************************* @@ -740,26 +751,6 @@ lock_rec_get_insert_intention( } /************************************************************************* -Sets the waiting insert flag of a record lock. */ -UNIV_INLINE -void -lock_rec_set_insert_intention( -/*==========================*/ - lock_t* lock, /* in: record lock */ - ibool val) /* in: value to set: TRUE or FALSE */ -{ - ut_ad(lock); - ut_ad((val == TRUE) || (val == FALSE)); - ut_ad(lock_get_type(lock) == LOCK_REC); - - if (val) { - lock->type_mode = lock->type_mode | LOCK_INSERT_INTENTION; - } else { - lock->type_mode = lock->type_mode & ~LOCK_INSERT_INTENTION; - } -} - -/************************************************************************* Calculates if lock mode 1 is stronger or equal to lock mode 2. */ UNIV_INLINE ibool @@ -848,48 +839,53 @@ lock_rec_has_to_wait( /* out: TRUE if new lock has to wait for lock2 to be removed */ trx_t* trx, /* in: trx of new lock */ - ulint mode, /* in: LOCK_S or LOCK_X */ - ulint gap, /* in: LOCK_GAP or 0 */ - ulint insert_intention, - /* in: LOCK_INSERT_INTENTION or 0 */ + ulint type_mode,/* in: precise mode of the new lock to set: + LOCK_S or LOCK_X, possibly ORed to + LOCK_GAP or LOCK_REC_NOT_GAP, LOCK_INSERT_INTENTION */ lock_t* lock2) /* in: another record lock; NOTE that it is assumed that this has a lock bit set on the same record as - in lock1 */ + in the new lock we are setting */ { ut_ad(trx && lock2); ut_ad(lock_get_type(lock2) == LOCK_REC); - ut_ad(mode == LOCK_S || mode == LOCK_X); - ut_ad(gap == LOCK_GAP || gap == 0); - ut_ad(insert_intention == LOCK_INSERT_INTENTION - || insert_intention == 0); - if (trx != lock2->trx && !lock_mode_compatible(mode, + if (trx != lock2->trx + && !lock_mode_compatible(LOCK_MODE_MASK & type_mode, lock_get_mode(lock2))) { - /* We have somewhat complex rules when gap type - record locks cause waits */ + /* We have somewhat complex rules when gap type record locks + cause waits */ - if (!gap && lock_rec_get_insert_intention(lock2)) { + if ((type_mode & LOCK_REC_NOT_GAP) + && lock_rec_get_gap(lock2)) { + /* Lock on just the record does not need to wait for + a gap type lock */ + + return(FALSE); + } + + if ((type_mode & LOCK_GAP) + && lock_rec_get_rec_not_gap(lock2)) { + + /* Lock on gap does not need to wait for + a LOCK_REC_NOT_GAP type lock */ - /* Request of a full next-key record does not - need to wait for an insert intention lock to be - removed. This is ok since our rules allow conflicting - locks on gaps. This eliminates a spurious deadlock - caused by a next-key lock waiting for an insert - intention lock; when the insert intention lock was - granted, the insert deadlocked on the waiting - next-key lock. 
*/ - return(FALSE); } - if (insert_intention && lock_rec_get_insert_intention(lock2)) { + if (lock_rec_get_insert_intention(lock2)) { - /* An insert intention is not disturbed by another - insert intention; this removes a spurious deadlock - caused by inserts which had to wait for a next-key - lock to be removed */ + /* No lock request needs to wait for an insert + intention lock to be removed. This is ok since our + rules allow conflicting locks on gaps. This eliminates + a spurious deadlock caused by a next-key lock waiting + for an insert intention lock; when the insert + intention lock was granted, the insert deadlocked on + the waiting next-key lock. + Also, insert intention locks do not disturb each + other. */ + return(FALSE); } @@ -921,10 +917,7 @@ lock_has_to_wait( ut_ad(lock_get_type(lock2) == LOCK_REC); return(lock_rec_has_to_wait(lock1->trx, - lock_get_mode(lock1), - lock_rec_get_gap(lock1), - lock_rec_get_insert_intention(lock1), - lock2)); + lock1->type_mode, lock2)); } return(TRUE); @@ -1386,32 +1379,41 @@ lock_table_has( /*============= FUNCTIONS FOR ANALYZING RECORD LOCK QUEUE ================*/ /************************************************************************* -Checks if a transaction has a GRANTED explicit lock on rec, where the gap -flag or the insert intention flag is not set, stronger or equal to mode. -Note that locks on the supremum of a page are a special case here, since -they are always gap type locks, even if the gap flag is not set in them. */ +Checks if a transaction has a GRANTED explicit lock on rec stronger or equal +to precise_mode. */ UNIV_INLINE lock_t* lock_rec_has_expl( /*==============*/ /* out: lock or NULL */ - ulint mode, /* in: lock mode */ + ulint precise_mode,/* in: LOCK_S or LOCK_X possibly ORed to + LOCK_GAP or LOCK_REC_NOT_GAP, + for a supremum record we regard this always a gap + type request */ rec_t* rec, /* in: record */ trx_t* trx) /* in: transaction */ { lock_t* lock; - - ut_ad(mutex_own(&kernel_mutex)); - ut_ad((mode == LOCK_X) || (mode == LOCK_S)); + ut_ad(mutex_own(&kernel_mutex)); + ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S + || (precise_mode & LOCK_MODE_MASK) == LOCK_X); + ut_ad(!(precise_mode & LOCK_INSERT_INTENTION)); + lock = lock_rec_get_first(rec); while (lock) { if (lock->trx == trx - && lock_mode_stronger_or_eq(lock_get_mode(lock), mode) + && lock_mode_stronger_or_eq(lock_get_mode(lock), + precise_mode & LOCK_MODE_MASK) && !lock_get_wait(lock) - && !lock_rec_get_insert_intention(lock) - && !lock_rec_get_gap(lock)) { + && (!lock_rec_get_rec_not_gap(lock) + || (precise_mode & LOCK_REC_NOT_GAP) + || page_rec_is_supremum(rec)) + && (!lock_rec_get_gap(lock) + || (precise_mode & LOCK_GAP) + || page_rec_is_supremum(rec)) + && (!lock_rec_get_insert_intention(lock))) { return(lock); } @@ -1429,7 +1431,7 @@ lock_t* lock_rec_other_has_expl_req( /*========================*/ /* out: lock or NULL */ - ulint mode, /* in: lock mode */ + ulint mode, /* in: LOCK_S or LOCK_X */ ulint gap, /* in: LOCK_GAP if also gap locks are taken into account, or 0 if not */ ulint wait, /* in: LOCK_WAIT if also waiting locks are @@ -1471,27 +1473,21 @@ lock_t* lock_rec_other_has_conflicting( /*===========================*/ /* out: lock or NULL */ - ulint mode, /* in: lock mode of the lock we are going to reserve */ - ulint gap, /* in: LOCK_GAP if we are going to reserve a gap type - lock, else 0 */ - ulint insert_intention, - /* in: LOCK_INSERT_INTENTION if we are going to - reserve an insert intention lock */ + ulint mode, /* in: LOCK_S or 
LOCK_X, + possibly ORed to LOCK_GAP or LOC_REC_NOT_GAP, + LOCK_INSERT_INTENTION */ rec_t* rec, /* in: record to look at */ trx_t* trx) /* in: our transaction */ { lock_t* lock; ut_ad(mutex_own(&kernel_mutex)); - ut_ad(mode == LOCK_X || mode == LOCK_S); - ut_ad(gap == 0 || gap == LOCK_GAP); - ut_ad(insert_intention == LOCK_INSERT_INTENTION - || insert_intention == 0); + lock = lock_rec_get_first(rec); while (lock) { - if (lock_rec_has_to_wait(trx, mode, gap, insert_intention, - lock)) { + if (lock_rec_has_to_wait(trx, mode, lock)) { + return(lock); } @@ -1607,14 +1603,14 @@ lock_rec_create( page_no = buf_frame_get_page_no(page); heap_no = rec_get_heap_no(rec); - /* If rec is the supremum record, then we reset the gap bit, as - all locks on the supremum are automatically of the gap type, and - we try to avoid unnecessary memory consumption of a new record lock - struct for a gap type lock */ + /* If rec is the supremum record, then we reset the gap and + LOCK_REC_NOT_GAP bits, as all locks on the supremum are + automatically of the gap type */ if (rec == page_get_supremum_rec(page)) { + ut_ad(!(type_mode & LOCK_REC_NOT_GAP)); - type_mode = type_mode & ~LOCK_GAP; + type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP); } /* Make lock bitmap bigger by a safety margin */ @@ -1666,10 +1662,14 @@ ulint lock_rec_enqueue_waiting( /*=====================*/ /* out: DB_LOCK_WAIT, DB_DEADLOCK, or - DB_QUE_THR_SUSPENDED */ + DB_QUE_THR_SUSPENDED, or DB_SUCCESS; + DB_SUCCESS means that there was a deadlock, + but another transaction was chosen as a + victim, and we got the lock immediately: + no need to wait then */ ulint type_mode,/* in: lock mode this transaction is - requesting: LOCK_S or LOCK_X, ORed with - LOCK_GAP if a gap lock is requested, ORed + requesting: LOCK_S or LOCK_X, possibly ORed + with LOCK_GAP or LOCK_REC_NOT_GAP, ORed with LOCK_INSERT_INTENTION if this waiting lock request is set when performing an insert of an index record */ @@ -1718,6 +1718,14 @@ index->table_name); return(DB_DEADLOCK); } + /* If there was a deadlock but we chose another transaction as a + victim, it is possible that we already have the lock now granted! */ + + if (trx->wait_lock == NULL) { + + return(DB_SUCCESS); + } + trx->que_state = TRX_QUE_LOCK_WAIT; trx->wait_started = time(NULL); @@ -1744,8 +1752,8 @@ lock_rec_add_to_queue( /*==================*/ /* out: lock where the bit was set, NULL if out of memory */ - ulint type_mode,/* in: lock mode, wait, and gap flags; type - is ignored and replaced by LOCK_REC */ + ulint type_mode,/* in: lock mode, wait, gap etc. 
flags; + type is ignored and replaced by LOCK_REC */ rec_t* rec, /* in: record on page */ dict_index_t* index, /* in: index of record */ trx_t* trx) /* in: transaction */ @@ -1759,12 +1767,11 @@ lock_rec_add_to_queue( ut_ad(mutex_own(&kernel_mutex)); ut_ad((type_mode & (LOCK_WAIT | LOCK_GAP)) || ((type_mode & LOCK_MODE_MASK) != LOCK_S) - || !lock_rec_other_has_expl_req(LOCK_X, 0, LOCK_WAIT, - rec, trx)); + || !lock_rec_other_has_expl_req(LOCK_X, 0, LOCK_WAIT, rec, trx)); ut_ad((type_mode & (LOCK_WAIT | LOCK_GAP)) || ((type_mode & LOCK_MODE_MASK) != LOCK_X) - || !lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, - rec, trx)); + || !lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, rec, trx)); + type_mode = type_mode | LOCK_REC; page = buf_frame_align(rec); @@ -1775,12 +1782,15 @@ lock_rec_add_to_queue( struct for a gap type lock */ if (rec == page_get_supremum_rec(page)) { + ut_ad(!(type_mode & LOCK_REC_NOT_GAP)); - type_mode = type_mode & ~LOCK_GAP; + /* There should never be LOCK_REC_NOT_GAP on a supremum + record, but let us play safe */ + + type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP); } - /* Look for a waiting lock request on the same record, or for a - similar record lock on the same page */ + /* Look for a waiting lock request on the same record or on a gap */ heap_no = rec_get_heap_no(rec); lock = lock_rec_get_first_on_page(rec); @@ -1795,6 +1805,9 @@ lock_rec_add_to_queue( lock = lock_rec_get_next_on_page(lock); } + /* Look for a similar record lock on the same page: if one is found + and there are no waiting lock requests, we can just set the bit */ + similar_lock = lock_rec_find_similar_on_page(type_mode, rec, trx); if (similar_lock && !somebody_waits && !(type_mode & LOCK_WAIT)) { @@ -1822,7 +1835,8 @@ lock_rec_lock_fast( ibool impl, /* in: if TRUE, no lock is set if no wait is necessary: we assume that the caller will set an implicit lock */ - ulint mode, /* in: lock mode */ + ulint mode, /* in: lock mode: LOCK_X or LOCK_S possibly + ORed to either LOCK_GAP or LOCK_REC_NOT_GAP */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: index of record */ que_thr_t* thr) /* in: query thread */ @@ -1831,8 +1845,16 @@ lock_rec_lock_fast( ulint heap_no; ut_ad(mutex_own(&kernel_mutex)); - ut_ad((mode == LOCK_X) || (mode == LOCK_S)); - + ut_ad((LOCK_MODE_MASK & mode) != LOCK_S + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_X + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + ut_ad((LOCK_MODE_MASK & mode) == LOCK_S + || (LOCK_MODE_MASK & mode) == LOCK_X); + ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP + || mode - (LOCK_MODE_MASK & mode) == 0 + || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP); + heap_no = rec_get_heap_no(rec); lock = lock_rec_get_first_on_page(rec); @@ -1877,7 +1899,8 @@ lock_rec_lock_slow( ibool impl, /* in: if TRUE, no lock is set if no wait is necessary: we assume that the caller will set an implicit lock */ - ulint mode, /* in: lock mode */ + ulint mode, /* in: lock mode: LOCK_X or LOCK_S possibly + ORed to either LOCK_GAP or LOCK_REC_NOT_GAP */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: index of record */ que_thr_t* thr) /* in: query thread */ @@ -1886,20 +1909,24 @@ lock_rec_lock_slow( ulint err; ut_ad(mutex_own(&kernel_mutex)); - ut_ad((mode == LOCK_X) || (mode == LOCK_S)); - + ut_ad((LOCK_MODE_MASK & mode) != LOCK_S + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_X + || lock_table_has(thr_get_trx(thr), 
index->table, LOCK_IX)); + ut_ad((LOCK_MODE_MASK & mode) == LOCK_S + || (LOCK_MODE_MASK & mode) == LOCK_X); + ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP + || mode - (LOCK_MODE_MASK & mode) == 0 + || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP); + trx = thr_get_trx(thr); - - ut_ad((mode != LOCK_S) || lock_table_has(trx, index->table, - LOCK_IS)); - ut_ad((mode != LOCK_X) || lock_table_has(trx, index->table, - LOCK_IX)); + if (lock_rec_has_expl(mode, rec, trx)) { /* The trx already has a strong enough lock on rec: do nothing */ err = DB_SUCCESS; - } else if (lock_rec_other_has_conflicting(mode, 0, 0, rec, trx)) { + } else if (lock_rec_other_has_conflicting(mode, rec, trx)) { /* If another transaction has a non-gap conflicting request in the queue, as this transaction does not have a lock strong @@ -1935,7 +1962,8 @@ lock_rec_lock( ibool impl, /* in: if TRUE, no lock is set if no wait is necessary: we assume that the caller will set an implicit lock */ - ulint mode, /* in: lock mode */ + ulint mode, /* in: lock mode: LOCK_X or LOCK_S possibly + ORed to either LOCK_GAP or LOCK_REC_NOT_GAP */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: index of record */ que_thr_t* thr) /* in: query thread */ @@ -1943,11 +1971,16 @@ lock_rec_lock( ulint err; ut_ad(mutex_own(&kernel_mutex)); - ut_ad((mode != LOCK_S) || lock_table_has(thr_get_trx(thr), - index->table, LOCK_IS)); - ut_ad((mode != LOCK_X) || lock_table_has(thr_get_trx(thr), - index->table, LOCK_IX)); - + ut_ad((LOCK_MODE_MASK & mode) != LOCK_S + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS)); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_X + || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); + ut_ad((LOCK_MODE_MASK & mode) == LOCK_S + || (LOCK_MODE_MASK & mode) == LOCK_X); + ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP + || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP + || mode - (LOCK_MODE_MASK & mode) == 0); + if (lock_rec_lock_fast(impl, mode, rec, index, thr)) { /* We try a simplified and faster subroutine for the most @@ -2011,26 +2044,33 @@ lock_grant( ut_ad(mutex_own(&kernel_mutex)); lock_reset_lock_and_trx_wait(lock); - - if (lock_get_mode(lock) == LOCK_AUTO_INC) { - if (lock->trx->auto_inc_lock != NULL) { - fprintf(stderr, - "InnoDB: Error: trx already had an AUTO-INC lock!\n"); - } + if (lock_get_mode(lock) == LOCK_AUTO_INC) { - /* Store pointer to lock to trx so that we know to - release it at the end of the SQL statement */ + if (lock->trx->auto_inc_lock != NULL) { + fprintf(stderr, + "InnoDB: Error: trx already had an AUTO-INC lock!\n"); + } - lock->trx->auto_inc_lock = lock; - } + /* Store pointer to lock to trx so that we know to + release it at the end of the SQL statement */ + + lock->trx->auto_inc_lock = lock; + } if (lock_print_waits) { printf("Lock wait for trx %lu ends\n", ut_dulint_get_low(lock->trx->id)); } + + /* If we are resolving a deadlock by choosing another transaction + as a victim, then our original transaction may not be in the + TRX_QUE_LOCK_WAIT state, and there is no need to end the lock wait + for it */ - trx_end_lock_wait(lock->trx); + if (lock->trx->que_state == TRX_QUE_LOCK_WAIT) { + trx_end_lock_wait(lock->trx); + } } /***************************************************************** @@ -2080,7 +2120,7 @@ lock_rec_dequeue_from_page( ut_ad(lock_get_type(in_lock) == LOCK_REC); trx = in_lock->trx; - + space = in_lock->un_member.rec_lock.space; page_no = in_lock->un_member.rec_lock.page_no; @@ -2199,9 +2239,10 @@ lock_rec_reset_and_release_wait( } 
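
Note on the mode word used above: the assertions added to lock_rec_lock_fast(), lock_rec_lock_slow() and lock_rec_lock() treat the record lock mode as a small composite bit field, the basic mode (LOCK_S or LOCK_X) selected by LOCK_MODE_MASK plus at most one of the new precise-mode flags LOCK_GAP or LOCK_REC_NOT_GAP, with LOCK_INSERT_INTENTION possibly ORed on for inserts. The sketch below only illustrates that decomposition; the constant values used here are placeholders for the example, the real definitions live in lock0lock, and lock_mode_explain() is a hypothetical helper, not part of the patch.

	#include <stdio.h>

	typedef unsigned long	ulint;

	/* Placeholder bit values for illustration only: the sketch just
	needs the basic-mode bits and the precise-mode flags to be
	distinct and non-overlapping, as they are in lock0lock. */
	#define LOCK_S			2
	#define LOCK_X			3
	#define LOCK_MODE_MASK		0xFUL
	#define LOCK_GAP		512
	#define LOCK_REC_NOT_GAP	1024
	#define LOCK_INSERT_INTENTION	2048

	/* Splits a composite record lock mode the same way the new
	ut_ad() checks do, with (LOCK_MODE_MASK & mode) for the basic
	mode and (mode - (LOCK_MODE_MASK & mode)) for the rest. */
	static void
	lock_mode_explain(ulint mode)
	{
		ulint	basic	= mode & LOCK_MODE_MASK;
		ulint	precise	= mode - basic;

		printf("basic mode %s, ",
			basic == LOCK_X ? "LOCK_X" : "LOCK_S");

		if (precise & LOCK_GAP) {
			printf("locks gap before rec");
		} else if (precise & LOCK_REC_NOT_GAP) {
			printf("locks rec but not gap");
		} else {
			printf("ordinary next-key lock");
		}

		if (precise & LOCK_INSERT_INTENTION) {
			printf(", insert intention");
		}

		printf("\n");
	}

	int
	main(void)
	{
		/* e.g. what lock_clust_rec_modify_check_and_lock() now
		requests for an UPDATE of an existing row */
		lock_mode_explain(LOCK_X | LOCK_REC_NOT_GAP);

		/* a pure gap lock, as inherited by lock_rec_inherit_to_gap() */
		lock_mode_explain(LOCK_S | LOCK_GAP);

		/* what lock_rec_insert_check_and_lock() enqueues when an
		insert must wait */
		lock_mode_explain(LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION);

		return(0);
	}

The labels printed here deliberately match the new lock_rec_print() output further down in this patch ("locks gap before rec", "locks rec but not gap"), so a SHOW INNODB STATUS dump can be read back into the same three-way classification.
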
/***************************************************************** -Makes a record to inherit the locks of another record as gap type locks, but -does not reset the lock bits of the other record. Also waiting lock requests -on rec are inherited as GRANTED gap locks. */ +Makes a record to inherit the locks (except LOCK_INSERT_INTENTION type) +of another record as gap type locks, but does not reset the lock bits of +the other record. Also waiting lock requests on rec are inherited as +GRANTED gap locks. */ void lock_rec_inherit_to_gap( @@ -2217,9 +2258,45 @@ lock_rec_inherit_to_gap( lock = lock_rec_get_first(rec); while (lock != NULL) { - lock_rec_add_to_queue(((lock->type_mode | LOCK_GAP) - & ~LOCK_WAIT), + if (!lock_rec_get_insert_intention(lock)) { + + lock_rec_add_to_queue(LOCK_REC | lock_get_mode(lock) + | LOCK_GAP, heir, lock->index, lock->trx); + } + + lock = lock_rec_get_next(rec, lock); + } +} + +/***************************************************************** +Makes a record to inherit the gap locks (except LOCK_INSERT_INTENTION type) +of another record as gap type locks, but does not reset the lock bits of the +other record. Also waiting lock requests are inherited as GRANTED gap locks. */ + +void +lock_rec_inherit_to_gap_if_gap_lock( +/*================================*/ + rec_t* heir, /* in: record which inherits */ + rec_t* rec) /* in: record from which inherited; does NOT reset + the locks on this record */ +{ + lock_t* lock; + + ut_ad(mutex_own(&kernel_mutex)); + + lock = lock_rec_get_first(rec); + + while (lock != NULL) { + if (!lock_rec_get_insert_intention(lock) + && (page_rec_is_supremum(rec) + || !lock_rec_get_rec_not_gap(lock))) { + + lock_rec_add_to_queue(LOCK_REC | lock_get_mode(lock) + | LOCK_GAP, + heir, lock->index, lock->trx); + } + lock = lock_rec_get_next(rec, lock); } } @@ -2778,9 +2855,10 @@ lock_update_insert( { lock_mutex_enter_kernel(); - /* Inherit the locks for rec, in gap mode, from the next record */ + /* Inherit the gap-locking locks for rec, in gap mode, from the next + record */ - lock_rec_inherit_to_gap(rec, page_rec_get_next(rec)); + lock_rec_inherit_to_gap_if_gap_lock(rec, page_rec_get_next(rec)); lock_mutex_exit_kernel(); } @@ -2859,20 +2937,23 @@ static ibool lock_deadlock_occurs( /*=================*/ - /* out: TRUE if a deadlock was detected */ + /* out: TRUE if a deadlock was detected and we + chose trx as a victim; FALSE if no deadlock, or + there was a deadlock, but we chose other + transaction(s) as victim(s) */ lock_t* lock, /* in: lock the transaction is requesting */ trx_t* trx) /* in: transaction */ { dict_table_t* table; dict_index_t* index; trx_t* mark_trx; - ibool ret; + ulint ret; ulint cost = 0; char* err_buf; ut_ad(trx && lock); ut_ad(mutex_own(&kernel_mutex)); - +retry: /* We check that adding this trx to the waits-for graph does not produce a cycle. 
First mark all active transactions with 0: */ @@ -2886,7 +2967,14 @@ lock_deadlock_occurs( ret = lock_deadlock_recursive(trx, trx, lock, &cost); - if (ret) { + if (ret == LOCK_VICTIM_IS_OTHER) { + /* We chose some other trx as a victim: retry if there still + is a deadlock */ + + goto retry; + } + + if (ret == LOCK_VICTIM_IS_START) { if (lock_get_type(lock) == LOCK_TABLE) { table = lock->un_member.tab_lock.table; index = NULL; @@ -2898,19 +2986,6 @@ lock_deadlock_occurs( lock_deadlock_found = TRUE; err_buf = lock_latest_err_buf + strlen(lock_latest_err_buf); - - err_buf += sprintf(err_buf, - "*** (2) WAITING FOR THIS LOCK TO BE GRANTED:\n"); - - ut_a(err_buf <= lock_latest_err_buf + 4000); - - if (lock_get_type(lock) == LOCK_REC) { - lock_rec_print(err_buf, lock); - err_buf += strlen(err_buf); - } else { - lock_table_print(err_buf, lock); - err_buf += strlen(err_buf); - } ut_a(err_buf <= lock_latest_err_buf + 4000); @@ -2923,30 +2998,39 @@ lock_deadlock_occurs( sess_raise_error_low(trx, DB_DEADLOCK, lock->type_mode, table, index, NULL, NULL, NULL); */ + + return(TRUE); } - return(ret); + return(FALSE); } /************************************************************************ Looks recursively for a deadlock. */ static -ibool +ulint lock_deadlock_recursive( /*====================*/ - /* out: TRUE if a deadlock was detected - or the calculation took too long */ + /* out: 0 if no deadlock found, + LOCK_VICTIM_IS_START if there was a deadlock + and we chose 'start' as the victim, + LOCK_VICTIM_IS_OTHER if a deadlock + was found and we chose some other trx as a + victim: we must do the search again in this + last case because there may be another + deadlock! */ trx_t* start, /* in: recursion starting point */ trx_t* trx, /* in: a transaction waiting for a lock */ lock_t* wait_lock, /* in: the lock trx is waiting to be granted */ ulint* cost) /* in/out: number of calculation steps thus far: if this exceeds LOCK_MAX_N_STEPS_... 
- we return TRUE */ + we return LOCK_VICTIM_IS_START */ { lock_t* lock; ulint bit_no; trx_t* lock_trx; char* err_buf; + ulint ret; ut_a(trx && start && wait_lock); ut_ad(mutex_own(&kernel_mutex)); @@ -2955,14 +3039,14 @@ lock_deadlock_recursive( /* We have already exhaustively searched the subtree starting from this trx */ - return(FALSE); + return(0); } *cost = *cost + 1; if (*cost > LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK) { - return(TRUE); + return(LOCK_VICTIM_IS_START); } lock = wait_lock; @@ -2998,6 +3082,9 @@ lock_deadlock_recursive( lock_trx = lock->trx; if (lock_trx == start) { + /* We came back to the recursion starting + point: a deadlock detected */ + err_buf = lock_latest_err_buf; ut_sprintf_timestamp(err_buf); @@ -3045,11 +3132,59 @@ lock_deadlock_recursive( ut_a(err_buf <= lock_latest_err_buf + 4000); + err_buf += sprintf(err_buf, + "*** (2) WAITING FOR THIS LOCK TO BE GRANTED:\n"); + + ut_a(err_buf <= lock_latest_err_buf + 4000); + + if (lock_get_type(start->wait_lock) + == LOCK_REC) { + lock_rec_print(err_buf, + start->wait_lock); + err_buf += strlen(err_buf); + } else { + lock_table_print(err_buf, + start->wait_lock); + err_buf += strlen(err_buf); + } + if (lock_print_waits) { printf("Deadlock detected\n"); } - return(TRUE); + if (ut_dulint_cmp(wait_lock->trx->undo_no, + start->undo_no) >= 0) { + /* Our recursion starting point + transaction is 'smaller', let us + choose 'start' as the victim and roll + back it */ + + return(LOCK_VICTIM_IS_START); + } + + lock_deadlock_found = TRUE; + + ut_a(err_buf <= lock_latest_err_buf + 4000); + + /* Let us choose the transaction of wait_lock + as a victim to try to avoid deadlocking our + recursion starting point transaction */ + + err_buf += sprintf(err_buf, + "*** WE ROLL BACK TRANSACTION (1)\n"); + + wait_lock->trx->error_state = DB_DEADLOCK; + + lock_cancel_waiting_and_release(wait_lock); + + /* Since trx and wait_lock are no longer + in the waits-for graph, we can return FALSE; + note that our selective algorithm can choose + several transactions as victims, but still + we may end up rolling back also the recursion + starting point transaction! */ + + return(LOCK_VICTIM_IS_OTHER); } if (lock_trx->que_state == TRX_QUE_LOCK_WAIT) { @@ -3058,10 +3193,11 @@ lock_deadlock_recursive( incompatible mode, and is itself waiting for a lock */ - if (lock_deadlock_recursive(start, lock_trx, - lock_trx->wait_lock, cost)) { + ret = lock_deadlock_recursive(start, lock_trx, + lock_trx->wait_lock, cost); + if (ret != 0) { - return(TRUE); + return(ret); } } } @@ -3153,12 +3289,16 @@ lock_table_remove_low( /************************************************************************* Enqueues a waiting request for a table lock which cannot be granted immediately. Checks for deadlocks. */ - +static ulint lock_table_enqueue_waiting( /*=======================*/ /* out: DB_LOCK_WAIT, DB_DEADLOCK, or - DB_QUE_THR_SUSPENDED */ + DB_QUE_THR_SUSPENDED, or DB_SUCCESS; + DB_SUCCESS means that there was a deadlock, + but another transaction was chosen as a + victim, and we got the lock immediately: + no need to wait then */ ulint mode, /* in: lock mode this transaction is requesting */ dict_table_t* table, /* in: table */ @@ -3205,6 +3345,13 @@ table->name); return(DB_DEADLOCK); } + if (trx->wait_lock == NULL) { + /* Deadlock resolution chose another transaction as a victim, + and we accidentally got our lock granted! 
*/ + + return(DB_SUCCESS); + } + trx->que_state = TRX_QUE_LOCK_WAIT; trx->wait_started = time(NULL); @@ -3292,7 +3439,7 @@ lock_table( if (lock_table_other_has_incompatible(trx, LOCK_WAIT, table, mode)) { /* Another trx has a request on the table in an incompatible - mode: this trx must wait */ + mode: this trx may have to wait */ err = lock_table_enqueue_waiting(mode, table, thr); @@ -3659,7 +3806,11 @@ lock_rec_print( } if (lock_rec_get_gap(lock)) { - buf += sprintf(buf, " gap type lock"); + buf += sprintf(buf, " locks gap before rec"); + } + + if (lock_rec_get_rec_not_gap(lock)) { + buf += sprintf(buf, " locks rec but not gap"); } if (lock_rec_get_insert_intention(lock)) { @@ -3776,8 +3927,8 @@ lock_print_info( mtr_t mtr; if (buf_end - buf < 600) { - sprintf(buf, "... output truncated!\n"); - + sprintf(buf, "... output truncated!\n"); + return; } @@ -3802,8 +3953,8 @@ lock_print_info( if ((ulint)(buf_end - buf) < 100 + strlen(lock_latest_err_buf)) { - lock_mutex_exit_kernel(); - sprintf(buf, "... output truncated!\n"); + lock_mutex_exit_kernel(); + sprintf(buf, "... output truncated!\n"); return; } @@ -3826,8 +3977,8 @@ lock_print_info( while (trx) { if (buf_end - buf < 900) { - lock_mutex_exit_kernel(); - sprintf(buf, "... output truncated!\n"); + lock_mutex_exit_kernel(); + sprintf(buf, "... output truncated!\n"); return; } @@ -3879,8 +4030,8 @@ loop: buf += strlen(buf); if (buf_end - buf < 500) { - lock_mutex_exit_kernel(); - sprintf(buf, "... output truncated!\n"); + lock_mutex_exit_kernel(); + sprintf(buf, "... output truncated!\n"); return; } @@ -3936,7 +4087,7 @@ loop: } if (buf_end - buf < 500) { - lock_mutex_exit_kernel(); + lock_mutex_exit_kernel(); sprintf(buf, "... output truncated!\n"); return; @@ -4080,7 +4231,8 @@ lock_rec_queue_validate( if (impl_trx && lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, rec, impl_trx)) { - ut_a(lock_rec_has_expl(LOCK_X, rec, impl_trx)); + ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec, + impl_trx)); } } @@ -4095,7 +4247,8 @@ lock_rec_queue_validate( if (impl_trx && lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT, rec, impl_trx)) { - ut_a(lock_rec_has_expl(LOCK_X, rec, impl_trx)); + ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec, + impl_trx)); } } @@ -4359,8 +4512,8 @@ lock_rec_insert_check_and_lock( *inherit = TRUE; - /* If another transaction has an explicit lock request, gap or not, - waiting or granted, on the successor, the insert has to wait. + /* If another transaction has an explicit lock request which locks + the gap, waiting or granted, on the successor, the insert has to wait. An exception is the case where the lock by the another transaction is a gap type lock which it placed to wait for its turn to insert. We @@ -4369,8 +4522,10 @@ lock_rec_insert_check_and_lock( had to wait for their insert. Both had waiting gap type lock requests on the successor, which produced an unnecessary deadlock. */ - if (lock_rec_other_has_conflicting(LOCK_X, LOCK_GAP, - LOCK_INSERT_INTENTION, next_rec, trx)) { + if (lock_rec_other_has_conflicting(LOCK_X | LOCK_GAP + | LOCK_INSERT_INTENTION, next_rec, trx)) { + + /* Note that we may get DB_SUCCESS also here! 
*/ err = lock_rec_enqueue_waiting(LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION, next_rec, index, thr); @@ -4418,9 +4573,11 @@ lock_rec_convert_impl_to_expl( /* If the transaction has no explicit x-lock set on the record, set one for it */ - if (!lock_rec_has_expl(LOCK_X, rec, impl_trx)) { + if (!lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, rec, + impl_trx)) { - lock_rec_add_to_queue(LOCK_REC | LOCK_X, rec, index, + lock_rec_add_to_queue(LOCK_REC | LOCK_X + | LOCK_REC_NOT_GAP, rec, index, impl_trx); } } @@ -4466,7 +4623,7 @@ lock_clust_rec_modify_check_and_lock( lock_rec_convert_impl_to_expl(rec, index); - err = lock_rec_lock(TRUE, LOCK_X, rec, index, thr); + err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP, rec, index, thr); lock_mutex_exit_kernel(); @@ -4511,7 +4668,7 @@ lock_sec_rec_modify_check_and_lock( ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX)); - err = lock_rec_lock(TRUE, LOCK_X, rec, index, thr); + err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP, rec, index, thr); lock_mutex_exit_kernel(); @@ -4545,6 +4702,8 @@ lock_sec_rec_read_check_and_lock( ulint mode, /* in: mode of the lock which the read cursor should set on records: LOCK_S or LOCK_X; the latter is possible in SELECT FOR UPDATE */ + ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ que_thr_t* thr) /* in: query thread */ { ulint err; @@ -4576,7 +4735,7 @@ lock_sec_rec_read_check_and_lock( lock_rec_convert_impl_to_expl(rec, index); } - err = lock_rec_lock(FALSE, mode, rec, index, thr); + err = lock_rec_lock(FALSE, mode | gap_mode, rec, index, thr); lock_mutex_exit_kernel(); @@ -4607,13 +4766,16 @@ lock_clust_rec_read_check_and_lock( ulint mode, /* in: mode of the lock which the read cursor should set on records: LOCK_S or LOCK_X; the latter is possible in SELECT FOR UPDATE */ + ulint gap_mode,/* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ que_thr_t* thr) /* in: query thread */ { ulint err; ut_ad(index->type & DICT_CLUSTERED); ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec)); - + ut_ad(gap_mode == LOCK_ORDINARY || gap_mode == LOCK_GAP + || gap_mode == LOCK_REC_NOT_GAP); if (flags & BTR_NO_LOCKING_FLAG) { return(DB_SUCCESS); @@ -4631,7 +4793,7 @@ lock_clust_rec_read_check_and_lock( lock_rec_convert_impl_to_expl(rec, index); } - err = lock_rec_lock(FALSE, mode, rec, index, thr); + err = lock_rec_lock(FALSE, mode | gap_mode, rec, index, thr); lock_mutex_exit_kernel(); diff --git a/innobase/mem/mem0dbg.c b/innobase/mem/mem0dbg.c index 23585e494b8..22d0bab0da2 100644 --- a/innobase/mem/mem0dbg.c +++ b/innobase/mem/mem0dbg.c @@ -347,9 +347,19 @@ mem_hash_remove( NULL, NULL); if (error) { printf("Inconsistency in memory heap or buffer n:o %lu created\n", - node->nth_heap); + node->nth_heap); printf("in %s line %lu and tried to free in %s line %lu.\n", node->file_name, node->line, file_name, line); + + printf( + "Hex dump of 400 bytes around memory heap first block start:\n"); + + ut_print_buf((byte*)(node->heap) - 200, 400); + + printf("\nDump of the mem heap:\n"); + + mem_heap_validate_or_print(node->heap, NULL, TRUE, &error, &size, + NULL, NULL); ut_error; } diff --git a/innobase/os/os0file.c b/innobase/os/os0file.c index 098d5b25e89..9eae358c7fb 100644 --- a/innobase/os/os0file.c +++ b/innobase/os/os0file.c @@ -148,7 +148,7 @@ Gets the operating system version. Currently works only on Windows. 
*/ ulint os_get_os_version(void) /*===================*/ - /* out: OS_WIN95, OS_WIN31, OS_WINNT (2000 == NT) */ + /* out: OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */ { #ifdef __WIN__ OSVERSIONINFO os_info; @@ -162,7 +162,11 @@ os_get_os_version(void) } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) { return(OS_WIN95); } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) { - return(OS_WINNT); + if (os_info.dwMajorVersion <= 4) { + return(OS_WINNT); + } else { + return(OS_WIN2000); + } } else { ut_error; return(0); @@ -268,9 +272,7 @@ os_file_get_last_error(void) } /******************************************************************** -Does error handling when a file operation fails. If we have run out -of disk space, then the user can clean the disk. If we do not find -a specified file, then the user can copy it to disk. */ +Does error handling when a file operation fails. */ static ibool os_file_handle_error( @@ -503,7 +505,11 @@ try_again: value 2 denotes that we do not flush the log at every commit, but only once per second */ } else { - attributes = attributes | FILE_FLAG_NO_BUFFERING; + if (srv_win_file_flush_method == + SRV_WIN_IO_UNBUFFERED) { + attributes = attributes + | FILE_FLAG_NO_BUFFERING; + } } #endif } else if (purpose == OS_FILE_NORMAL) { @@ -514,7 +520,11 @@ try_again: value 2 denotes that we do not flush the log at every commit, but only once per second */ } else { - attributes = attributes | FILE_FLAG_NO_BUFFERING; + if (srv_win_file_flush_method == + SRV_WIN_IO_UNBUFFERED) { + attributes = attributes + | FILE_FLAG_NO_BUFFERING; + } } #endif } else { @@ -1752,6 +1762,7 @@ os_aio( os_aio_array_t* array; os_aio_slot_t* slot; #ifdef WIN_ASYNC_IO + ibool retval; BOOL ret = TRUE; DWORD len = n; void* dummy_mess1; @@ -1824,6 +1835,8 @@ try_again: if (os_aio_use_native_aio) { #ifdef WIN_ASYNC_IO os_n_file_reads++; + os_bytes_read_since_printout += len; + ret = ReadFile(file, buf, (DWORD)n, &len, &(slot->control)); #elif defined(POSIX_ASYNC_IO) @@ -1870,10 +1883,12 @@ try_again: where we also use async i/o: in Windows we must use the same wait mechanism as for async i/o */ - return(os_aio_windows_handle(ULINT_UNDEFINED, + retval = os_aio_windows_handle(ULINT_UNDEFINED, slot->pos, &dummy_mess1, &dummy_mess2, - &dummy_type)); + &dummy_type); + + return(retval); } return(TRUE); @@ -1897,8 +1912,6 @@ try_again: goto try_again; } - ut_error; - return(FALSE); } @@ -1958,14 +1971,14 @@ os_aio_windows_handle( n = array->n_slots / array->n_segments; if (array == os_aio_sync_array) { - srv_io_thread_op_info[orig_seg] = "wait windows aio for 1 page"; + srv_io_thread_op_info[orig_seg] = "wait Windows aio for 1 page"; ut_ad(pos < array->n_slots); os_event_wait(array->events[pos]); i = pos; } else { srv_io_thread_op_info[orig_seg] = - "wait windows aio for n pages"; + "wait Windows aio"; i = os_event_wait_multiple(n, (array->events) + segment * n); } @@ -1991,10 +2004,8 @@ os_aio_windows_handle( ut_a(TRUE == os_file_flush(slot->file)); } } else { - os_file_get_last_error(); - - ut_error; - + os_file_handle_error(slot->file, slot->name); + ret_val = FALSE; } diff --git a/innobase/os/os0proc.c b/innobase/os/os0proc.c index 43a2db4d306..1ee448a4a44 100644 --- a/innobase/os/os0proc.c +++ b/innobase/os/os0proc.c @@ -19,6 +19,23 @@ Created 9/30/1995 Heikki Tuuri #include "ut0mem.h" /******************************************************************** +Converts the current process id to a number. It is not guaranteed that the +number is unique. 
In Linux returns the 'process number' of the current +thread. That number is the same as one sees in 'top', for example. In Linux +the thread id is not the same as one sees in 'top'. */ + +ulint +os_proc_get_number(void) +/*====================*/ +{ +#ifdef __WIN__ + return((ulint)GetCurrentProcessId()); +#else + return((ulint)getpid()); +#endif +} + +/******************************************************************** Allocates non-cacheable memory. */ void* diff --git a/innobase/page/page0cur.c b/innobase/page/page0cur.c index 2909573b14b..bb49e9080ce 100644 --- a/innobase/page/page0cur.c +++ b/innobase/page/page0cur.c @@ -169,7 +169,7 @@ page_cur_search_with_match( ut_ad(dtuple_check_typed(tuple)); ut_ad((mode == PAGE_CUR_L) || (mode == PAGE_CUR_LE) || (mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE) - || (mode == PAGE_CUR_DBG)); + || (mode == PAGE_CUR_LE_OR_EXTENDS) || (mode == PAGE_CUR_DBG)); #ifdef PAGE_CUR_ADAPT if ((page_header_get_field(page, PAGE_LEVEL) == 0) @@ -232,9 +232,26 @@ page_cur_search_with_match( low_matched_bytes = cur_matched_bytes; } else if (cmp == -1) { - up = mid; - up_matched_fields = cur_matched_fields; - up_matched_bytes = cur_matched_bytes; + + if (mode == PAGE_CUR_LE_OR_EXTENDS + && dfield_get_len(dtuple_get_nth_field(tuple, + cur_matched_fields)) + == cur_matched_bytes + && rec_get_nth_field_len(mid_rec, + cur_matched_fields) + != UNIV_SQL_NULL) { + + /* This means current dfield is not SQL + NULL, and the current rec field extends it */ + + low = mid; + low_matched_fields = cur_matched_fields; + low_matched_bytes = cur_matched_bytes; + } else { + up = mid; + up_matched_fields = cur_matched_fields; + up_matched_bytes = cur_matched_bytes; + } } else if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_LE)) { low = mid; @@ -252,8 +269,8 @@ page_cur_search_with_match( slot = page_dir_get_nth_slot(page, up); up_rec = page_dir_slot_get_rec(slot); - /* Perform linear search until the upper and lower records - come to distance 1 of each other. */ + /* Perform linear search until the upper and lower records come to + distance 1 of each other. */ while (page_rec_get_next(low_rec) != up_rec) { @@ -272,10 +289,25 @@ page_cur_search_with_match( low_matched_bytes = cur_matched_bytes; } else if (cmp == -1) { - up_rec = mid_rec; - up_matched_fields = cur_matched_fields; - up_matched_bytes = cur_matched_bytes; - + if (mode == PAGE_CUR_LE_OR_EXTENDS + && dfield_get_len(dtuple_get_nth_field(tuple, + cur_matched_fields)) + == cur_matched_bytes + && rec_get_nth_field_len(mid_rec, + cur_matched_fields) + != UNIV_SQL_NULL) { + + /* This means current dfield is not SQL + NULL, and the current rec field extends it */ + + low = mid; + low_matched_fields = cur_matched_fields; + low_matched_bytes = cur_matched_bytes; + } else { + up_rec = mid_rec; + up_matched_fields = cur_matched_fields; + up_matched_bytes = cur_matched_bytes; + } } else if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_LE)) { low_rec = mid_rec; low_matched_fields = cur_matched_fields; diff --git a/innobase/page/page0page.c b/innobase/page/page0page.c index ed74736c8da..7d0d88c6afc 100644 --- a/innobase/page/page0page.c +++ b/innobase/page/page0page.c @@ -1313,6 +1313,194 @@ page_rec_validate( } /******************************************************************* +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. 
*/ + +ibool +page_simple_validate( +/*=================*/ + /* out: TRUE if ok */ + page_t* page) /* in: index page */ +{ + page_cur_t cur; + page_dir_slot_t* slot; + ulint slot_no; + ulint n_slots; + rec_t* rec; + byte* rec_heap_top; + ulint count; + ulint own_count; + ibool ret = FALSE; + + /* Check first that the record heap and the directory do not + overlap. */ + + n_slots = page_dir_get_n_slots(page); + + if (n_slots > UNIV_PAGE_SIZE / 4) { + fprintf(stderr, + "Nonsensical number %lu of page dir slots\n", n_slots); + + goto func_exit; + } + + rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP); + + if (rec_heap_top > page_dir_get_nth_slot(page, n_slots - 1)) { + + fprintf(stderr, + "Record heap and dir overlap on a page, heap top %lu, dir %lu\n", + (ulint)(page_header_get_ptr(page, PAGE_HEAP_TOP) - page), + (ulint)(page_dir_get_nth_slot(page, n_slots - 1) - page)); + + goto func_exit; + } + + /* Validate the record list in a loop checking also that it is + consistent with the page record directory. */ + + count = 0; + own_count = 1; + slot_no = 0; + slot = page_dir_get_nth_slot(page, slot_no); + + page_cur_set_before_first(page, &cur); + + for (;;) { + rec = (&cur)->rec; + + if (rec > rec_heap_top) { + fprintf(stderr, + "Record %lu is above rec heap top %lu\n", + (ulint)(rec - page), (ulint)(rec_heap_top - page)); + + goto func_exit; + } + + if (rec_get_n_owned(rec) != 0) { + /* This is a record pointed to by a dir slot */ + if (rec_get_n_owned(rec) != own_count) { + + fprintf(stderr, + "Wrong owned count %lu, %lu, rec %lu\n", + rec_get_n_owned(rec), own_count, + (ulint)(rec - page)); + + goto func_exit; + } + + if (page_dir_slot_get_rec(slot) != rec) { + fprintf(stderr, + "Dir slot does not point to right rec %lu\n", + (ulint)(rec - page)); + + goto func_exit; + } + + own_count = 0; + + if (!page_cur_is_after_last(&cur)) { + slot_no++; + slot = page_dir_get_nth_slot(page, slot_no); + } + } + + if (page_cur_is_after_last(&cur)) { + + break; + } + + if (rec_get_next_offs(rec) < FIL_PAGE_DATA + || rec_get_next_offs(rec) >= UNIV_PAGE_SIZE) { + fprintf(stderr, + "Next record offset nonsensical %lu for rec %lu\n", + rec_get_next_offs(rec), + (ulint)(rec - page)); + + goto func_exit; + } + + count++; + + if (count > UNIV_PAGE_SIZE) { + fprintf(stderr, + "Page record list appears to be circular %lu\n", + count); + goto func_exit; + } + + page_cur_move_to_next(&cur); + own_count++; + } + + if (rec_get_n_owned(rec) == 0) { + fprintf(stderr, "n owned is zero in a supremum rec\n"); + + goto func_exit; + } + + if (slot_no != n_slots - 1) { + fprintf(stderr, "n slots wrong %lu, %lu\n", + slot_no, n_slots - 1); + goto func_exit; + } + + if (page_header_get_field(page, PAGE_N_RECS) + 2 != count + 1) { + fprintf(stderr, "n recs wrong %lu %lu\n", + page_header_get_field(page, PAGE_N_RECS) + 2, count + 1); + + goto func_exit; + } + + /* Check then the free list */ + rec = page_header_get_ptr(page, PAGE_FREE); + + while (rec != NULL) { + if (rec < page + FIL_PAGE_DATA + || rec >= page + UNIV_PAGE_SIZE) { + fprintf(stderr, + "Free list record has a nonsensical offset %lu\n", + (ulint)(rec - page)); + + goto func_exit; + } + + if (rec > rec_heap_top) { + fprintf(stderr, + "Free list record %lu is above rec heap top %lu\n", + (ulint)(rec - page), (ulint)(rec_heap_top - page)); + + goto func_exit; + } + + count++; + + if (count > UNIV_PAGE_SIZE) { + fprintf(stderr, + "Page free list appears to be circular %lu\n", + count); + goto func_exit; + } + + rec = page_rec_get_next(rec); + } + + if 
(page_header_get_field(page, PAGE_N_HEAP) != count + 1) { + + fprintf(stderr, "N heap is wrong %lu, %lu\n", + page_header_get_field(page, PAGE_N_HEAP), count + 1); + + goto func_exit; + } + + ret = TRUE; + +func_exit: + return(ret); +} + +/******************************************************************* This function checks the consistency of an index page. */ ibool @@ -1339,6 +1527,14 @@ page_validate( ulint i; char err_buf[1000]; + if (!page_simple_validate(page)) { + buf_page_print(page); + + fprintf(stderr, "Apparent corruption in a page in index %s\n", + index->name); + return(FALSE); + } + heap = mem_heap_create(UNIV_PAGE_SIZE); /* The following buffer is used to check that the diff --git a/innobase/pars/lexyy.c b/innobase/pars/lexyy.c index 782fca35f66..f7edc9d195f 100644 --- a/innobase/pars/lexyy.c +++ b/innobase/pars/lexyy.c @@ -4,8 +4,6 @@ * $Header: /home/daffy/u0/vern/flex/RCS/flex.skl,v 2.91 96/09/10 16:58:48 vern Exp $ */ -#include "univ.i" - #define FLEX_SCANNER #define YY_FLEX_MAJOR_VERSION 2 #define YY_FLEX_MINOR_VERSION 5 @@ -609,18 +607,13 @@ How to make the InnoDB parser and lexer C files: 6. Remove the #include of unistd.h from about line 2500 of lexyy.c -7. Move #include <math.h> in pars0grm.c after #include "univ.i" to remove - a large file compilation error on AIX. - -8. Move #include "univ.i" in lexyy.c to the file start to remove a large - file compilation error on AIX. - These instructions seem to work at least with bison-1.28 and flex-2.5.4 on Linux. *******************************************************/ #line 36 "pars0lex.l" #define YYSTYPE que_node_t* +#include "univ.i" #include "pars0pars.h" #include "pars0grm.h" #include "pars0sym.h" diff --git a/innobase/pars/pars0grm.c b/innobase/pars/pars0grm.c index ce575063610..05b75398084 100644 --- a/innobase/pars/pars0grm.c +++ b/innobase/pars/pars0grm.c @@ -102,8 +102,6 @@ que_node_t */ #include "que0que.h" #include "row0sel.h" -#include <math.h> - #define YYSTYPE que_node_t* /* #define __STDC__ */ diff --git a/innobase/read/read0read.c b/innobase/read/read0read.c index a5048c0c909..5c1d2d5418e 100644 --- a/innobase/read/read0read.c +++ b/innobase/read/read0read.c @@ -201,6 +201,28 @@ read_view_close( } /************************************************************************* +Closes a consistent read view for MySQL. This function is called at an SQL +statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */ + +void +read_view_close_for_mysql( +/*======================*/ + trx_t* trx) /* in: trx which has a read view */ +{ + ut_a(trx->read_view); + + mutex_enter(&kernel_mutex); + + read_view_close(trx->read_view); + + mem_heap_empty(trx->read_view_heap); + + trx->read_view = NULL; + + mutex_exit(&kernel_mutex); +} + +/************************************************************************* Prints a read view to stderr. */ void diff --git a/innobase/row/row0ins.c b/innobase/row/row0ins.c index 941c9d5759d..4e8b487a0f1 100644 --- a/innobase/row/row0ins.c +++ b/innobase/row/row0ins.c @@ -321,59 +321,6 @@ row_ins_clust_index_entry_by_modify( return(err); } -/******************************************************************* -Checks if a unique key violation to rec would occur at the index entry -insert. */ -static -ibool -row_ins_dupl_error_with_rec( -/*========================*/ - /* out: TRUE if error */ - rec_t* rec, /* in: user record; NOTE that we assume - that the caller already has a record lock on - the record! 
*/ - dtuple_t* entry, /* in: entry to insert */ - dict_index_t* index) /* in: index */ -{ - ulint matched_fields; - ulint matched_bytes; - ulint n_unique; - ulint i; - - n_unique = dict_index_get_n_unique(index); - - matched_fields = 0; - matched_bytes = 0; - - cmp_dtuple_rec_with_match(entry, rec, &matched_fields, &matched_bytes); - - if (matched_fields < n_unique) { - - return(FALSE); - } - - /* In a unique secondary index we allow equal key values if they - contain SQL NULLs */ - - if (!(index->type & DICT_CLUSTERED)) { - - for (i = 0; i < n_unique; i++) { - if (UNIV_SQL_NULL == dfield_get_len( - dtuple_get_nth_field(entry, i))) { - - return(FALSE); - } - } - } - - if (!rec_get_deleted_flag(rec)) { - - return(TRUE); - } - - return(FALSE); -} - /************************************************************************* Either deletes or sets the referencing columns SQL NULL in a child row. Used in ON DELETE ... clause for foreign keys when a parent row is @@ -533,8 +480,12 @@ row_ins_foreign_delete_or_set_null( err = lock_table(0, table, LOCK_IX, thr); if (err == DB_SUCCESS) { + /* Here it suffices to use a LOCK_REC_NOT_GAP type lock; + we already have a normal shared lock on the appropriate + gap if the search criterion was not unique */ + err = lock_clust_rec_read_check_and_lock(0, clust_rec, - clust_index, LOCK_X, thr); + clust_index, LOCK_X, LOCK_REC_NOT_GAP, thr); } if (err != DB_SUCCESS) { @@ -630,12 +581,14 @@ nonstandard_exit_func: /************************************************************************* Sets a shared lock on a record. Used in locking possible duplicate key -records. */ +records and also in checking foreign key constraints. */ static ulint row_ins_set_shared_rec_lock( /*========================*/ /* out: DB_SUCCESS or error code */ + ulint type, /* in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP type lock */ rec_t* rec, /* in: record */ dict_index_t* index, /* in: index */ que_thr_t* thr) /* in: query thread */ @@ -644,10 +597,10 @@ row_ins_set_shared_rec_lock( if (index->type & DICT_CLUSTERED) { err = lock_clust_rec_read_check_and_lock(0, rec, index, LOCK_S, - thr); + type, thr); } else { err = lock_sec_rec_read_check_and_lock(0, rec, index, LOCK_S, - thr); + type, thr); } return(err); @@ -656,7 +609,7 @@ row_ins_set_shared_rec_lock( /******************************************************************* Checks if foreign key constraint fails for an index entry. Sets shared locks which lock either the success or the failure of the constraint. NOTE that -the caller must have a shared latch on dict_foreign_key_check_lock. */ +the caller must have a shared latch on dict_operation_lock. 
*/ ulint row_ins_check_foreign_constraint( @@ -679,7 +632,7 @@ row_ins_check_foreign_constraint( dict_table_t* check_table; dict_index_t* check_index; ulint n_fields_cmp; - ibool timeout_expired; + ibool unique_search; rec_t* rec; btr_pcur_t pcur; ibool moved; @@ -689,7 +642,9 @@ row_ins_check_foreign_constraint( mtr_t mtr; run_again: - ut_ad(rw_lock_own(&dict_foreign_key_check_lock, RW_LOCK_SHARED)); + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED)); + + err = DB_SUCCESS; if (thr_get_trx(thr)->check_foreigns == FALSE) { /* The user has suppressed foreign key checks currently for @@ -748,6 +703,14 @@ run_again: dtuple_set_n_fields_cmp(entry, foreign->n_fields); + if (dict_index_get_n_unique(check_index) <= foreign->n_fields) { + /* We can just set a LOCK_REC_NOT_GAP type lock */ + + unique_search = TRUE; + } else { + unique_search = FALSE; + } + btr_pcur_open(check_index, entry, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur, &mtr); @@ -761,25 +724,45 @@ run_again: goto next_rec; } - /* Try to place a lock on the index record */ - - err = row_ins_set_shared_rec_lock(rec, check_index, thr); - - if (err != DB_SUCCESS) { - - break; - } - if (rec == page_get_supremum_rec(buf_frame_align(rec))) { + err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, rec, + check_index, thr); + if (err != DB_SUCCESS) { + + break; + } + goto next_rec; } cmp = cmp_dtuple_rec(entry, rec); if (cmp == 0) { - if (!rec_get_deleted_flag(rec)) { + if (rec_get_deleted_flag(rec)) { + err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, + rec, check_index, thr); + if (err != DB_SUCCESS) { + + break; + } + } else { /* Found a matching record */ + + if (unique_search) { + err = row_ins_set_shared_rec_lock( + LOCK_REC_NOT_GAP, + rec, check_index, thr); + } else { + err = row_ins_set_shared_rec_lock( + LOCK_ORDINARY, + rec, check_index, thr); + } + + if (err != DB_SUCCESS) { + + break; + } /* printf( "FOREIGN: Found matching record from %s %s\n", @@ -807,6 +790,13 @@ run_again: } if (cmp < 0) { + err = row_ins_set_shared_rec_lock(LOCK_GAP, + rec, check_index, thr); + if (err != DB_SUCCESS) { + + break; + } + if (check_ref) { err = DB_NO_REFERENCED_ROW; } else { @@ -844,14 +834,14 @@ do_possible_lock_wait: que_thr_stop_for_mysql(thr); - timeout_expired = srv_suspend_mysql_thread(thr); + srv_suspend_mysql_thread(thr); - if (!timeout_expired) { + if (thr_get_trx(thr)->error_state == DB_SUCCESS) { goto run_again; } - err = DB_LOCK_WAIT_TIMEOUT; + err = thr_get_trx(thr)->error_state; } return(err); @@ -890,21 +880,21 @@ row_ins_check_foreign_constraints( trx); } - if (!trx->has_dict_foreign_key_check_lock) { + if (!trx->has_dict_operation_lock) { got_s_lock = TRUE; - rw_lock_s_lock(&dict_foreign_key_check_lock); + rw_lock_s_lock(&dict_operation_lock); - trx->has_dict_foreign_key_check_lock = TRUE; + trx->has_dict_operation_lock = TRUE; } err = row_ins_check_foreign_constraint(TRUE, foreign, table, index, entry, thr); if (got_s_lock) { - rw_lock_s_unlock(&dict_foreign_key_check_lock); + rw_lock_s_unlock(&dict_operation_lock); - trx->has_dict_foreign_key_check_lock = FALSE; + trx->has_dict_operation_lock = FALSE; } if (err != DB_SUCCESS) { @@ -919,6 +909,59 @@ row_ins_check_foreign_constraints( } /******************************************************************* +Checks if a unique key violation to rec would occur at the index entry +insert. 
*/ +static +ibool +row_ins_dupl_error_with_rec( +/*========================*/ + /* out: TRUE if error */ + rec_t* rec, /* in: user record; NOTE that we assume + that the caller already has a record lock on + the record! */ + dtuple_t* entry, /* in: entry to insert */ + dict_index_t* index) /* in: index */ +{ + ulint matched_fields; + ulint matched_bytes; + ulint n_unique; + ulint i; + + n_unique = dict_index_get_n_unique(index); + + matched_fields = 0; + matched_bytes = 0; + + cmp_dtuple_rec_with_match(entry, rec, &matched_fields, &matched_bytes); + + if (matched_fields < n_unique) { + + return(FALSE); + } + + /* In a unique secondary index we allow equal key values if they + contain SQL NULLs */ + + if (!(index->type & DICT_CLUSTERED)) { + + for (i = 0; i < n_unique; i++) { + if (UNIV_SQL_NULL == dfield_get_len( + dtuple_get_nth_field(entry, i))) { + + return(FALSE); + } + } + } + + if (!rec_get_deleted_flag(rec)) { + + return(TRUE); + } + + return(FALSE); +} + +/******************************************************************* Scans a unique non-clustered index at a given index entry to determine whether a uniqueness violation has occurred for the key value of the entry. Set shared locks on possible duplicate records. */ @@ -976,9 +1019,10 @@ row_ins_scan_sec_index_for_duplicate( goto next_rec; } - /* Try to place a lock on the index record */ + /* Try to place a lock on the index record */ - err = row_ins_set_shared_rec_lock(rec, index, thr); + err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, rec, index, + thr); if (err != DB_SUCCESS) { @@ -1082,8 +1126,8 @@ row_ins_duplicate_error_in_clust( sure that in roll-forward we get the same duplicate errors as in original execution */ - err = row_ins_set_shared_rec_lock(rec, cursor->index, - thr); + err = row_ins_set_shared_rec_lock(LOCK_REC_NOT_GAP, + rec, cursor->index, thr); if (err != DB_SUCCESS) { return(err); @@ -1105,8 +1149,8 @@ row_ins_duplicate_error_in_clust( if (rec != page_get_supremum_rec(page)) { - err = row_ins_set_shared_rec_lock(rec, cursor->index, - thr); + err = row_ins_set_shared_rec_lock(LOCK_REC_NOT_GAP, + rec, cursor->index, thr); if (err != DB_SUCCESS) { return(err); diff --git a/innobase/row/row0mysql.c b/innobase/row/row0mysql.c index cea8f1316fe..6fde57eb75a 100644 --- a/innobase/row/row0mysql.c +++ b/innobase/row/row0mysql.c @@ -27,6 +27,7 @@ Created 9/17/2000 Heikki Tuuri #include "lock0lock.h" #include "rem0cmp.h" #include "log0log.h" +#include "btr0sea.h" /* A dummy variable used to fool the compiler */ ibool row_mysql_identically_false = FALSE; @@ -203,7 +204,6 @@ row_mysql_handle_errors( que_thr_t* thr, /* in: query thread */ trx_savept_t* savept) /* in: savepoint or NULL */ { - ibool timeout_expired; ulint err; handle_new_error: @@ -240,11 +240,9 @@ handle_new_error: /* MySQL will roll back the latest SQL statement */ } else if (err == DB_LOCK_WAIT) { - timeout_expired = srv_suspend_mysql_thread(thr); - - if (timeout_expired) { - trx->error_state = DB_LOCK_WAIT_TIMEOUT; + srv_suspend_mysql_thread(thr); + if (trx->error_state != DB_SUCCESS) { que_thr_stop_for_mysql(thr); goto handle_new_error; @@ -1146,7 +1144,7 @@ row_mysql_lock_data_dictionary(void) /* Serialize data dictionary operations with dictionary mutex: no deadlocks or lock waits can occur then in these operations */ - rw_lock_x_lock(&(dict_foreign_key_check_lock)); + rw_lock_x_lock(&dict_operation_lock); mutex_enter(&(dict_sys->mutex)); } @@ -1161,7 +1159,7 @@ row_mysql_unlock_data_dictionary(void) no deadlocks can occur then in these operations */ 
mutex_exit(&(dict_sys->mutex)); - rw_lock_x_unlock(&(dict_foreign_key_check_lock)); + rw_lock_x_unlock(&dict_operation_lock); } /************************************************************************* @@ -1184,6 +1182,7 @@ row_create_table_for_mysql( ulint err; ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); ut_ad(mutex_own(&(dict_sys->mutex))); if (srv_created_new_raw) { @@ -1383,7 +1382,8 @@ row_create_index_for_mysql( ulint namelen; ulint keywordlen; ulint err; - + + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); ut_ad(mutex_own(&(dict_sys->mutex))); ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); @@ -1464,6 +1464,7 @@ row_table_add_foreign_constraints( ulint err; ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); ut_a(sql_string); trx->op_info = (char *) "adding foreign keys"; @@ -1846,12 +1847,16 @@ row_drop_table_for_mysql( no deadlocks can occur then in these operations */ if (!has_dict_mutex) { - /* Prevent foreign key checks while we are dropping the table */ - rw_lock_x_lock(&(dict_foreign_key_check_lock)); + /* Prevent foreign key checks etc. while we are dropping the + table */ + rw_lock_x_lock(&dict_operation_lock); mutex_enter(&(dict_sys->mutex)); } + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); + graph = pars_sql(buf); ut_a(graph); @@ -1861,9 +1866,6 @@ row_drop_table_for_mysql( graph->fork_type = QUE_FORK_MYSQL_INTERFACE; - /* Prevent purge from running while we are dropping the table */ - rw_lock_s_lock(&(purge_sys->purge_is_running)); - table = dict_table_get_low(name); if (!table) { @@ -1944,12 +1946,11 @@ row_drop_table_for_mysql( } } -funct_exit: - rw_lock_s_unlock(&(purge_sys->purge_is_running)); +funct_exit: if (!has_dict_mutex) { mutex_exit(&(dict_sys->mutex)); - rw_lock_x_unlock(&(dict_foreign_key_check_lock)); + rw_lock_x_unlock(&dict_operation_lock); } que_graph_free(graph); @@ -1985,7 +1986,7 @@ row_drop_database_for_mysql( trx_start_if_not_started(trx); loop: - rw_lock_x_lock(&(dict_foreign_key_check_lock)); + rw_lock_x_lock(&dict_operation_lock); mutex_enter(&(dict_sys->mutex)); while ((table_name = dict_get_first_table_name_in_db(name))) { @@ -2000,7 +2001,7 @@ loop: if (table->n_mysql_handles_opened > 0) { mutex_exit(&(dict_sys->mutex)); - rw_lock_x_unlock(&(dict_foreign_key_check_lock)); + rw_lock_x_unlock(&dict_operation_lock); ut_print_timestamp(stderr); fprintf(stderr, @@ -2028,7 +2029,7 @@ loop: } mutex_exit(&(dict_sys->mutex)); - rw_lock_x_unlock(&(dict_foreign_key_check_lock)); + rw_lock_x_unlock(&dict_operation_lock); trx_commit_for_mysql(trx); @@ -2165,7 +2166,7 @@ row_rename_table_for_mysql( /* Serialize data dictionary operations with dictionary mutex: no deadlocks can occur then in these operations */ - rw_lock_x_lock(&(dict_foreign_key_check_lock)); + rw_lock_x_lock(&dict_operation_lock); mutex_enter(&(dict_sys->mutex)); table = dict_table_get_low(old_name); @@ -2249,7 +2250,7 @@ row_rename_table_for_mysql( } funct_exit: mutex_exit(&(dict_sys->mutex)); - rw_lock_x_unlock(&(dict_foreign_key_check_lock)); + rw_lock_x_unlock(&dict_operation_lock); que_graph_free(graph); @@ -2394,18 +2395,28 @@ row_check_table_for_mysql( row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL handle */ { - dict_table_t* table = prebuilt->table; + dict_table_t* table = prebuilt->table; dict_index_t* index; ulint n_rows; ulint n_rows_in_table = ULINT_UNDEFINED; - ulint ret = DB_SUCCESS; - + ulint 
ret = DB_SUCCESS; + ulint old_isolation_level; + prebuilt->trx->op_info = (char *) "checking table"; + old_isolation_level = prebuilt->trx->isolation_level; + + /* We must run the index record counts at an isolation level + >= READ COMMITTED, because a dirty read can see a wrong number + of records in some index; to play safe, we use always + REPEATABLE READ here */ + + prebuilt->trx->isolation_level = TRX_ISO_REPEATABLE_READ; + index = dict_table_get_first_index(table); while (index != NULL) { - /* fprintf(stderr, "Validating index %s\n", index->name); */ + /* fprintf(stderr, "Validating index %s\n", index->name); */ if (!btr_validate_tree(index->tree)) { ret = DB_ERROR; @@ -2433,6 +2444,9 @@ row_check_table_for_mysql( index = dict_table_get_next_index(index); } + /* Restore the original isolation level */ + prebuilt->trx->isolation_level = old_isolation_level; + /* We validate also the whole adaptive hash index for all tables at every CHECK TABLE */ diff --git a/innobase/row/row0purge.c b/innobase/row/row0purge.c index 60e057b816e..3d9ae6aad8b 100644 --- a/innobase/row/row0purge.c +++ b/innobase/row/row0purge.c @@ -453,7 +453,9 @@ static ibool row_purge_parse_undo_rec( /*=====================*/ - /* out: TRUE if purge operation required */ + /* out: TRUE if purge operation required: + NOTE that then the CALLER must s-unlock + dict_operation_lock! */ purge_node_t* node, /* in: row undo node */ ibool* updated_extern, /* out: TRUE if an externally stored field @@ -493,18 +495,20 @@ row_purge_parse_undo_rec( return(FALSE); } + /* Prevent DROP TABLE etc. from running when we are doing the purge + for this row */ + + rw_lock_s_lock(&dict_operation_lock); mutex_enter(&(dict_sys->mutex)); node->table = dict_table_get_on_id_low(table_id, thr_get_trx(thr)); - rw_lock_x_lock(&(purge_sys->purge_is_running)); - mutex_exit(&(dict_sys->mutex)); if (node->table == NULL) { /* The table has been dropped: no need to do purge */ - rw_lock_x_unlock(&(purge_sys->purge_is_running)); + rw_lock_s_unlock(&dict_operation_lock); return(FALSE); } @@ -514,7 +518,7 @@ row_purge_parse_undo_rec( if (clust_index == NULL) { /* The table was corrupt in the data dictionary */ - rw_lock_x_unlock(&(purge_sys->purge_is_running)); + rw_lock_s_unlock(&dict_operation_lock); return(FALSE); } @@ -573,6 +577,8 @@ row_purge( } else { purge_needed = row_purge_parse_undo_rec(node, &updated_extern, thr); + /* If purge_needed == TRUE, we must also remember to unlock + dict_operation_lock! 
*/ } if (purge_needed) { @@ -594,7 +600,7 @@ row_purge( btr_pcur_close(&(node->pcur)); } - rw_lock_x_unlock(&(purge_sys->purge_is_running)); + rw_lock_s_unlock(&dict_operation_lock); } /* Do some cleanup */ diff --git a/innobase/row/row0sel.c b/innobase/row/row0sel.c index 4af04251996..fcf48dd15cf 100644 --- a/innobase/row/row0sel.c +++ b/innobase/row/row0sel.c @@ -606,7 +606,7 @@ row_sel_get_clust_rec( /* Try to place a lock on the index record */ err = lock_clust_rec_read_check_and_lock(0, clust_rec, index, - node->row_lock_mode, thr); + node->row_lock_mode, LOCK_ORDINARY, thr); if (err != DB_SUCCESS) { return(err); @@ -621,7 +621,7 @@ row_sel_get_clust_rec( node->read_view)) { err = row_sel_build_prev_vers(node->read_view, plan, - clust_rec, &old_vers, mtr); + clust_rec, &old_vers, mtr); if (err != DB_SUCCESS) { return(err); @@ -678,16 +678,17 @@ sel_set_rec_lock( rec_t* rec, /* in: record */ dict_index_t* index, /* in: index */ ulint mode, /* in: lock mode */ + ulint type, /* in: LOCK_ORDINARY, LOCK_GAP, or LOC_REC_NOT_GAP */ que_thr_t* thr) /* in: query thread */ { ulint err; if (index->type & DICT_CLUSTERED) { err = lock_clust_rec_read_check_and_lock(0, rec, index, mode, - thr); + type, thr); } else { err = lock_sec_rec_read_check_and_lock(0, rec, index, mode, - thr); + type, thr); } return(err); @@ -1154,7 +1155,7 @@ rec_loop: if (!consistent_read) { err = sel_set_rec_lock(page_rec_get_next(rec), index, - node->row_lock_mode, thr); + node->row_lock_mode, LOCK_ORDINARY, thr); if (err != DB_SUCCESS) { /* Note that in this case we will store in pcur the PREDECESSOR of the record we are waiting @@ -1180,8 +1181,8 @@ rec_loop: if (!consistent_read) { /* Try to place a lock on the index record */ - err = sel_set_rec_lock(rec, index, node->row_lock_mode, thr); - + err = sel_set_rec_lock(rec, index, node->row_lock_mode, + LOCK_ORDINARY, thr); if (err != DB_SUCCESS) { goto lock_wait_or_error; @@ -2200,6 +2201,7 @@ row_sel_get_clust_rec_for_mysql( rec_t* old_vers; ulint err; trx_t* trx; + char err_buf[1000]; *out_rec = NULL; @@ -2213,14 +2215,40 @@ row_sel_get_clust_rec_for_mysql( clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur); - ut_ad(page_rec_is_user_rec(clust_rec)); + if (!page_rec_is_user_rec(clust_rec)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: error clustered record for sec rec not found\n" + "InnoDB: index %s table %s\n", sec_index->name, + sec_index->table->name); + + rec_sprintf(err_buf, 900, rec); + fprintf(stderr, "InnoDB: sec index record %s\n", err_buf); + + rec_sprintf(err_buf, 900, clust_rec); + fprintf(stderr, "InnoDB: clust index record %s\n", err_buf); + + trx_print(err_buf, trx); + + fprintf(stderr, + "%s\nInnoDB: Make a detailed bug report and send it\n", + err_buf); + fprintf(stderr, "InnoDB: to mysql@lists.mysql.com\n"); + + clust_rec = NULL; + + goto func_exit; + } if (prebuilt->select_lock_type != LOCK_NONE) { - /* Try to place a lock on the index record */ + /* Try to place a lock on the index record; we are searching + the clust rec with a unique condition, hence + we set a LOCK_REC_NOT_GAP type lock */ err = lock_clust_rec_read_check_and_lock(0, clust_rec, clust_index, - prebuilt->select_lock_type, thr); + prebuilt->select_lock_type, + LOCK_REC_NOT_GAP, thr); if (err != DB_SUCCESS) { return(err); @@ -2232,8 +2260,12 @@ row_sel_get_clust_rec_for_mysql( trx = thr_get_trx(thr); old_vers = NULL; - - if (!lock_clust_rec_cons_read_sees(clust_rec, clust_index, + + /* If the isolation level allows reading of uncommitted data, + then we never look for 
an earlier version */ + + if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED + && !lock_clust_rec_cons_read_sees(clust_rec, clust_index, trx->read_view)) { err = row_sel_build_prev_vers_for_mysql( @@ -2275,6 +2307,7 @@ row_sel_get_clust_rec_for_mysql( } } +func_exit: *out_rec = clust_rec; if (prebuilt->select_lock_type == LOCK_X) { @@ -2407,7 +2440,7 @@ row_sel_push_cache_row_for_mysql( /************************************************************************* Tries to do a shortcut to fetch a clustered index record with a unique key, using the hash index if possible (not always). We assume that the search -mode is PAGE_CUR_GE, it is a consistent read, trx has already a read view, +mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx, btr search latch has been locked in S-mode. */ static ulint @@ -2426,7 +2459,7 @@ row_sel_try_search_shortcut_for_mysql( ut_ad(index->type & DICT_CLUSTERED); ut_ad(!prebuilt->templ_contains_blob); - + btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF, pcur, #ifndef UNIV_SEARCH_DEBUG @@ -2516,17 +2549,22 @@ row_search_for_mysql( ibool was_lock_wait; ulint ret; ulint shortcut; + ibool unique_search = FALSE; ibool unique_search_from_clust_index = FALSE; ibool mtr_has_extra_clust_latch = FALSE; ibool moves_up = FALSE; + ibool set_also_gap_locks = TRUE; + /* if the query is a plain + locking SELECT, and the isolation + level is <= TRX_ISO_READ_COMMITTED, + then this is set to FALSE */ + ibool success; ulint cnt = 0; mtr_t mtr; ut_ad(index && pcur && search_tuple); ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); - - ut_ad(sync_thread_levels_empty_gen(FALSE)); - + if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) { fprintf(stderr, "InnoDB: Error: trying to free a corrupt\n" @@ -2543,6 +2581,9 @@ row_search_for_mysql( printf("N tables locked %lu\n", trx->mysql_n_tables_locked); */ + /*-------------------------------------------------------------*/ + /* PHASE 1: Try to pop the row from the prefetch cache */ + if (direction == 0) { trx->op_info = (char *) "starting index read"; @@ -2608,18 +2649,35 @@ row_search_for_mysql( mtr_start(&mtr); - /* Since we must release the search system latch when we retrieve an - externally stored field, we cannot use the adaptive hash index in a - search in the case the row may be long and there may be externally - stored fields */ + /* In a search where at most one record in the index may match, we + can use a LOCK_REC_NOT_GAP type record lock when locking a non-delete + marked matching record. + + Note that in a unique secondary index there may be different delete + marked versions of a record where only the primary key values differ: + thus in a secondary index we must use next-key locks when locking + delete marked records. */ if (match_mode == ROW_SEL_EXACT - && index->type & DICT_UNIQUE - && index->type & DICT_CLUSTERED - && !prebuilt->templ_contains_blob - && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8) - && dtuple_get_n_fields(search_tuple) + && index->type & DICT_UNIQUE + && dtuple_get_n_fields(search_tuple) == dict_index_get_n_unique(index)) { + unique_search = TRUE; + } + + /*-------------------------------------------------------------*/ + /* PHASE 2: Try fast adaptive hash index search if possible */ + + /* Next test if this is the special case where we can use the fast + adaptive hash index to try the search. 
Since we must release the + search system latch when we retrieve an externally stored field, we + cannot use the adaptive hash index in a search in the case the row + may be long and there may be externally stored fields */ + + if (unique_search + && index->type & DICT_CLUSTERED + && !prebuilt->templ_contains_blob + && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) { if (direction == ROW_SEL_NEXT) { /* MySQL sometimes seems to do fetch next even @@ -2642,8 +2700,9 @@ row_search_for_mysql( unique_search_from_clust_index = TRUE; - if (trx->mysql_n_tables_locked == 0 - && !prebuilt->sql_stat_start) { + if (prebuilt->select_lock_type == LOCK_NONE + && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED + && trx->read_view) { /* This is a SELECT query done as a consistent read, and the read view has already been allocated: @@ -2722,13 +2781,34 @@ row_search_for_mysql( mtr_start(&mtr); } } -no_shortcut: + +no_shortcut: + /*-------------------------------------------------------------*/ + /* PHASE 3: Open or restore index cursor position */ + if (trx->has_search_latch) { rw_lock_s_unlock(&btr_search_latch); trx->has_search_latch = FALSE; } trx_start_if_not_started(trx); + + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED + && prebuilt->select_lock_type != LOCK_NONE + && trx->mysql_query_str) { + + /* Scan the MySQL query string; check if SELECT is the first + word there */ + + dict_accept(*trx->mysql_query_str, "SELECT", &success); + + if (success) { + /* It is a plain locking SELECT and the isolation + level is low: do not lock gaps */ + + set_also_gap_locks = FALSE; + } + } /* Note that if the search mode was GE or G, then the cursor naturally moves upward (in fetch next) in alphabetical order, @@ -2793,8 +2873,10 @@ no_shortcut: prebuilt->sql_stat_start = FALSE; } - /*-------------------------------------------------------------*/ rec_loop: + /*-------------------------------------------------------------*/ + /* PHASE 4: Look for matching records in a loop */ + cons_read_requires_clust_rec = FALSE; rec = btr_pcur_get_rec(pcur); @@ -2812,22 +2894,24 @@ rec_loop: goto next_rec; } - - if (prebuilt->select_lock_type != LOCK_NONE) { - /* Try to place a lock on the index record */ - err = sel_set_rec_lock(rec, index, prebuilt->select_lock_type, - thr); - if (err != DB_SUCCESS) { + if (rec == page_get_supremum_rec(buf_frame_align(rec))) { - goto lock_wait_or_error; - } - } + if (prebuilt->select_lock_type != LOCK_NONE + && set_also_gap_locks) { - if (rec == page_get_supremum_rec(buf_frame_align(rec))) { + /* Try to place a lock on the index record */ + + err = sel_set_rec_lock(rec, index, + prebuilt->select_lock_type, + LOCK_ORDINARY, thr); + if (err != DB_SUCCESS) { + goto lock_wait_or_error; + } + } /* A page supremum record cannot be in the result set: skip - it now when we have placed a possible lock on it */ + it now that we have placed a possible lock on it */ goto next_rec; } @@ -2850,6 +2934,19 @@ rec_loop: if (0 != cmp_dtuple_rec(search_tuple, rec)) { + if (prebuilt->select_lock_type != LOCK_NONE + && set_also_gap_locks) { + /* Try to place a lock on the index record */ + + err = sel_set_rec_lock(rec, index, + prebuilt->select_lock_type, + LOCK_GAP, thr); + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } + btr_pcur_store_position(pcur, &mtr); ret = DB_RECORD_NOT_FOUND; @@ -2862,6 +2959,19 @@ rec_loop: if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec)) { + if (prebuilt->select_lock_type != LOCK_NONE + && set_also_gap_locks) { + /* Try to place a lock on the index record */ + + 
err = sel_set_rec_lock(rec, index, + prebuilt->select_lock_type, + LOCK_GAP, thr); + if (err != DB_SUCCESS) { + + goto lock_wait_or_error; + } + } + btr_pcur_store_position(pcur, &mtr); ret = DB_RECORD_NOT_FOUND; @@ -2874,16 +2984,39 @@ rec_loop: /* We are ready to look at a possible new index entry in the result set: the cursor is now placed on a user record */ - /* Get the right version of the row in a consistent read */ - - if (prebuilt->select_lock_type == LOCK_NONE) { + if (prebuilt->select_lock_type != LOCK_NONE) { + /* Try to place a lock on the index record; note that delete + marked records are a special case in a unique search. If there + is a non-delete marked record, then it is enough to lock its + existence with LOCK_REC_NOT_GAP. */ + + if (!set_also_gap_locks + || (unique_search && !rec_get_deleted_flag(rec))) { + err = sel_set_rec_lock(rec, index, + prebuilt->select_lock_type, + LOCK_REC_NOT_GAP, thr); + } else { + err = sel_set_rec_lock(rec, index, + prebuilt->select_lock_type, + LOCK_ORDINARY, thr); + } + + if (err != DB_SUCCESS) { + goto lock_wait_or_error; + } + } else { /* This is a non-locking consistent read: if necessary, fetch a previous version of the record */ cons_read_requires_clust_rec = FALSE; - if (index == clust_index) { + if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) { + + /* Do nothing: we let a non-locking SELECT read the + latest version of the record */ + + } else if (index == clust_index) { if (!lock_clust_rec_cons_read_sees(rec, index, trx->read_view)) { @@ -3020,8 +3153,11 @@ got_row: ret = DB_SUCCESS; goto normal_return; - /*-------------------------------------------------------------*/ + next_rec: + /*-------------------------------------------------------------*/ + /* PHASE 5: Move the cursor to the next index record */ + if (mtr_has_extra_clust_latch) { /* We must commit mtr if we are moving to the next non-clustered index record, because we could break the @@ -3064,8 +3200,10 @@ next_rec: cnt++; goto rec_loop; - /*-------------------------------------------------------------*/ + lock_wait_or_error: + /*-------------------------------------------------------------*/ + btr_pcur_store_position(pcur, &mtr); mtr_commit(&mtr); @@ -3096,6 +3234,7 @@ lock_wait_or_error: return(err); normal_return: + /*-------------------------------------------------------------*/ que_thr_stop_for_mysql_no_error(thr, trx); mtr_commit(&mtr); @@ -3156,10 +3295,12 @@ row_search_check_if_query_cache_permitted( ret = TRUE; - /* Assign a read view for the transaction if it does not yet - have one */ + /* If the isolation level is high, assign a read view for the + transaction if it does not yet have one */ + + if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ + && !trx->read_view) { - if (!trx->read_view) { trx->read_view = read_view_open_now(trx, trx->read_view_heap); } diff --git a/innobase/row/row0uins.c b/innobase/row/row0uins.c index 9990f893432..fff67dcd627 100644 --- a/innobase/row/row0uins.c +++ b/innobase/row/row0uins.c @@ -254,7 +254,8 @@ row_undo_ins_parse_undo_rec( node->table = dict_table_get_on_id(table_id, node->trx); if (node->table == NULL) { - return; + + return; } clust_index = dict_table_get_first_index(node->table); @@ -281,7 +282,7 @@ row_undo_ins( ut_ad(node && thr); ut_ad(node->state == UNDO_NODE_INSERT); - + row_undo_ins_parse_undo_rec(node, thr); if (node->table == NULL) { @@ -292,6 +293,7 @@ row_undo_ins( if (!found) { trx_undo_rec_release(node->trx, node->undo_no); + return(DB_SUCCESS); } diff --git a/innobase/row/row0undo.c 
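[Editor's note] Two isolation-level checks recur in the row0sel.c hunks above: a non-locking read at READ UNCOMMITTED simply uses the newest record version instead of going through the consistent-read view, and the query cache only forces a read view into existence at REPEATABLE READ or above. The following compilable sketch restates both decisions with hypothetical function names; only the comparisons mirror the patch.

#include <stdio.h>

enum { ISO_READ_UNCOMMITTED = 1, ISO_READ_COMMITTED = 2,
       ISO_REPEATABLE_READ = 3, ISO_SERIALIZABLE = 4 };

/* Returns 1 when the caller must fetch an earlier version of the row. */
static int
needs_consistent_read_version(int iso_level, int version_visible)
{
	if (iso_level == ISO_READ_UNCOMMITTED) {

		return(0);	/* read the latest version as-is */
	}

	return(!version_visible);
}

/* Returns 1 when a missing read view should be opened for query caching. */
static int
open_read_view_for_query_cache(int iso_level, int has_read_view)
{
	return(iso_level >= ISO_REPEATABLE_READ && !has_read_view);
}

int
main(void)
{
	printf("%d\n", needs_consistent_read_version(ISO_READ_UNCOMMITTED, 0));	/* 0 */
	printf("%d\n", needs_consistent_read_version(ISO_REPEATABLE_READ, 0));	/* 1 */
	printf("%d\n", open_read_view_for_query_cache(ISO_READ_COMMITTED, 0));	/* 0 */

	return(0);
}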
b/innobase/row/row0undo.c index 5119254f405..b40d36533a4 100644 --- a/innobase/row/row0undo.c +++ b/innobase/row/row0undo.c @@ -211,7 +211,6 @@ row_undo( if (node->state == UNDO_NODE_FETCH_NEXT) { - /* The call below also starts &mtr */ node->undo_rec = trx_roll_pop_top_rec_of_trx(trx, trx->roll_limit, &roll_ptr, @@ -254,6 +253,10 @@ row_undo( } } + /* Prevent DROP TABLE etc. while we are rolling back this row */ + + rw_lock_s_lock(&dict_operation_lock); + if (node->state == UNDO_NODE_INSERT) { err = row_undo_ins(node, thr); @@ -264,6 +267,8 @@ row_undo( err = row_undo_mod(node, thr); } + rw_lock_s_unlock(&dict_operation_lock); + /* Do some cleanup */ btr_pcur_close(&(node->pcur)); diff --git a/innobase/row/row0upd.c b/innobase/row/row0upd.c index 25c82f39da9..0be4f901d16 100644 --- a/innobase/row/row0upd.c +++ b/innobase/row/row0upd.c @@ -79,7 +79,7 @@ ibool row_upd_index_is_referenced( /*========================*/ /* out: TRUE if referenced; NOTE that since - we do not hold dict_foreign_key_check_lock + we do not hold dict_operation_lock when leaving the function, it may be that the referencing table has been dropped when we leave this function: this function is only @@ -95,8 +95,8 @@ row_upd_index_is_referenced( return(FALSE); } - if (!trx->has_dict_foreign_key_check_lock) { - rw_lock_s_lock(&dict_foreign_key_check_lock); + if (!trx->has_dict_operation_lock) { + rw_lock_s_lock(&dict_operation_lock); } foreign = UT_LIST_GET_FIRST(table->referenced_list); @@ -104,8 +104,8 @@ row_upd_index_is_referenced( while (foreign) { if (foreign->referenced_index == index) { - if (!trx->has_dict_foreign_key_check_lock) { - rw_lock_s_unlock(&dict_foreign_key_check_lock); + if (!trx->has_dict_operation_lock) { + rw_lock_s_unlock(&dict_operation_lock); } return(TRUE); @@ -114,8 +114,8 @@ row_upd_index_is_referenced( foreign = UT_LIST_GET_NEXT(referenced_list, foreign); } - if (!trx->has_dict_foreign_key_check_lock) { - rw_lock_s_unlock(&dict_foreign_key_check_lock); + if (!trx->has_dict_operation_lock) { + rw_lock_s_unlock(&dict_operation_lock); } return(FALSE); @@ -162,12 +162,12 @@ row_upd_check_references_constraints( mtr_start(mtr); - if (!trx->has_dict_foreign_key_check_lock) { + if (!trx->has_dict_operation_lock) { got_s_lock = TRUE; - rw_lock_s_lock(&dict_foreign_key_check_lock); + rw_lock_s_lock(&dict_operation_lock); - trx->has_dict_foreign_key_check_lock = TRUE; + trx->has_dict_operation_lock = TRUE; } foreign = UT_LIST_GET_FIRST(table->referenced_list); @@ -189,7 +189,7 @@ row_upd_check_references_constraints( } /* NOTE that if the thread ends up waiting for a lock - we will release dict_foreign_key_check_lock + we will release dict_operation_lock temporarily! But the counter on the table protects 'foreign' from being dropped while the check is running. 
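[Editor's note] The row0undo.c hunk above brackets the undo of each row with an S-lock on dict_operation_lock so that DDL such as DROP TABLE cannot run while the row is being rolled back. The sketch below shows the same bracketing pattern in isolation; pthread_rwlock_t stands in for the InnoDB rw_lock_t, and undo_one_row() is a placeholder for row_undo_ins()/row_undo_mod().

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t dict_operation_lock_ = PTHREAD_RWLOCK_INITIALIZER;

static int
undo_one_row(void)
{
	/* Placeholder for the actual undo of an insert or modify. */

	return(0);
}

static int
undo_row_with_ddl_protection(void)
{
	int	err;

	/* Shared mode: many rollbacks may proceed in parallel, but DDL,
	which takes the latch in exclusive mode, must wait. */

	pthread_rwlock_rdlock(&dict_operation_lock_);

	err = undo_one_row();

	pthread_rwlock_unlock(&dict_operation_lock_);

	return(err);
}

int
main(void)
{
	printf("undo returned %d\n", undo_row_with_ddl_protection());

	return(0);
}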
*/ @@ -212,8 +212,8 @@ row_upd_check_references_constraints( if (err != DB_SUCCESS) { if (got_s_lock) { rw_lock_s_unlock( - &dict_foreign_key_check_lock); - trx->has_dict_foreign_key_check_lock + &dict_operation_lock); + trx->has_dict_operation_lock = FALSE; } @@ -227,8 +227,8 @@ row_upd_check_references_constraints( } if (got_s_lock) { - rw_lock_s_unlock(&dict_foreign_key_check_lock); - trx->has_dict_foreign_key_check_lock = FALSE; + rw_lock_s_unlock(&dict_operation_lock); + trx->has_dict_operation_lock = FALSE; } mem_heap_free(heap); diff --git a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c index d754f603efc..11e45df4ce3 100644 --- a/innobase/srv/srv0srv.c +++ b/innobase/srv/srv0srv.c @@ -135,8 +135,6 @@ byte srv_latin1_ordering[256] /* The sort order table of the latin1 , 0x44, 0x4E, 0x4F, 0x4F, 0x4F, 0x4F, 0x5D, 0xF7 , 0xD8, 0x55, 0x55, 0x55, 0x59, 0x59, 0xDE, 0xFF }; - -ibool srv_use_native_aio = FALSE; ulint srv_pool_size = ULINT_MAX; /* size in database pages; MySQL originally sets this @@ -151,8 +149,9 @@ dulint srv_archive_recovery_limit_lsn; ulint srv_lock_wait_timeout = 1024 * 1024 * 1024; -char* srv_unix_file_flush_method_str = NULL; -ulint srv_unix_file_flush_method = 0; +char* srv_file_flush_method_str = NULL; +ulint srv_unix_file_flush_method = SRV_UNIX_FDATASYNC; +ulint srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; /* If the following is != 0 we do not allow inserts etc. This protects the user from forgetting the innodb_force_recovery keyword to my.cnf */ @@ -281,6 +280,9 @@ time_t srv_last_monitor_time; mutex_t srv_innodb_monitor_mutex; +ulint srv_main_thread_process_no = 0; +ulint srv_main_thread_id = 0; + /* IMPLEMENTATION OF THE SERVER MAIN PROGRAM ========================================= @@ -2046,13 +2048,15 @@ srv_table_reserve_slot_for_mysql(void) } /******************************************************************* -Puts a MySQL OS thread to wait for a lock to be released. */ +Puts a MySQL OS thread to wait for a lock to be released. If an error +occurs during the wait trx->error_state associated with thr is +!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK +are possible errors. DB_DEADLOCK is returned if selective deadlock +resolution chose this transaction as a victim. 
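[Editor's note] The srv0srv.c change documented above alters the calling convention of srv_suspend_mysql_thread(): it no longer returns a timeout flag, but stores DB_SUCCESS, DB_LOCK_WAIT_TIMEOUT, or DB_DEADLOCK (the selective-deadlock-resolution victim case) in trx->error_state. The sketch below shows the caller's side of that convention with simplified, hypothetical types; it is illustrative only.

#include <stdio.h>

enum db_err { DB_SUCCESS_ = 0, DB_LOCK_WAIT_TIMEOUT_ = 1, DB_DEADLOCK_ = 2 };

struct trx_sketch { enum db_err error_state; };

static void
suspend_for_lock_wait(struct trx_sketch* trx, int timed_out, int chosen_as_victim)
{
	trx->error_state = DB_SUCCESS_;

	if (chosen_as_victim) {
		trx->error_state = DB_DEADLOCK_;	/* victim of deadlock resolution */
	} else if (timed_out) {
		trx->error_state = DB_LOCK_WAIT_TIMEOUT_;
	}
}

int
main(void)
{
	struct trx_sketch	trx;

	suspend_for_lock_wait(&trx, 0, 1);

	switch (trx.error_state) {
	case DB_SUCCESS_:		printf("lock granted\n"); break;
	case DB_LOCK_WAIT_TIMEOUT_:	printf("lock wait timeout\n"); break;
	case DB_DEADLOCK_:		printf("rolled back as deadlock victim\n"); break;
	}

	return(0);
}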
*/ -ibool +void srv_suspend_mysql_thread( /*=====================*/ - /* out: TRUE if the lock wait timeout was - exceeded */ que_thr_t* thr) /* in: query thread associated with the MySQL OS thread */ { @@ -2069,13 +2073,15 @@ srv_suspend_mysql_thread( mutex_enter(&kernel_mutex); + trx->error_state = DB_SUCCESS; + if (thr->state == QUE_THR_RUNNING) { /* The lock has already been released: no need to suspend */ mutex_exit(&kernel_mutex); - return(FALSE); + return; } slot = srv_table_reserve_slot_for_mysql(); @@ -2101,18 +2107,18 @@ srv_suspend_mysql_thread( srv_conc_force_exit_innodb(thr_get_trx(thr)); /* Release possible foreign key check latch */ - if (trx->has_dict_foreign_key_check_lock) { + if (trx->has_dict_operation_lock) { - rw_lock_s_unlock(&dict_foreign_key_check_lock); + rw_lock_s_unlock(&dict_operation_lock); } /* Wait for the release */ os_event_wait(event); - if (trx->has_dict_foreign_key_check_lock) { + if (trx->has_dict_operation_lock) { - rw_lock_s_lock(&dict_foreign_key_check_lock); + rw_lock_s_lock(&dict_operation_lock); } /* Return back inside InnoDB */ @@ -2131,10 +2137,9 @@ srv_suspend_mysql_thread( if (srv_lock_wait_timeout < 100000000 && wait_time > (double)srv_lock_wait_timeout) { - return(TRUE); - } - return(FALSE); + trx->error_state = DB_LOCK_WAIT_TIMEOUT; + } } /************************************************************************ @@ -2300,9 +2305,19 @@ srv_sprintf_innodb_monitor( "ROW OPERATIONS\n" "--------------\n"); buf += sprintf(buf, - "%ld queries inside InnoDB, %ld queries in queue; main thread: %s\n", - srv_conc_n_threads, srv_conc_n_waiting_threads, + "%ld queries inside InnoDB, %ld queries in queue\n", + srv_conc_n_threads, srv_conc_n_waiting_threads); +#ifdef UNIV_LINUX + buf += sprintf(buf, + "Main thread process no %lu, state: %s\n", + srv_main_thread_process_no, + srv_main_thread_op_info); +#else + buf += sprintf(buf, + "Main thread id %lu, state: %s\n", + srv_main_thread_id, srv_main_thread_op_info); +#endif buf += sprintf(buf, "Number of rows inserted %lu, updated %lu, deleted %lu, read %lu\n", srv_n_rows_inserted, @@ -2636,6 +2651,9 @@ srv_master_thread( UT_NOT_USED(arg); + srv_main_thread_process_no = os_proc_get_number(); + srv_main_thread_id = os_thread_pf(os_thread_get_curr_id()); + srv_table_reserve_slot(SRV_MASTER); mutex_enter(&kernel_mutex); diff --git a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c index dfa122b2ece..d6d610bb5b8 100644 --- a/innobase/srv/srv0start.c +++ b/innobase/srv/srv0start.c @@ -515,7 +515,7 @@ srv_calc_high32( } /************************************************************************* -Creates or opens the log files. */ +Creates or opens the log files and closes them. */ static ulint open_or_create_log_file( @@ -640,7 +640,7 @@ open_or_create_log_file( } /************************************************************************* -Creates or opens database data files. */ +Creates or opens database data files and closes them. */ static ulint open_or_create_data_files( @@ -965,31 +965,63 @@ innobase_start_or_create_for_mysql(void) srv_is_being_started = TRUE; srv_startup_is_before_trx_rollback_phase = TRUE; + os_aio_use_native_aio = FALSE; + +#ifdef __WIN__ + if (os_get_os_version() == OS_WIN95 + || os_get_os_version() == OS_WIN31 + || os_get_os_version() == OS_WINNT) { + + /* On Win 95, 98, ME, Win32 subsystem for Windows 3.1, + and NT use simulated aio. In NT Windows provides async i/o, + but when run in conjunction with InnoDB Hot Backup, it seemed + to corrupt the data files. 
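[Editor's note] Inside the rewritten wait routine, a transaction that holds dict_operation_lock in shared mode releases it before blocking on the lock-wait event and takes it again after waking, so DDL is not stalled for the whole wait. The fragment below restates that latch handling in a self-contained form; pthread primitives stand in for the InnoDB rw-lock and os_event, and all names are placeholders.

#include <pthread.h>

static pthread_rwlock_t dict_operation_lock_ = PTHREAD_RWLOCK_INITIALIZER;

struct trx_sketch {
	int	has_dict_operation_lock;	/* held in S mode by this trx? */
};

static void
wait_for_lock_grant(void)
{
	/* Placeholder for os_event_wait(): block until the lock is released. */
}

static void
suspend_thread(struct trx_sketch* trx)
{
	if (trx->has_dict_operation_lock) {
		/* Release the latch before possibly waiting a long time. */
		pthread_rwlock_unlock(&dict_operation_lock_);
	}

	wait_for_lock_grant();

	if (trx->has_dict_operation_lock) {
		/* Re-acquire it before returning to the caller. */
		pthread_rwlock_rdlock(&dict_operation_lock_);
	}
}

int
main(void)
{
	struct trx_sketch	trx = { 0 };

	suspend_thread(&trx);

	return(0);
}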
*/ + + os_aio_use_native_aio = FALSE; + } else { + /* On Win 2000 and XP use async i/o */ + os_aio_use_native_aio = TRUE; + } +#endif + if (srv_file_flush_method_str == NULL) { + /* These are the default options */ + + srv_unix_file_flush_method = SRV_UNIX_FDATASYNC; - if (0 == ut_strcmp(srv_unix_file_flush_method_str, "fdatasync")) { + srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; +#ifndef __WIN__ + } else if (0 == ut_strcmp(srv_file_flush_method_str, "fdatasync")) { srv_unix_file_flush_method = SRV_UNIX_FDATASYNC; - } else if (0 == ut_strcmp(srv_unix_file_flush_method_str, "O_DSYNC")) { + } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DSYNC")) { srv_unix_file_flush_method = SRV_UNIX_O_DSYNC; - } else if (0 == ut_strcmp(srv_unix_file_flush_method_str, + } else if (0 == ut_strcmp(srv_file_flush_method_str, "littlesync")) { srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC; - } else if (0 == ut_strcmp(srv_unix_file_flush_method_str, "nosync")) { + } else if (0 == ut_strcmp(srv_file_flush_method_str, "nosync")) { srv_unix_file_flush_method = SRV_UNIX_NOSYNC; +#else + } else if (0 == ut_strcmp(srv_file_flush_method_str, "normal")) { + srv_win_file_flush_method = SRV_WIN_IO_NORMAL; + os_aio_use_native_aio = FALSE; + + } else if (0 == ut_strcmp(srv_file_flush_method_str, "unbuffered")) { + srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; + os_aio_use_native_aio = FALSE; + + } else if (0 == ut_strcmp(srv_file_flush_method_str, + "async_unbuffered")) { + srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; +#endif } else { fprintf(stderr, "InnoDB: Unrecognized value %s for innodb_flush_method\n", - srv_unix_file_flush_method_str); + srv_file_flush_method_str); return(DB_ERROR); } - /* - printf("srv_unix set to %lu\n", srv_unix_file_flush_method); - */ - os_aio_use_native_aio = srv_use_native_aio; - err = srv_boot(); if (err != DB_SUCCESS) { @@ -999,34 +1031,15 @@ innobase_start_or_create_for_mysql(void) /* Restrict the maximum number of file i/o threads */ if (srv_n_file_io_threads > SRV_MAX_N_IO_THREADS) { + srv_n_file_io_threads = SRV_MAX_N_IO_THREADS; } -#if !(defined(WIN_ASYNC_IO) || defined(POSIX_ASYNC_IO)) - /* In simulated aio we currently have use only for 4 threads */ - - os_aio_use_native_aio = FALSE; - - srv_n_file_io_threads = 4; -#endif - -#ifdef __WIN__ - if (os_get_os_version() == OS_WIN95 - || os_get_os_version() == OS_WIN31) { + if (!os_aio_use_native_aio) { + /* In simulated aio we currently have use only for 4 threads */ - /* On Win 95, 98, ME, and Win32 subsystem for Windows 3.1 use - simulated aio */ + srv_n_file_io_threads = 4; - os_aio_use_native_aio = FALSE; - srv_n_file_io_threads = 4; - } else { - /* On NT and Win 2000 always use aio */ - os_aio_use_native_aio = TRUE; - } -#endif - os_aio_use_native_aio = FALSE; - - if (!os_aio_use_native_aio) { os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD * srv_n_file_io_threads, srv_n_file_io_threads, @@ -1047,15 +1060,6 @@ innobase_start_or_create_for_mysql(void) lock_sys_create(srv_lock_table_size); -#ifdef POSIX_ASYNC_IO - if (os_aio_use_native_aio) { - /* There is only one thread per async io array: - one for ibuf i/o, one for log i/o, one for ordinary reads, - one for ordinary writes; we need only 4 i/o threads */ - - srv_n_file_io_threads = 4; - } -#endif /* Create i/o-handler threads: */ for (i = 0; i < srv_n_file_io_threads; i++) { diff --git a/innobase/sync/sync0rw.c b/innobase/sync/sync0rw.c index fe837b119f3..b214bca0470 100644 --- a/innobase/sync/sync0rw.c +++ b/innobase/sync/sync0rw.c @@ -663,7 +663,8 @@ 
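[Editor's note] The srv0start.c hunk above replaces the Unix-only flush-method parsing with a single srv_file_flush_method_str that accepts fdatasync, O_DSYNC, littlesync and nosync on Unix, and normal, unbuffered and async_unbuffered on Windows, falling back to an error for anything else. The sketch below collapses the platform split into one dispatch function for illustration; the function name and the merged enum are hypothetical, and an unrecognised value is reported just as the real code returns DB_ERROR.

#include <stdio.h>
#include <string.h>

enum flush_method {
	FLUSH_FDATASYNC, FLUSH_O_DSYNC, FLUSH_LITTLESYNC, FLUSH_NOSYNC,
	FLUSH_WIN_NORMAL, FLUSH_WIN_UNBUFFERED, FLUSH_WIN_ASYNC_UNBUFFERED,
	FLUSH_UNKNOWN
};

static enum flush_method
parse_flush_method(const char* str)
{
	if (str == NULL)			return(FLUSH_FDATASYNC); /* default */
	if (!strcmp(str, "fdatasync"))		return(FLUSH_FDATASYNC);
	if (!strcmp(str, "O_DSYNC"))		return(FLUSH_O_DSYNC);
	if (!strcmp(str, "littlesync"))		return(FLUSH_LITTLESYNC);
	if (!strcmp(str, "nosync"))		return(FLUSH_NOSYNC);
	if (!strcmp(str, "normal"))		return(FLUSH_WIN_NORMAL);
	if (!strcmp(str, "unbuffered"))		return(FLUSH_WIN_UNBUFFERED);
	if (!strcmp(str, "async_unbuffered"))	return(FLUSH_WIN_ASYNC_UNBUFFERED);

	return(FLUSH_UNKNOWN);
}

int
main(void)
{
	const char*	value = "O_DSYNC";

	if (parse_flush_method(value) == FLUSH_UNKNOWN) {
		fprintf(stderr,
		"InnoDB: Unrecognized value %s for innodb_flush_method\n", value);
		return(1);
	}

	printf("flush method %d accepted\n", parse_flush_method(value));

	return(0);
}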
rw_lock_own( /*========*/ /* out: TRUE if locked */ rw_lock_t* lock, /* in: rw-lock */ - ulint lock_type) /* in: lock type */ + ulint lock_type) /* in: lock type: RW_LOCK_SHARED, + RW_LOCK_EX */ { rw_lock_debug_t* info; diff --git a/innobase/sync/sync0sync.c b/innobase/sync/sync0sync.c index 3ea996afd6b..376be2e723a 100644 --- a/innobase/sync/sync0sync.c +++ b/innobase/sync/sync0sync.c @@ -901,8 +901,7 @@ sync_thread_levels_empty_gen( if (slot->latch != NULL && (!dict_mutex_allowed || (slot->level != SYNC_DICT - && slot->level != SYNC_FOREIGN_KEY_CHECK - && slot->level != SYNC_PURGE_IS_RUNNING))) { + && slot->level != SYNC_DICT_OPERATION))) { lock = slot->latch; mutex = slot->latch; @@ -1087,12 +1086,10 @@ sync_thread_add_level( SYNC_IBUF_PESS_INSERT_MUTEX)); } else if (level == SYNC_DICT_AUTOINC_MUTEX) { ut_a(sync_thread_levels_g(array, SYNC_DICT_AUTOINC_MUTEX)); - } else if (level == SYNC_FOREIGN_KEY_CHECK) { - ut_a(sync_thread_levels_g(array, SYNC_FOREIGN_KEY_CHECK)); + } else if (level == SYNC_DICT_OPERATION) { + ut_a(sync_thread_levels_g(array, SYNC_DICT_OPERATION)); } else if (level == SYNC_DICT_HEADER) { ut_a(sync_thread_levels_g(array, SYNC_DICT_HEADER)); - } else if (level == SYNC_PURGE_IS_RUNNING) { - ut_a(sync_thread_levels_g(array, SYNC_PURGE_IS_RUNNING)); } else if (level == SYNC_DICT) { ut_a(buf_debug_prints || sync_thread_levels_g(array, SYNC_DICT)); diff --git a/innobase/trx/trx0purge.c b/innobase/trx/trx0purge.c index 97362d00b4b..d58240d3c11 100644 --- a/innobase/trx/trx0purge.c +++ b/innobase/trx/trx0purge.c @@ -209,9 +209,6 @@ trx_purge_sys_create(void) purge_sys->purge_undo_no = ut_dulint_zero; purge_sys->next_stored = FALSE; - rw_lock_create(&(purge_sys->purge_is_running)); - rw_lock_set_level(&(purge_sys->purge_is_running), - SYNC_PURGE_IS_RUNNING); rw_lock_create(&(purge_sys->latch)); rw_lock_set_level(&(purge_sys->latch), SYNC_PURGE_LATCH); diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c index 994a6777924..7566fe1839e 100644 --- a/innobase/trx/trx0trx.c +++ b/innobase/trx/trx0trx.c @@ -23,7 +23,7 @@ Created 3/26/1996 Heikki Tuuri #include "srv0srv.h" #include "thr0loc.h" #include "btr0sea.h" - +#include "os0proc.h" /* Copy of the prototype for innobase_mysql_print_thd: this copy MUST be equal to the one in mysql/sql/ha_innobase.cc ! */ @@ -85,12 +85,14 @@ trx_create( trx->conc_state = TRX_NOT_STARTED; trx->start_time = time(NULL); + trx->isolation_level = TRX_ISO_REPEATABLE_READ; trx->check_foreigns = TRUE; trx->check_unique_secondary = TRUE; trx->dict_operation = FALSE; trx->mysql_thd = NULL; + trx->mysql_query_str = NULL; trx->n_mysql_tables_in_use = 0; trx->mysql_n_tables_locked = 0; @@ -132,7 +134,7 @@ trx_create( trx->lock_heap = mem_heap_create_in_buffer(256); UT_LIST_INIT(trx->trx_locks); - trx->has_dict_foreign_key_check_lock = FALSE; + trx->has_dict_operation_lock = FALSE; trx->has_search_latch = FALSE; trx->search_latch_timeout = BTR_SEA_TIMEOUT; @@ -175,6 +177,8 @@ trx_allocate_for_mysql(void) mutex_exit(&kernel_mutex); trx->mysql_thread_id = os_thread_get_curr_id(); + + trx->mysql_process_no = os_proc_get_number(); return(trx); } @@ -1497,9 +1501,12 @@ trx_print( default: buf += sprintf(buf, " state %lu", trx->conc_state); } +#ifdef UNIV_LINUX + buf += sprintf(buf, ", process no %lu", trx->mysql_process_no); +#else buf += sprintf(buf, ", OS thread id %lu", os_thread_pf(trx->mysql_thread_id)); - +#endif if (ut_strlen(trx->op_info) > 0) { buf += sprintf(buf, " %s", trx->op_info); } |
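[Editor's note] trx_create() now initialises trx->isolation_level to TRX_ISO_REPEATABLE_READ, and the numeric ordering of the TRX_ISO_* constants is what the earlier checks rely on ("<= READ COMMITTED" disables gap locks, ">= REPEATABLE READ" keeps a read view for the query cache). The small sketch below shows one plausible mapping from the SQL level names to that ordering; the mapping function is illustrative and not taken from ha_innodb.cc.

#include <stdio.h>
#include <string.h>

enum { TRX_ISO_READ_UNCOMMITTED_ = 1, TRX_ISO_READ_COMMITTED_ = 2,
       TRX_ISO_REPEATABLE_READ_ = 3, TRX_ISO_SERIALIZABLE_ = 4 };

static int
isolation_level_from_name(const char* name)
{
	if (!strcmp(name, "READ-UNCOMMITTED"))	return(TRX_ISO_READ_UNCOMMITTED_);
	if (!strcmp(name, "READ-COMMITTED"))	return(TRX_ISO_READ_COMMITTED_);
	if (!strcmp(name, "SERIALIZABLE"))	return(TRX_ISO_SERIALIZABLE_);

	return(TRX_ISO_REPEATABLE_READ_);	/* the new trx_create() default */
}

int
main(void)
{
	printf("%d\n", isolation_level_from_name("READ-COMMITTED"));	/* 2 */
	printf("%d\n", isolation_level_from_name("REPEATABLE-READ"));	/* 3 */

	return(0);
}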