11 files changed, 253 insertions, 15 deletions
diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c
index 246a60a61cd..3f2476c715b 100644
--- a/innobase/buf/buf0buf.c
+++ b/innobase/buf/buf0buf.c
@@ -34,6 +34,7 @@ Created 11/5/1995 Heikki Tuuri
 #include "ibuf0ibuf.h"
 #include "dict0dict.h"
 #include "log0recv.h"
+#include "log0log.h"
 #include "trx0undo.h"
 #include "srv0srv.h"
 
@@ -273,6 +274,7 @@ buf_page_is_corrupted(
 	ulint	old_checksum;
 	ulint	checksum_field;
 	ulint	old_checksum_field;
+	dulint	current_lsn;
 
 	if (mach_read_from_4(read_buf + FIL_PAGE_LSN + 4)
 	     != mach_read_from_4(read_buf + UNIV_PAGE_SIZE
@@ -284,6 +286,27 @@ buf_page_is_corrupted(
 		return(TRUE);
 	}
 
+#ifndef UNIV_HOTBACKUP
+	if (recv_lsn_checks_on && log_peek_lsn(&current_lsn)) {
+		if (ut_dulint_cmp(current_lsn,
+				  mach_read_from_8(read_buf + FIL_PAGE_LSN))
+				 < 0) {
+			ut_print_timestamp(stderr);
+
+			fprintf(stderr,
+"  InnoDB: Error: page %lu log sequence number %lu %lu\n"
+"InnoDB: is in the future! Current system log sequence number %lu %lu.\n"
+"InnoDB: Your database may be corrupt.\n",
+		        mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
+			ut_dulint_get_high(
+				mach_read_from_8(read_buf + FIL_PAGE_LSN)),
+			ut_dulint_get_low(
+				mach_read_from_8(read_buf + FIL_PAGE_LSN)),
+			ut_dulint_get_high(current_lsn),
+			ut_dulint_get_low(current_lsn));
+		}
+	}
+#endif
 	old_checksum = buf_calc_page_old_checksum(read_buf);
 
 	old_checksum_field = mach_read_from_4(read_buf + UNIV_PAGE_SIZE
diff --git a/innobase/fil/fil0fil.c b/innobase/fil/fil0fil.c
index a8dc357749c..f55df90846c 100644
--- a/innobase/fil/fil0fil.c
+++ b/innobase/fil/fil0fil.c
@@ -831,6 +831,34 @@ fil_space_release_free_extents(
 	mutex_exit(&(system->mutex));
 }
 
+/***********************************************************************
+Gets the number of reserved extents. If the database is silent, this number
+should be zero. */
+
+ulint
+fil_space_get_n_reserved_extents(
+/*=============================*/
+	ulint	id)		/* in: space id */
+{
+	fil_space_t*	space;
+	fil_system_t*	system		= fil_system;
+	ulint		n;
+
+	ut_ad(system);
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+	
+	ut_a(space);
+
+	n = space->n_reserved_extents;
+	
+	mutex_exit(&(system->mutex));
+
+	return(n);
+}
+
 /************************************************************************
 Prepares a file node for i/o. Opens the file if it is closed. Updates the
 pending i/o's field in the node and the system appropriately. Takes the node
diff --git a/innobase/fsp/fsp0fsp.c b/innobase/fsp/fsp0fsp.c
index b6941d80e90..20bf4972f64 100644
--- a/innobase/fsp/fsp0fsp.c
+++ b/innobase/fsp/fsp0fsp.c
@@ -2709,10 +2709,52 @@ fseg_free_page_low(
 		return;
 	}
 
+/*
+	fprintf(stderr,
+"InnoDB: InnoDB is freeing space %lu page %lu,\n"
+"InnoDB: which belongs to descr seg %lu %lu\n"
+"InnoDB: segment %lu %lu.\n",
+		   space, page,
+		   ut_dulint_get_high(
+			mtr_read_dulint(descr + XDES_ID, MLOG_8BYTES, mtr)),
+		   ut_dulint_get_low(
+			mtr_read_dulint(descr + XDES_ID, MLOG_8BYTES, mtr)),
+		   ut_dulint_get_high(
+		     mtr_read_dulint(seg_inode + FSEG_ID, MLOG_8BYTES, mtr)),
+		   ut_dulint_get_low(
+		     mtr_read_dulint(seg_inode + FSEG_ID, MLOG_8BYTES, mtr)));
+*/
 	/* If we get here, the page is in some extent of the segment */	
-	ut_a(0 == ut_dulint_cmp(
+	if (0 != ut_dulint_cmp(
 		mtr_read_dulint(descr + XDES_ID, MLOG_8BYTES, mtr),
-		mtr_read_dulint(seg_inode + FSEG_ID, MLOG_8BYTES, mtr)));
+		mtr_read_dulint(seg_inode + FSEG_ID, MLOG_8BYTES, mtr))) {
+
+		ut_sprintf_buf(errbuf, descr, 40);
+		fprintf(stderr,
+"InnoDB: Dump of the tablespace extent descriptor: %s\n", errbuf);
+		ut_sprintf_buf(errbuf, seg_inode, 40);
+		fprintf(stderr,
+"InnoDB: Dump of the segment inode: %s\n", errbuf);
+
+	        fprintf(stderr,
+"InnoDB: Serious error: InnoDB is trying to free space %lu page %lu,\n"
+"InnoDB: which does not belong to segment %lu %lu but belongs\n"
+"InnoDB: to segment %lu %lu.\n",
+		   space, page,
+		   ut_dulint_get_high(
+			mtr_read_dulint(descr + XDES_ID, MLOG_8BYTES, mtr)),
+		   ut_dulint_get_low(
+			mtr_read_dulint(descr + XDES_ID, MLOG_8BYTES, mtr)),
+		   ut_dulint_get_high(
+		     mtr_read_dulint(seg_inode + FSEG_ID, MLOG_8BYTES, mtr)),
+		   ut_dulint_get_low(
+		     mtr_read_dulint(seg_inode + FSEG_ID, MLOG_8BYTES, mtr)));
+
+		fprintf(stderr,
+"InnoDB: If the InnoDB recovery crashes here, see section 6.1\n"
+"InnoDB: of http://www.innodb.com/ibman.html about forcing recovery.\n");
+		   ut_a(0);
+	}
 
 	not_full_n_used = mtr_read_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
 							MLOG_4BYTES, mtr);
diff --git a/innobase/include/fil0fil.h b/innobase/include/fil0fil.h
index 4f78fdb2fd7..ad3149f0b36 100644
--- a/innobase/include/fil0fil.h
+++ b/innobase/include/fil0fil.h
@@ -386,6 +386,14 @@ fil_space_release_free_extents(
 /*===========================*/
 	ulint	id,		/* in: space id */
 	ulint	n_reserved);	/* in: how many one reserved */
+/***********************************************************************
+Gets the number of reserved extents. If the database is silent, this number
+should be zero. */
+
+ulint
+fil_space_get_n_reserved_extents(
+/*=============================*/
+	ulint	id);		/* in: space id */
 
 typedef	struct fil_space_struct	fil_space_t;
 
diff --git a/innobase/include/log0log.h b/innobase/include/log0log.h
index e79e914e4f6..24ec28a56e6 100644
--- a/innobase/include/log0log.h
+++ b/innobase/include/log0log.h
@@ -513,6 +513,15 @@ log_print(
 /*======*/
 	char*	buf,	/* in/out: buffer where to print */
 	char*	buf_end);/* in: buffer end */
+/**********************************************************
+Peeks the current lsn. */
+
+ibool
+log_peek_lsn(
+/*=========*/
+			/* out: TRUE if success, FALSE if could not get the
+			log system mutex */
+	dulint*	lsn);	/* out: if returns TRUE, current lsn is here */
 /**************************************************************************
 Refreshes the statistics used to print per-second averages. */
 
@@ -785,6 +794,11 @@ struct log_struct{
 					called */
 
 	/* Fields involved in checkpoints */
+	ulint		log_group_capacity; /* capacity of the log group; if
+					the checkpoint age exceeds this, it is
+					a serious error because it is possible
+					we will then overwrite log and spoil
+					crash recovery */
 	ulint		max_modified_age_async;
 					/* when this recommended value for lsn
 					- buf_pool_get_oldest_modification()
diff --git a/innobase/include/log0recv.h b/innobase/include/log0recv.h
index 7418e4abf1b..e5a5bc05563 100644
--- a/innobase/include/log0recv.h
+++ b/innobase/include/log0recv.h
@@ -333,6 +333,8 @@ extern ibool		recv_recovery_on;
 extern ibool		recv_no_ibuf_operations;
 extern ibool		recv_needed_recovery;
 
+extern ibool            recv_lsn_checks_on;
+
 extern ibool		recv_is_making_a_backup;
 extern ulint		recv_max_parsed_page_no;
 
diff --git a/innobase/log/log0log.c b/innobase/log/log0log.c
index 53ca97baaa0..99827da5bf5 100644
--- a/innobase/log/log0log.c
+++ b/innobase/log/log0log.c
@@ -33,6 +33,11 @@ log_t*	log_sys	= NULL;
 ibool	log_do_write = TRUE;
 ibool	log_debug_writes = FALSE;
 
+/* These control how often we print warnings if the last checkpoint is too
+old */
+ibool	log_has_printed_chkp_warning = FALSE;
+time_t	log_last_warning_time;
+
 /* Pointer to this variable is used as the i/o-message when we do i/o to an
 archive */
 byte	log_archive_io;
@@ -299,6 +304,7 @@ log_close(void)
 	dulint	oldest_lsn;
 	dulint	lsn;
 	log_t*	log	= log_sys;
+	ulint	checkpoint_age;
 
 	ut_ad(mutex_own(&(log->mutex)));
 
@@ -322,8 +328,34 @@ log_close(void)
 		log->check_flush_or_checkpoint = TRUE;
 	}
 
-	if (ut_dulint_minus(lsn, log->last_checkpoint_lsn)
-					<= log->max_modified_age_async) {
+	checkpoint_age = ut_dulint_minus(lsn, log->last_checkpoint_lsn);
+
+	if (checkpoint_age >= log->log_group_capacity) {
+		/* TODO: split btr_store_big_rec_extern_fields() into small
+		steps so that we can release all latches in the middle, and
+		call log_free_check() to ensure we never write over log written
+		after the latest checkpoint. In principle, we should split all
+		big_rec operations, but other operations are smaller. */
+
+		if (!log_has_printed_chkp_warning
+		    || difftime(time(NULL), log_last_warning_time) > 15) {
+
+		        log_has_printed_chkp_warning = TRUE;
+			log_last_warning_time = time(NULL);
+			
+		        ut_print_timestamp(stderr);
+			fprintf(stderr,
+"  InnoDB: ERROR: the age of the last checkpoint is %lu,\n"
+"InnoDB: which exceeds the log group capacity %lu.\n"
+"InnoDB: If you are using big BLOB or TEXT rows, you must set the\n"
+"InnoDB: combined size of log files at least 10 times bigger than the\n"
+"InnoDB: largest such row.\n",
+			checkpoint_age, log->log_group_capacity);
+		}
+	}
+
+	if (checkpoint_age <= log->max_modified_age_async) {
+
 		goto function_exit;
 	}
 
@@ -332,8 +364,7 @@ log_close(void)
 	if (ut_dulint_is_zero(oldest_lsn)
 	    || (ut_dulint_minus(lsn, oldest_lsn)
 					> log->max_modified_age_async)
-	    || (ut_dulint_minus(lsn, log->last_checkpoint_lsn)
-					> log->max_checkpoint_age_async)) {
+	    || checkpoint_age > log->max_checkpoint_age_async) {
 
 		log->check_flush_or_checkpoint = TRUE;
 	}
@@ -551,7 +582,6 @@ log_calc_max_ages(void)
 			the database server */
 {
 	log_group_t*	group;
-	ulint		n_threads;
 	ulint		margin;
 	ulint		free;
 	ibool		success		= TRUE;
@@ -561,8 +591,6 @@ log_calc_max_ages(void)
 
 	ut_ad(!mutex_own(&(log_sys->mutex)));
 
-	n_threads = srv_get_n_threads();
-
 	mutex_enter(&(log_sys->mutex));
 
 	group = UT_LIST_GET_FIRST(log_sys->log_groups);
@@ -590,12 +618,15 @@ log_calc_max_ages(void)
 		group = UT_LIST_GET_NEXT(log_groups, group);
 	}
 	
+	/* Add extra safety */
+	smallest_capacity = smallest_capacity - smallest_capacity / 10;
+
 	/* For each OS thread we must reserve so much free space in the
 	smallest log group that it can accommodate the log entries produced
 	by single query steps: running out of free log space is a serious
 	system error which requires rebooting the database. */
 	
-	free = LOG_CHECKPOINT_FREE_PER_THREAD * n_threads
+	free = LOG_CHECKPOINT_FREE_PER_THREAD * (10 + srv_thread_concurrency)
 						+ LOG_CHECKPOINT_EXTRA_FREE;
 	if (free >= smallest_capacity / 2) {
 		success = FALSE;
@@ -607,6 +638,10 @@ log_calc_max_ages(void)
 
 	margin = ut_min(margin, log_sys->adm_checkpoint_interval);
 
+	margin = margin - margin / 10;	/* Add still some extra safety */
+
+	log_sys->log_group_capacity = smallest_capacity;
+
 	log_sys->max_modified_age_async = margin
 				- margin / LOG_POOL_PREFLUSH_RATIO_ASYNC;
 	log_sys->max_modified_age_sync = margin
@@ -626,7 +661,7 @@ failure:
 
 	if (!success) {
 		fprintf(stderr,
-	  "Error: log file group too small for the number of threads\n");
+"InnoDB: Error: log file group too small for innodb_thread_concurrency\n");
 	}
 
 	return(success);
@@ -3112,6 +3147,28 @@ log_check_log_recs(
 }
 
 /**********************************************************
+Peeks the current lsn. */
+
+ibool
+log_peek_lsn(
+/*=========*/
+			/* out: TRUE if success, FALSE if could not get the
+			log system mutex */
+	dulint*	lsn)	/* out: if returns TRUE, current lsn is here */
+{
+	if (0 == mutex_enter_nowait(&(log_sys->mutex), (char*)__FILE__,
+								__LINE__)) {
+	        *lsn = log_sys->lsn;
+
+		mutex_exit(&(log_sys->mutex));
+
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+/**********************************************************
 Prints info of the log. */
 
 void
diff --git a/innobase/log/log0recv.c b/innobase/log/log0recv.c
index 8e5fe819afb..9c9ad568d98 100644
--- a/innobase/log/log0recv.c
+++ b/innobase/log/log0recv.c
@@ -46,6 +46,8 @@ ibool		recv_recovery_from_backup_on = FALSE;
 
 ibool		recv_needed_recovery = FALSE;
 
+ibool		recv_lsn_checks_on = FALSE;
+
 /* If the following is TRUE, the buffer pool file pages must be invalidated
 after recovery and no ibuf operations are allowed; this becomes TRUE if
 the log record hash table becomes too full, and log records must be merged
@@ -71,6 +73,12 @@ ulint	recv_previous_parsed_rec_is_multi = 0;
 
 ulint	recv_max_parsed_page_no		= 0;
 
+/* The maximum lsn we see for a page during the recovery process. If this
+is bigger than the lsn we are able to scan up to, that is an indication that
+the recovery failed and the database may be corrupt. */
+
+dulint	recv_max_page_lsn;
+
 /************************************************************
 Creates the recovery system. */
 
@@ -132,6 +140,8 @@ recv_sys_init(
 						OS_FILE_LOG_BLOCK_SIZE);
 	recv_sys->found_corrupt_log = FALSE;
 
+	recv_max_page_lsn = ut_dulint_zero;
+
 	mutex_exit(&(recv_sys->mutex));
 }
 
@@ -1124,6 +1134,10 @@ recv_recover_page(
 
 	mutex_enter(&(recv_sys->mutex));
 	
+	if (ut_dulint_cmp(recv_max_page_lsn, page_lsn) < 0) {
+		recv_max_page_lsn = page_lsn;
+	}
+
 	recv_addr->state = RECV_PROCESSED;
 
 	ut_a(recv_sys->n_addrs);
@@ -2192,9 +2206,12 @@ recv_scan_log_recs(
 	while (log_block < buf + len && !finished) {
 
 		no = log_block_get_hdr_no(log_block);
+/*
+		fprintf(stderr, "Log block header no %lu\n", no);
 
-		/* fprintf(stderr, "Log block header no %lu\n", no); */
-
+		fprintf(stderr, "Scanned lsn no %lu\n",
+				log_block_convert_lsn_to_no(scanned_lsn));
+*/
 		if (no != log_block_convert_lsn_to_no(scanned_lsn)
 		    || !log_block_checksum_is_ok_or_old_format(log_block)) {
 
@@ -2586,7 +2603,6 @@ recv_recovery_from_checkpoint_start(
 
 		recv_group_scan_log_recs(group, &contiguous_lsn,
 							&group_scanned_lsn);
-
 		group->scanned_lsn = group_scanned_lsn;
 		
 		if (ut_dulint_cmp(old_scanned_lsn, group_scanned_lsn) < 0) {
@@ -2603,6 +2619,31 @@ recv_recovery_from_checkpoint_start(
 		group = UT_LIST_GET_NEXT(log_groups, group);
 	}
 
+	/* We currently have only one log group */
+	if (ut_dulint_cmp(group_scanned_lsn, checkpoint_lsn) < 0) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+"  InnoDB: ERROR: We were only able to scan the log up to\n"
+"InnoDB: %lu %lu, but a checkpoint was at %lu %lu.\n"
+"InnoDB: It is possible that the database is now corrupt!\n",
+			 ut_dulint_get_high(group_scanned_lsn),
+			 ut_dulint_get_low(group_scanned_lsn),
+			 ut_dulint_get_high(checkpoint_lsn),
+			 ut_dulint_get_low(checkpoint_lsn));
+	}
+
+	if (ut_dulint_cmp(group_scanned_lsn, recv_max_page_lsn) < 0) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+"  InnoDB: ERROR: We were only able to scan the log up to %lu %lu\n"
+"InnoDB: but a database page a had an lsn %lu %lu. It is possible that the\n"
+"InnoDB: database is now corrupt!\n",
+			 ut_dulint_get_high(group_scanned_lsn),
+			 ut_dulint_get_low(group_scanned_lsn),
+			 ut_dulint_get_high(recv_max_page_lsn),
+			 ut_dulint_get_low(recv_max_page_lsn));
+	}
+
 	if (ut_dulint_cmp(recv_sys->recovered_lsn, checkpoint_lsn) < 0) {
 
 		mutex_exit(&(log_sys->mutex));
@@ -2656,6 +2697,8 @@ recv_recovery_from_checkpoint_start(
 
 	sync_order_checks_on = FALSE;
 
+	recv_lsn_checks_on = TRUE;
+
 	/* The database is now ready to start almost normal processing of user
 	transactions: transaction rollbacks and the application of the log
 	records in the hash table can be run in background. */
diff --git a/innobase/os/os0file.c b/innobase/os/os0file.c
index a9a64ae7864..7936b06c24d 100644
--- a/innobase/os/os0file.c
+++ b/innobase/os/os0file.c
@@ -1030,6 +1030,7 @@ os_file_flush(
 #ifdef HAVE_FDATASYNC
 	ret = fdatasync(file);
 #else
+/*	printf("Flushing to file %lu\n", (ulint)file); */
 	ret = fsync(file);
 #endif
 	os_n_fsyncs++;
diff --git a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c
index f81b2d0a60e..f31d275eff0 100644
--- a/innobase/srv/srv0srv.c
+++ b/innobase/srv/srv0srv.c
@@ -1798,8 +1798,12 @@ retry:
 
 		os_fast_mutex_unlock(&srv_conc_mutex);
 
+		trx->op_info = (char*)"sleeping before joining InnoDB queue";
+
 		os_thread_sleep(50000);
 
+		trx->op_info = (char*)"";
+
 		os_fast_mutex_lock(&srv_conc_mutex);
 
 		srv_conc_n_waiting_threads--;
@@ -2299,6 +2303,7 @@ srv_sprintf_innodb_monitor(
 	char*	buf_end	= buf + len - 2000;
 	double	time_elapsed;
 	time_t	current_time;
+	ulint	n_reserved;
 
 	mutex_enter(&srv_innodb_monitor_mutex);
 
@@ -2414,8 +2419,16 @@ srv_sprintf_innodb_monitor(
 		       "ROW OPERATIONS\n"
 		       "--------------\n");
 	buf += sprintf(buf,
-	"%ld queries inside InnoDB, %ld queries in queue\n",
+	"%ld queries inside InnoDB, %lu queries in queue\n",
 			srv_conc_n_threads, srv_conc_n_waiting_threads);
+
+	n_reserved = fil_space_get_n_reserved_extents(0);
+	if (n_reserved > 0) {
+	        buf += sprintf(buf,
+	"%lu tablespace extents now reserved for B-tree split operations\n",
+						    n_reserved);
+	}
+
 #ifdef UNIV_LINUX
 	buf += sprintf(buf,
 	"Main thread process no. %lu, id %lu, state: %s\n",
diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c
index e8d6c093680..47c39290b15 100644
--- a/innobase/trx/trx0trx.c
+++ b/innobase/trx/trx0trx.c
@@ -1624,6 +1624,13 @@ trx_print(
 	}
 
 	buf += sprintf(buf, "\n");
+
+	if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
+
+		buf += sprintf(buf, "mysql tables in use %lu, locked %lu\n",
+				    trx->n_mysql_tables_in_use,
+				    trx->mysql_n_tables_locked);
+	}
   	
 	start_of_line = buf;