1 files changed, 1326 insertions, 0 deletions
diff --git a/innobase/fil/fil0fil.c b/innobase/fil/fil0fil.c
new file mode 100644
index 00000000000..dcb9698aa27
--- /dev/null
+++ b/innobase/fil/fil0fil.c
@@ -0,0 +1,1326 @@
+/******************************************************
+The low-level file system
+
+(c) 1995 Innobase Oy
+
+Created 10/25/1995 Heikki Tuuri
+*******************************************************/
+
+#include "fil0fil.h"
+
+#include "mem0mem.h"
+#include "sync0sync.h"
+#include "hash0hash.h"
+#include "os0file.h"
+#include "os0sync.h"
+#include "mach0data.h"
+#include "ibuf0ibuf.h"
+#include "buf0buf.h"
+#include "log0log.h"
+#include "log0recv.h"
+#include "fsp0fsp.h"
+
+/*
+		IMPLEMENTATION OF THE LOW-LEVEL FILE SYSTEM
+		===========================================
+
+The file system is responsible for providing fast read/write access to
+tablespaces and logs of the database. File creation and deletion is done
+in other modules which know more of the logic of the operation, however.
+
+A tablespace consists of a chain of files. The size of the files does not
+have to be divisible by the database block size, because we may just leave
+the last incomplete block unused. When a new file is appended to the
+tablespace, the maximum size of the file is also specified. At the moment,
+we think that it is best to extend the file to its maximum size already at
+the creation of the file, because then we can avoid dynamically extending
+the file when more space is needed for the tablespace.
+
+A block's position in the tablespace is specified with a 32-bit unsigned
+integer. The files in the chain are thought to be catenated, and the block
+corresponding to an address n is the nth block in the catenated file (where
+the first block is named the 0th block, and the incomplete block fragments
+at the end of files are not taken into account). A tablespace can be extended
+by appending a new file at the end of the chain.
+
+Our tablespace concept is similar to the one of Oracle.
+
+To acquire more speed in disk transfers, a technique called disk striping is
+sometimes used. This means that logical block addresses are divided in a
+round-robin fashion across several disks. Windows NT supports disk striping,
+so there we do not need to support it in the database. Disk striping is
+implemented in hardware in RAID disks. We conclude that it is not necessary
+to implement it in the database. Oracle 7 does not support disk striping,
+either.
+
+Another trick used at some database sites is replacing tablespace files by
+raw disks, that is, the whole physical disk drive, or a partition of it, is
+opened as a single file, and it is accessed through byte offsets calculated
+from the start of the disk or the partition. This is recommended in some
+books on database tuning to achieve more speed in i/o. Using raw disk
+certainly prevents the OS from fragmenting disk space, but it is not clear
+if it really adds speed. We measured on the Pentium 100 MHz + NT + NTFS file
+system + EIDE Conner disk only a negligible difference in speed when reading
+from a file, versus reading from a raw disk. 
+
+To have fast access to a tablespace or a log file, we put the data structures
+in a hash table. Each tablespace and log file is given a unique 32-bit
+identifier.
+
+Some operating systems do not support many open files at the same time,
+though NT seems to tolerate at least 900 open files. Therefore, we put the
+open files in an LRU-list. If we need to open another file, we may close the
+file at the end of the LRU-list. When an i/o-operation is pending on a file,
+the file cannot be closed. We take the file nodes with pending i/o-operations
+out of the LRU-list and keep a count of pending operations. When an operation
+completes, we decrement the count and return the file node to the LRU-list if
+the count drops to zero. */
+
+/* Null file address: its first member is FIL_NULL and its second member 0 */
+fil_addr_t	fil_addr_null = {FIL_NULL, 0};
+
+/* File system file node data structure: describes one file in the chain of
+files of a space */
+typedef	struct fil_node_struct	fil_node_t;
+struct fil_node_struct {
+	char*		name;	/* the file name or path */
+	ibool		open;	/* TRUE if file open */
+	os_file_t	handle;	/* OS handle to the file, if file open */
+	ulint		size;	/* size of the file in database blocks
+				(where the possible last incomplete block
+				is ignored) */
+	ulint		n_pending;
+				/* count of pending i/o-ops on this file */
+	UT_LIST_NODE_T(fil_node_t) chain;
+				/* link field for the file chain */
+	UT_LIST_NODE_T(fil_node_t) LRU;
+				/* link field for the LRU list */
+	ulint		magic_n;/* set to FIL_NODE_MAGIC_N in fil_node_create
+				and asserted in fil_node_free */
+};
+
+#define	FIL_NODE_MAGIC_N	89389	/* value of magic_n in a fil_node_t */
+
+/* File system tablespace or log data structure: let us call them by a common
+name space */
+struct fil_space_struct {
+	char*		name;	/* space name */
+	ulint		id;	/* space id */
+	ulint		purpose;/* FIL_TABLESPACE, FIL_LOG, or FIL_ARCH_LOG */
+	UT_LIST_BASE_NODE_T(fil_node_t) chain;
+				/* base node for the file chain */
+	ulint		size;	/* space size in pages: the sum of the sizes
+				of the files in the chain */
+	ulint		n_reserved_extents;
+				/* number of reserved free extents for
+				ongoing operations like B-tree page split */
+	hash_node_t	hash; 	/* hash chain node */
+	rw_lock_t	latch;	/* latch protecting the file space storage
+				allocation */
+	UT_LIST_NODE_T(fil_space_t) space_list;
+				/* list of all spaces */
+	ibuf_data_t*	ibuf_data;
+				/* insert buffer data: NULL after
+				fil_space_create; set for tablespaces in
+				fil_ibuf_init_at_db_start */
+	ulint		magic_n;/* set to FIL_SPACE_MAGIC_N in
+				fil_space_create and asserted in
+				fil_space_free */
+};
+
+#define	FIL_SPACE_MAGIC_N	89472	/* value of magic_n in a fil_space_t */
+
+/* The file system data structure */
+
+typedef	struct fil_system_struct	fil_system_t;
+struct fil_system_struct {
+	mutex_t		mutex;		/* The mutex protecting the system */
+	hash_table_t*	spaces;		/* The hash table of spaces in the
+					system, keyed by space id */	
+	UT_LIST_BASE_NODE_T(fil_node_t) LRU;
+					/* base node for the LRU list of the
+					most recently used open files; only
+					open files with no pending i/o are in
+					the list: fil_node_complete_io adds a
+					node to the start and
+					fil_node_prepare_for_io closes the
+					file at the end when room is needed */
+	ulint		n_open_pending;	/* current number of open files with
+					pending i/o-ops on them */
+	ulint		max_n_open;	/* maximum allowed open files; lowered
+					by fil_reserve_right_to_open and
+					raised again by
+					fil_release_right_to_open */
+	os_event_t	can_open;	/* this event is set to the signaled
+					state when the system is capable of
+					opening a new file, i.e.,
+					n_open_pending < max_n_open */
+	UT_LIST_BASE_NODE_T(fil_space_t) space_list;
+					/* list of all file spaces */
+};
+
+/* The file system. This variable is NULL before the module is initialized
+with fil_init. */
+fil_system_t*	fil_system	= NULL;
+
+/* The file system hash table size: fil_init passes this to
+fil_system_create, which creates the hash table of spaces with it */
+#define	FIL_SYSTEM_HASH_SIZE	500
+
+
+/***********************************************************************
+Reserves for the caller a right to open a single file: waits as long as
+n_open_pending equals max_n_open and then lowers max_n_open by one. The
+right must be given back with fil_release_right_to_open. */
+
+void
+fil_reserve_right_to_open(void)
+/*===========================*/
+{
+	fil_system_t*	system	= fil_system;
+
+	for (;;) {
+		mutex_enter(&(system->mutex));
+
+		if (system->n_open_pending != system->max_n_open) {
+			/* There is room: leave the loop with the mutex
+			still held */
+
+			break;
+		}
+
+		/* It is not sure we can open the file if it is closed:
+		sleep until can_open is signaled and then check again */
+
+		os_event_reset(system->can_open);
+
+		mutex_exit(&(system->mutex));
+
+		os_event_wait(system->can_open);
+	}
+
+	system->max_n_open--;
+
+	mutex_exit(&(system->mutex));
+}
+
+/***********************************************************************
+Gives back a right to open a file which was reserved with
+fil_reserve_right_to_open: raises max_n_open by one. If n_open_pending
+equals max_n_open just before the raise, the can_open event is set. */
+
+void
+fil_release_right_to_open(void)
+/*===========================*/
+{
+	fil_system_t*	system	= fil_system;
+
+	mutex_enter(&(system->mutex));
+
+	/* Waiters sleep on can_open while the two counters are equal: if
+	that is the state now, the increment below ends it */
+
+	if (system->max_n_open == system->n_open_pending) {
+
+		os_event_set(system->can_open);
+	}
+
+	system->max_n_open++;
+
+	mutex_exit(&(system->mutex));
+}
+
+/***********************************************************************
+Returns the latch of a file space. It is an assertion failure if no space
+with the given id exists. */
+
+rw_lock_t*
+fil_space_get_latch(
+/*================*/
+			/* out: latch protecting storage allocation */
+	ulint	id)	/* in: space id */
+{
+	fil_space_t*	space;
+	fil_system_t*	system		= fil_system;
+
+	ut_ad(system);
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	/* Fail with an assertion, as fil_io and fil_flush do, instead of
+	dereferencing a NULL pointer when the id is unknown */
+	ut_a(space);
+
+	mutex_exit(&(system->mutex));
+
+	return(&(space->latch));
+}
+
+/***********************************************************************
+Returns the type of a file space. It is an assertion failure if no space
+with the given id exists. */
+
+ulint
+fil_space_get_type(
+/*===============*/
+			/* out: the purpose field of the space:
+			FIL_TABLESPACE or FIL_LOG */
+	ulint	id)	/* in: space id */
+{
+	fil_space_t*	space;
+	fil_system_t*	system		= fil_system;
+	ulint		purpose;
+
+	ut_ad(system);
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	/* Fail with an assertion, as fil_io and fil_flush do, instead of
+	dereferencing a NULL pointer when the id is unknown */
+	ut_a(space);
+
+	/* Read the field while the mutex is still held, in the same way
+	as fil_space_get_size reads the size */
+	purpose = space->purpose;
+
+	mutex_exit(&(system->mutex));
+
+	return(purpose);
+}
+
+/***********************************************************************
+Returns the ibuf data of a file space. It is an assertion failure if no
+space with the given id exists. */
+
+ibuf_data_t*
+fil_space_get_ibuf_data(
+/*====================*/
+			/* out: ibuf data for this space; NULL if it
+			has not been set for the space */
+	ulint	id)	/* in: space id */
+{
+	fil_space_t*	space;
+	fil_system_t*	system	= fil_system;
+	ibuf_data_t*	ibuf_data;
+
+	ut_ad(system);
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	/* Fail with an assertion, as fil_io and fil_flush do, instead of
+	dereferencing a NULL pointer when the id is unknown */
+	ut_a(space);
+
+	/* Read the field while the mutex is still held, in the same way
+	as fil_space_get_size reads the size */
+	ibuf_data = space->ibuf_data;
+
+	mutex_exit(&(system->mutex));
+
+	return(ibuf_data);
+}
+
+/***********************************************************************
+Appends a new file to the chain of files of a space. File must be closed.
+The name is copied to memory owned by the new node. It is an assertion
+failure if no space with the given id exists. */
+
+void
+fil_node_create(
+/*============*/
+	char*	name,	/* in: file name (file must be closed) */
+	ulint	size,	/* in: file size in database blocks, rounded downwards
+			to an integer */
+	ulint	id)	/* in: space id where to append */
+{
+	fil_node_t*	node;
+	fil_space_t*	space;
+	char*		name2;
+	fil_system_t*	system		= fil_system;
+
+	ut_a(system);
+	ut_a(name);
+	ut_a(size > 0);
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	/* Fail with an assertion, as fil_io and fil_flush do, instead of
+	dereferencing a NULL pointer when the id is unknown */
+	ut_a(space);
+
+	node = mem_alloc(sizeof(fil_node_t));
+
+	name2 = mem_alloc(ut_strlen(name) + 1);
+
+	ut_strcpy(name2, name);
+
+	node->name = name2;
+	node->open = FALSE;
+	node->size = size;
+	node->magic_n = FIL_NODE_MAGIC_N;
+	node->n_pending = 0;
+
+	/* The space grows by the size of the appended file */
+	space->size += size;
+
+	UT_LIST_ADD_LAST(chain, space->chain, node);
+
+	mutex_exit(&(system->mutex));
+}
+
+/**************************************************************************
+Closes the file of a node. The file must be open and must not have pending
+i/o; the caller must own the file system mutex. */
+static
+void
+fil_node_close(
+/*===========*/
+	fil_node_t*	node,	/* in: file node */
+	fil_system_t*	system)	/* in: file system */
+{
+	ibool	closed;
+
+	ut_ad(node && system);
+	ut_ad(mutex_own(&(system->mutex)));
+	ut_a(node->open);
+	ut_a(node->n_pending == 0);
+
+	closed = os_file_close(node->handle);
+	ut_a(closed);
+
+	/* The node is in the LRU list: take it out and mark it closed */
+	UT_LIST_REMOVE(LRU, system->LRU, node);
+
+	node->open = FALSE;
+}
+
+/***********************************************************************
+Removes a file node from the chain of its space and frees the node object
+and its name. An open file is closed first. The caller must own the file
+system mutex. */
+static
+void
+fil_node_free(
+/*==========*/
+	fil_node_t*	node,	/* in, own: file node */
+	fil_system_t*	system,	/* in: file system */
+	fil_space_t*	space)	/* in: space where the file node is chained */
+{
+	char*	name;
+
+	ut_ad(node && system && space);
+	ut_ad(mutex_own(&(system->mutex)));
+	ut_a(node->magic_n == FIL_NODE_MAGIC_N);
+
+	if (node->open) {
+		fil_node_close(node, system);
+	}
+
+	/* Unlink the node and shrink the space by the size of the file */
+	UT_LIST_REMOVE(chain, space->chain, node);
+
+	space->size -= node->size;
+
+	name = node->name;
+
+	mem_free(name);
+	mem_free(node);
+}
+
+/********************************************************************
+Drops files from the start of a file space, so that its size is cut by
+the amount given. The amount is compared against file sizes multiplied by
+UNIV_PAGE_SIZE. It is an assertion failure if the amount does not end
+exactly at a file boundary or if the space does not exist. */
+
+void
+fil_space_truncate_start(
+/*=====================*/
+	ulint	id,		/* in: space id */
+	ulint	trunc_len)	/* in: truncate by this much; it is an error
+				if this does not equal to the combined size of
+				some initial files in the space */
+{
+	fil_node_t*	node;
+	fil_space_t*	space;
+	fil_system_t*	system		= fil_system;
+	ulint		node_len;
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	ut_a(space);
+
+	while (trunc_len > 0) {
+
+		node = UT_LIST_GET_FIRST(space->chain);
+
+		/* Running out of files means that trunc_len was bigger
+		than the whole space */
+		ut_a(node);
+
+		node_len = node->size * UNIV_PAGE_SIZE;
+
+		/* The remaining length must cover this whole file: the
+		original test was the other way round, which failed for any
+		truncation spanning several files and let the unsigned
+		subtraction below wrap around for a too small trunc_len */
+		ut_a(trunc_len >= node_len);
+
+		trunc_len -= node_len;
+
+		fil_node_free(node, system, space);
+	}
+
+	mutex_exit(&(system->mutex));
+}
+
+/********************************************************************
+Allocates a file system object and initializes its mutex, hash table of
+spaces, lists, counters and the can_open event. */
+static
+fil_system_t*
+fil_system_create(
+/*==============*/
+				/* out, own: file system object */
+	ulint	hash_size,	/* in: hash table size */
+	ulint	max_n_open)	/* in: maximum number of open files */
+{
+	fil_system_t*	new_system;
+
+	ut_a(hash_size > 0);
+	ut_a(max_n_open > 0);
+
+	new_system = mem_alloc(sizeof(fil_system_t));
+
+	/* Synchronization objects */
+	mutex_create(&(new_system->mutex));
+	mutex_set_level(&(new_system->mutex), SYNC_ANY_LATCH);
+
+	new_system->can_open = os_event_create(NULL);
+
+	/* Containers: all start empty */
+	new_system->spaces = hash_create(hash_size);
+
+	UT_LIST_INIT(new_system->LRU);
+	UT_LIST_INIT(new_system->space_list);
+
+	/* Open file accounting */
+	new_system->max_n_open = max_n_open;
+	new_system->n_open_pending = 0;
+
+	return(new_system);
+}
+
+/********************************************************************
+Creates the file system object of this module and stores it to fil_system.
+May be called only while fil_system is still NULL. */
+
+void
+fil_init(
+/*=====*/
+	ulint	max_n_open)	/* in: max number of open files */
+{
+	fil_system_t*	new_system;
+
+	ut_a(fil_system == NULL);
+
+	new_system = fil_system_create(FIL_SYSTEM_HASH_SIZE, max_n_open);
+
+	fil_system = new_system;
+}
+
+/********************************************************************
+Initializes the ibuf data of each space whose purpose is FIL_TABLESPACE:
+stores the result of ibuf_data_init_for_space to the ibuf_data field of
+the space. The space list is walked without taking the file system
+mutex. */
+
+void
+fil_ibuf_init_at_db_start(void)
+/*===========================*/
+{
+	fil_space_t*	space;
+
+	for (space = UT_LIST_GET_FIRST(fil_system->space_list);
+	     space;
+	     space = UT_LIST_GET_NEXT(space_list, space)) {
+
+		if (space->purpose != FIL_TABLESPACE) {
+			/* Other kinds of spaces get no ibuf data */
+
+			continue;
+		}
+
+		space->ibuf_data = ibuf_data_init_for_space(space->id);
+	}
+}
+
+/********************************************************************
+Writes the flushed lsn and the latest archived log number to the page
+header of the first page of a data file. The page is read to a temporary
+buffer, the two fields are updated in the buffer and the page is written
+back; both i/os are synchronous. */
+static
+ulint
+fil_write_lsn_and_arch_no_to_file(
+/*==============================*/
+				/* out: DB_SUCCESS */
+	ulint	space_id,	/* in: space number */
+	ulint	sum_of_sizes,	/* in: combined size of previous files in space,
+				in database pages */
+	dulint	lsn,		/* in: lsn to write */
+	ulint	arch_log_no)	/* in: archived log number to write */
+{
+	byte*	buf1;
+	byte*	buf;
+
+	/* Allocate two pages so that a page-aligned buffer of one page
+	fits inside the allocation */
+	buf1 = mem_alloc(2 * UNIV_PAGE_SIZE);
+	buf = ut_align(buf1, UNIV_PAGE_SIZE);
+
+	fil_read(TRUE, space_id, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL);
+
+	mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn);
+	mach_write_to_4(buf + FIL_PAGE_ARCH_LOG_NO, arch_log_no);
+
+	fil_write(TRUE, space_id, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL);
+
+	/* Both i/os above were synchronous, so the buffer is no longer in
+	use: free the original allocation, which was leaked before */
+	mem_free(buf1);
+
+	return(DB_SUCCESS);
+}
+
+/********************************************************************
+Writes the flushed lsn and the latest archived log number to the page
+header of the first page of each data file, that is, of each file in each
+space whose purpose is FIL_TABLESPACE. The file system mutex is released
+for the duration of each write. */
+
+ulint
+fil_write_flushed_lsn_to_data_files(
+/*================================*/
+				/* out: DB_SUCCESS or error number */
+	dulint	lsn,		/* in: lsn to write */
+	ulint	arch_log_no)	/* in: latest archived log file number */
+{
+	fil_space_t*	space;
+	fil_node_t*	node;
+	ulint		sum_of_sizes;
+	ulint		err;
+
+	mutex_enter(&(fil_system->mutex));
+
+	space = UT_LIST_GET_FIRST(fil_system->space_list);
+
+	while (space) {
+		if (space->purpose == FIL_TABLESPACE) {
+			/* The first page of a file is at the page offset
+			which is the sum of the sizes of the files before
+			it in the chain */
+			sum_of_sizes = 0;
+
+			node = UT_LIST_GET_FIRST(space->chain);
+
+			while (node) {
+				/* The write goes through fil_io, which
+				takes the mutex itself */
+				mutex_exit(&(fil_system->mutex));
+
+				err = fil_write_lsn_and_arch_no_to_file(
+							space->id,
+							sum_of_sizes,
+							lsn, arch_log_no);
+				if (err != DB_SUCCESS) {
+					/* The mutex is not held here */
+
+					return(err);
+				}
+
+				mutex_enter(&(fil_system->mutex));
+
+				sum_of_sizes += node->size;
+
+				node = UT_LIST_GET_NEXT(chain, node);
+			}
+		}
+
+		space = UT_LIST_GET_NEXT(space_list, space);
+	}
+
+	mutex_exit(&(fil_system->mutex));
+
+	/* The original fell off the end of this non-void function here,
+	leaving the value seen by the caller undefined */
+	return(DB_SUCCESS);
+}
+
+/***********************************************************************
+Reads the flushed lsn and arch no fields from the first page of a data file
+at database startup and folds them into running minimum and maximum
+values. */
+
+void
+fil_read_flushed_lsn_and_arch_log_no(
+/*=================================*/
+	os_file_t data_file,		/* in: open data file */
+	ibool	one_read_already,	/* in: TRUE if min and max parameters
+					below already contain sensible data */
+	dulint*	min_flushed_lsn,	/* in/out: */
+	ulint*	min_arch_log_no,	/* in/out: */
+	dulint*	max_flushed_lsn,	/* in/out: */
+	ulint*	max_arch_log_no)	/* in/out: */
+{
+	byte*	page;
+	dulint	lsn_in_file;
+	ulint	arch_no_in_file;
+
+	/* Read the first page of the file and pick the two fields */
+	page = ut_malloc(UNIV_PAGE_SIZE);
+
+	os_file_read(data_file, page, 0, 0, UNIV_PAGE_SIZE);
+
+	lsn_in_file = mach_read_from_8(page + FIL_PAGE_FILE_FLUSH_LSN);
+	arch_no_in_file = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO);
+
+	ut_free(page);
+
+	if (one_read_already) {
+		/* Widen the ranges accumulated from earlier files */
+
+		if (ut_dulint_cmp(*min_flushed_lsn, lsn_in_file) > 0) {
+			*min_flushed_lsn = lsn_in_file;
+		}
+		if (ut_dulint_cmp(*max_flushed_lsn, lsn_in_file) < 0) {
+			*max_flushed_lsn = lsn_in_file;
+		}
+		if (*min_arch_log_no > arch_no_in_file) {
+			*min_arch_log_no = arch_no_in_file;
+		}
+		if (*max_arch_log_no < arch_no_in_file) {
+			*max_arch_log_no = arch_no_in_file;
+		}
+	} else {
+		/* First file: its values are both the min and the max */
+
+		*min_flushed_lsn = lsn_in_file;
+		*max_flushed_lsn = lsn_in_file;
+		*min_arch_log_no = arch_no_in_file;
+		*max_arch_log_no = arch_no_in_file;
+	}
+}
+
+/***********************************************************************
+Allocates a space object with an empty file chain, and adds it to the hash
+table and to the list of all spaces of the file system. The name is copied
+to memory owned by the space. */
+
+void
+fil_space_create(
+/*=============*/
+	char*	name,	/* in: space name */
+	ulint	id,	/* in: space id */
+	ulint	purpose)/* in: FIL_TABLESPACE, or FIL_LOG if log */
+{
+	fil_system_t*	system = fil_system;
+	fil_space_t*	new_space;
+	char*		name_copy;
+
+	ut_a(system);
+	ut_a(name);
+
+#ifndef UNIV_BASIC_LOG_DEBUG
+	/* Spaces with an odd id number are reserved to replicate spaces
+	used in log debugging */
+
+	ut_a((id % 2 == 0) || (purpose == FIL_LOG));
+#endif
+	mutex_enter(&(system->mutex));
+
+	new_space = mem_alloc(sizeof(fil_space_t));
+
+	name_copy = mem_alloc(ut_strlen(name) + 1);
+	ut_strcpy(name_copy, name);
+
+	/* Identity */
+	new_space->magic_n = FIL_SPACE_MAGIC_N;
+	new_space->name = name_copy;
+	new_space->id = id;
+	new_space->purpose = purpose;
+
+	/* No files, no reserved extents and no ibuf data yet */
+	UT_LIST_INIT(new_space->chain);
+	new_space->size = 0;
+	new_space->n_reserved_extents = 0;
+	new_space->ibuf_data = NULL;
+
+	rw_lock_create(&(new_space->latch));
+	rw_lock_set_level(&(new_space->latch), SYNC_FSP);
+
+	/* Publish the space in the system */
+	HASH_INSERT(fil_space_t, hash, system->spaces, id, new_space);
+
+	UT_LIST_ADD_LAST(space_list, system->space_list, new_space);
+
+	mutex_exit(&(system->mutex));
+}
+
+/***********************************************************************
+Frees a space object from a file system. Closes the files in the chain
+but does not delete them. It is an assertion failure if no space with the
+given id exists. */
+
+void
+fil_space_free(
+/*===========*/
+	ulint	id)	/* in: space id */
+{
+	fil_space_t*	space;
+	fil_node_t*	fil_node;
+	fil_system_t*	system 		= fil_system;
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	/* Validate the object before touching it: an unknown id would
+	otherwise lead to a NULL dereference in the unlinking below */
+	ut_a(space);
+	ut_a(space->magic_n == FIL_SPACE_MAGIC_N);
+
+	HASH_DELETE(fil_space_t, hash, system->spaces, id, space);
+
+	UT_LIST_REMOVE(space_list, system->space_list, space);
+
+	fil_node = UT_LIST_GET_FIRST(space->chain);
+
+	ut_d(UT_LIST_VALIDATE(chain, fil_node_t, space->chain));
+
+	/* fil_node_free removes the node from the chain, so the first
+	node is fetched again on each round until the chain is empty */
+	while (fil_node != NULL) {
+		fil_node_free(fil_node, system, space);
+
+		fil_node = UT_LIST_GET_FIRST(space->chain);
+	}
+
+	ut_d(UT_LIST_VALIDATE(chain, fil_node_t, space->chain));
+	ut_ad(0 == UT_LIST_GET_LEN(space->chain));
+
+	mutex_exit(&(system->mutex));
+
+	/* The space is no longer reachable through the system */
+	mem_free(space->name);
+	mem_free(space);
+}
+
+/***********************************************************************
+Returns the size of the space in pages. It is an assertion failure if no
+space with the given id exists. */
+
+ulint
+fil_space_get_size(
+/*===============*/
+			/* out: space size */
+	ulint	id)	/* in: space id */
+{
+	fil_space_t*	space;
+	fil_system_t*	system		= fil_system;
+	ulint		size;
+
+	ut_ad(system);
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	/* Fail with an assertion, as fil_io and fil_flush do, instead of
+	dereferencing a NULL pointer when the id is unknown */
+	ut_a(space);
+
+	size = space->size;
+
+	mutex_exit(&(system->mutex));
+
+	return(size);
+}
+
+/***********************************************************************
+Tries to reserve free extents in a file space. The reservation succeeds
+if the extents already reserved plus the requested ones do not exceed
+n_free_now. It is an assertion failure if no space with the given id
+exists. */
+
+ibool
+fil_space_reserve_free_extents(
+/*===========================*/
+				/* out: TRUE if succeed */
+	ulint	id,		/* in: space id */
+	ulint	n_free_now,	/* in: number of free extents now */
+	ulint	n_to_reserve)	/* in: how many one wants to reserve */
+{
+	fil_space_t*	space;
+	fil_system_t*	system		= fil_system;
+	ibool		success;
+
+	ut_ad(system);
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	/* Fail with an assertion, as fil_io and fil_flush do, instead of
+	dereferencing a NULL pointer when the id is unknown */
+	ut_a(space);
+
+	if (space->n_reserved_extents + n_to_reserve > n_free_now) {
+		success = FALSE;
+	} else {
+		space->n_reserved_extents += n_to_reserve;
+		success = TRUE;
+	}
+
+	mutex_exit(&(system->mutex));
+
+	return(success);
+}
+
+/***********************************************************************
+Releases free extents in a file space: subtracts n_reserved from the count
+of reserved extents. It is an assertion failure if no space with the given
+id exists or if more is released than is reserved. */
+
+void
+fil_space_release_free_extents(
+/*===========================*/
+	ulint	id,		/* in: space id */
+	ulint	n_reserved)	/* in: how many one reserved */
+{
+	fil_space_t*	space;
+	fil_system_t*	system		= fil_system;
+
+	ut_ad(system);
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	/* Fail with an assertion, as fil_io and fil_flush do, instead of
+	dereferencing a NULL pointer when the id is unknown */
+	ut_a(space);
+
+	ut_a(space->n_reserved_extents >= n_reserved);
+
+	space->n_reserved_extents -= n_reserved;
+
+	mutex_exit(&(system->mutex));
+}
+
+/************************************************************************
+Prepares a file node for i/o. Opens the file if it is closed. Updates the
+pending i/o's field in the node and the system appropriately. Takes the node
+off the LRU list if it is in the LRU list. The caller must own the file
+system mutex. The space parameter is used only in a debug assertion. */
+static
+void
+fil_node_prepare_for_io(
+/*====================*/
+	fil_node_t*	node,	/* in: file node */
+	fil_system_t*	system,	/* in: file system */
+	fil_space_t*	space)	/* in: space */
+{
+	ibool		ret;
+	fil_node_t*	last_node;
+
+	ut_ad(node && system && space);
+	ut_ad(mutex_own(&(system->mutex)));
+	
+	if (node->open == FALSE) {
+		/* File is closed */
+		ut_a(node->n_pending == 0);
+
+		/* If too many files are open, close one: the open files are
+		those with pending i/o plus those in the LRU list, and only
+		a file in the LRU list can be closed */
+
+		if (system->n_open_pending + UT_LIST_GET_LEN(system->LRU)
+						== system->max_n_open) {
+
+		    	ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
+
+			last_node = UT_LIST_GET_LAST(system->LRU);
+
+			fil_node_close(last_node, system);
+		}
+
+		node->handle = os_file_create(node->name, OS_FILE_OPEN,
+							OS_FILE_AIO, &ret);
+		ut_a(ret);
+		
+		node->open = TRUE;
+
+		system->n_open_pending++;
+		node->n_pending = 1;
+
+		/* File was closed: the node was not in the LRU list */
+
+		return;
+	}
+
+	/* File is open */
+	if (node->n_pending == 0) {
+		/* The node is in the LRU list, remove it: this is the first
+		pending i/o on the file, so the file now also counts in
+		n_open_pending */
+
+		UT_LIST_REMOVE(LRU, system->LRU, node);
+
+		system->n_open_pending++;
+		node->n_pending = 1;
+	} else {
+		/* There is already a pending i/o-op on the file: the node is
+		not in the LRU list */
+
+		node->n_pending++;
+	}
+}
+
+/************************************************************************
+Does the bookkeeping for a finished i/o operation: decrements the pending
+count of the node and, when the count reaches zero, puts the node to the
+start of the LRU list and decrements the system count of files with pending
+i/o. The caller must own the file system mutex. */
+static
+void
+fil_node_complete_io(
+/*=================*/
+	fil_node_t*	node,	/* in: file node */
+	fil_system_t*	system)	/* in: file system */
+{
+	ut_ad(node && system);
+	ut_ad(mutex_own(&(system->mutex)));
+	ut_a(node->n_pending > 0);
+
+	node->n_pending--;
+
+	if (node->n_pending > 0) {
+		/* Other i/os are still pending on this file: the node stays
+		out of the LRU list */
+
+		return;
+	}
+
+	/* That was the last pending i/o: the node goes back to the LRU
+	list */
+	UT_LIST_ADD_FIRST(LRU, system->LRU, node);
+
+	ut_a(system->n_open_pending > 0);
+
+	system->n_open_pending--;
+
+	if (system->max_n_open - 1 == system->n_open_pending) {
+		/* The count was at max_n_open before the decrement: signal
+		those waiting for the event */
+
+		os_event_set(system->can_open);
+	}
+}
+		
+/************************************************************************
+Reads or writes data. This operation is asynchronous (aio). The call waits
+while n_open_pending equals max_n_open, then looks up the space, finds the
+file node which contains block_offset, prepares the node for i/o and passes
+the request to os_aio. If sync is TRUE, the i/o is also completed here with
+fil_node_complete_io; otherwise fil_aio_wait completes it. It is an
+assertion failure if the space does not exist or the offset is outside
+it. */
+
+void
+fil_io(
+/*===*/
+	ulint	type,		/* in: OS_FILE_READ or OS_FILE_WRITE,
+				ORed to OS_FILE_LOG, if a log i/o
+				and ORed to OS_AIO_SIMULATED_WAKE_LATER
+				if simulated aio and we want to post a
+				batch of i/os; NOTE that a simulated batch
+				may introduce hidden chances of deadlocks,
+				because i/os are not actually handled until
+				all have been posted: use with great
+				caution! */
+	ibool	sync,		/* in: TRUE if synchronous aio is desired */
+	ulint	space_id,	/* in: space id */
+	ulint	block_offset,	/* in: offset in number of blocks */
+	ulint	byte_offset,	/* in: remainder of offset in bytes; in
+				aio this must be divisible by the OS block
+				size */
+	ulint	len,		/* in: how many bytes to read; this must
+				not cross a file boundary; in aio this must
+				be a block size multiple */
+	void*	buf,		/* in/out: buffer where to store read data
+				or from where to write; in aio this must be
+				appropriately aligned */
+	void*	message)	/* in: message for aio handler if non-sync
+				aio used, else ignored */
+{
+	ulint		mode;
+	fil_space_t*	space;
+	fil_node_t*	node;
+	ulint		offset_high;
+	ulint		offset_low;
+	fil_system_t*	system;
+	os_event_t	event;
+	ibool		ret;
+	ulint		is_log;
+	ulint		wake_later;
+
+	is_log = type & OS_FILE_LOG;
+	type = type & ~OS_FILE_LOG;
+
+	wake_later = type & OS_AIO_SIMULATED_WAKE_LATER;
+	type = type & ~OS_AIO_SIMULATED_WAKE_LATER;
+	
+	ut_ad(byte_offset < UNIV_PAGE_SIZE);
+	ut_ad(buf);
+	ut_ad(len > 0);
+	ut_ad((1 << UNIV_PAGE_SIZE_SHIFT) == UNIV_PAGE_SIZE);
+	ut_ad(fil_validate());
+#ifndef UNIV_LOG_DEBUG
+	/* ibuf bitmap pages must be read in the sync aio mode: */
+	ut_ad(recv_no_ibuf_operations || (type == OS_FILE_WRITE)
+		|| !ibuf_bitmap_page(block_offset) || sync || is_log);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!ibuf_inside() || is_log || (type == OS_FILE_WRITE)
+					|| ibuf_page(space_id, block_offset));
+#endif
+#endif
+	if (sync) {
+		mode = OS_AIO_SYNC;
+	} else if ((type == OS_FILE_READ) && !is_log
+				&& ibuf_page(space_id, block_offset)) {
+		mode = OS_AIO_IBUF;
+	} else if (is_log) {
+		mode = OS_AIO_LOG;
+	} else {
+		mode = OS_AIO_NORMAL;
+	}
+
+	system = fil_system;
+loop:
+	mutex_enter(&(system->mutex));
+	
+	if (system->n_open_pending == system->max_n_open) {
+
+		/* It is not sure we can open the file if it is closed: wait
+		for the can_open event with the mutex released, and then
+		check the condition again */
+
+		event = system->can_open;
+		os_event_reset(event);
+
+		mutex_exit(&(system->mutex));
+
+		os_event_wait(event);
+
+		goto loop;
+	}	 
+	
+	HASH_SEARCH(hash, system->spaces, space_id, space,
+						space->id == space_id);
+	ut_a(space);
+
+	ut_ad((mode != OS_AIO_IBUF) || (space->purpose == FIL_TABLESPACE));
+
+	node = UT_LIST_GET_FIRST(space->chain);
+
+	for (;;) {
+		ut_a(node);
+
+		if (node->size > block_offset) {
+			/* Found! block_offset is now relative to the start
+			of this file */
+			break;
+		} else {
+			block_offset -= node->size;
+			node = UT_LIST_GET_NEXT(chain, node);
+		}
+	}		
+	
+	/* Open file if closed */
+	fil_node_prepare_for_io(node, system, space);
+
+	/* Now we have made the changes in the data structures of system */
+	mutex_exit(&(system->mutex));
+
+	/* Calculate the low 32 bits and the high 32 bits of the file offset,
+	which is block_offset pages plus byte_offset bytes */
+
+	offset_high = (block_offset >> (32 - UNIV_PAGE_SIZE_SHIFT));
+	offset_low  = ((block_offset << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFF)
+			+ byte_offset;
+
+	ut_a(node->size - block_offset >=
+ 		(byte_offset + len + (UNIV_PAGE_SIZE - 1)) / UNIV_PAGE_SIZE);
+
+	/* Do aio: the byte offset and the length must both be multiples of
+	OS_FILE_LOG_BLOCK_SIZE */
+
+	ut_a(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0);
+	ut_a((len % OS_FILE_LOG_BLOCK_SIZE) == 0);
+
+	/* Queue the aio request */
+	ret = os_aio(type, mode | wake_later, node->name, node->handle, buf,
+				offset_low, offset_high, len, node, message);
+	ut_a(ret);
+
+	if (mode == OS_AIO_SYNC) {
+		/* The i/o operation is already completed when we return from
+		os_aio: */
+		
+		mutex_enter(&(system->mutex));
+
+		fil_node_complete_io(node, system);
+
+		mutex_exit(&(system->mutex));
+
+		ut_ad(fil_validate());
+	}
+}
+
+/************************************************************************
+Reads data from a space to a buffer: calls fil_io with the type
+OS_FILE_READ and the arguments passed on unchanged. Remember that the
+possible incomplete blocks at the end of file are ignored: they are not
+taken into account when calculating the byte offset within a space. */
+
+void
+fil_read(
+/*=====*/
+	ibool	sync,		/* in: TRUE if synchronous aio is desired */
+	ulint	space_id,	/* in: space id */
+	ulint	block_offset,	/* in: offset in number of blocks */
+	ulint	byte_offset,	/* in: remainder of offset in bytes; in aio
+				this must be divisible by the OS block size */
+	ulint	len,		/* in: how many bytes to read; this must not
+				cross a file boundary; in aio this must be a
+				block size multiple */
+	void*	buf,		/* in/out: buffer where to store data read;
+				in aio this must be appropriately aligned */
+	void*	message)	/* in: message for aio handler if non-sync
+				aio used, else ignored */
+{
+	fil_io(OS_FILE_READ,
+	       sync,
+	       space_id,
+	       block_offset,
+	       byte_offset,
+	       len,
+	       buf,
+	       message);
+}
+
+/************************************************************************
+Writes data to a space from a buffer: calls fil_io with the type
+OS_FILE_WRITE and the arguments passed on unchanged. Remember that the
+possible incomplete blocks at the end of file are ignored: they are not
+taken into account when calculating the byte offset within a space. */
+
+void
+fil_write(
+/*======*/
+	ibool	sync,		/* in: TRUE if synchronous aio is desired */
+	ulint	space_id,	/* in: space id */
+	ulint	block_offset,	/* in: offset in number of blocks */
+	ulint	byte_offset,	/* in: remainder of offset in bytes; in aio
+				this must be divisible by the OS block size */
+	ulint	len,		/* in: how many bytes to write; this must
+				not cross a file boundary; in aio this must
+				be a block size multiple */
+	void*	buf,		/* in: buffer from which to write; in aio
+				this must be appropriately aligned */
+	void*	message)	/* in: message for aio handler if non-sync
+				aio used, else ignored */
+{
+	fil_io(OS_FILE_WRITE,
+	       sync,
+	       space_id,
+	       block_offset,
+	       byte_offset,
+	       len,
+	       buf,
+	       message);
+}
+
+/**************************************************************************
+Waits for an aio operation to complete. This function is used to write the
+handler for completed requests. The aio array of pending requests is divided
+into segments (see os0file.c for more info). The thread specifies which
+segment it wants to wait for. When a request has completed, the i/o is
+marked complete on its file node with fil_node_complete_io, and the message
+of the request is passed to buf_page_io_complete or to log_io_complete. */
+
+void
+fil_aio_wait(
+/*=========*/
+	ulint	segment)	/* in: the number of the segment in the aio
+				array to wait for */ 
+{
+	ibool		ret;		
+	fil_node_t*	fil_node;
+	fil_system_t*	system		= fil_system;
+	void*		message;
+	
+	ut_ad(fil_validate());
+
+	if (os_aio_use_native_aio) {
+#ifdef WIN_ASYNC_IO
+		ret = os_aio_windows_handle(segment, 0, &fil_node, &message);
+#elif defined(POSIX_ASYNC_IO)
+		ret = os_aio_posix_handle(segment, &fil_node, &message);
+#else
+		ut_a(0);	/* native aio is requested, but neither
+				WIN_ASYNC_IO nor POSIX_ASYNC_IO is defined */
+#endif
+	} else {
+		ret = os_aio_simulated_handle(segment, (void**) &fil_node,
+	                                                    &message);
+	}
+	
+	ut_a(ret);
+	
+	mutex_enter(&(system->mutex));
+
+	fil_node_complete_io(fil_node, fil_system);
+
+	mutex_exit(&(system->mutex));
+
+	ut_ad(fil_validate());
+
+	/* Do the i/o handling: a message for which buf_pool_is_block holds
+	is passed to buf_page_io_complete, any other message to
+	log_io_complete */
+
+	if (buf_pool_is_block(message)) {
+	
+		buf_page_io_complete(message);
+	} else {
+		log_io_complete(message);
+	}
+}
+
+/**************************************************************************
+Flushes to disk possible writes cached by the OS: calls os_file_flush on
+each open file in the chain of the space. The file system mutex is released
+for the duration of each flush. */
+
+void
+fil_flush(
+/*======*/
+	ulint	space_id)	/* in: file space id (this can be a group of
+				log files or a tablespace of the database) */
+{
+	fil_system_t*	system	= fil_system;
+	fil_space_t*	space;
+	fil_node_t*	node;
+	os_file_t	handle;
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, space_id, space,
+						space->id == space_id);
+	ut_a(space);
+
+	for (node = UT_LIST_GET_FIRST(space->chain);
+	     node;
+	     node = UT_LIST_GET_NEXT(chain, node)) {
+
+		if (!node->open) {
+			/* Nothing to flush for a closed file */
+
+			continue;
+		}
+
+		handle = node->handle;
+
+		mutex_exit(&(system->mutex));
+
+		/* Note that it is not certain, when we have
+		released the mutex above, that the file of the
+		handle is still open: we assume that the OS
+		will not crash or trap even if we pass a handle
+		to a closed file below in os_file_flush! */
+
+		os_file_flush(handle);
+
+		mutex_enter(&(system->mutex));
+	}
+
+	mutex_exit(&(system->mutex));
+}
+
+/**************************************************************************
+Calls fil_flush for each file space whose purpose equals the given one, to
+flush to disk writes possibly cached by the OS. The file system mutex is
+released for the duration of each fil_flush call. */
+
+void
+fil_flush_file_spaces(
+/*==================*/
+	ulint	purpose)	/* in: FIL_TABLESPACE, FIL_LOG */
+{
+	fil_system_t*	system	= fil_system;
+	fil_space_t*	space;
+
+	mutex_enter(&(system->mutex));
+
+	for (space = UT_LIST_GET_FIRST(system->space_list);
+	     space;
+	     space = UT_LIST_GET_NEXT(space_list, space)) {
+
+		if (space->purpose != purpose) {
+
+			continue;
+		}
+
+		/* fil_flush takes the mutex itself */
+		mutex_exit(&(system->mutex));
+
+		fil_flush(space->id);
+
+		mutex_enter(&(system->mutex));
+	}
+
+	mutex_exit(&(system->mutex));
+}
+
+/**********************************************************************
+Checks the consistency of the file system: every file with pending i/o must
+be open, the number of such files must equal n_open_pending, and every
+file in the LRU list must be open and have no pending i/o. A violation is
+an assertion failure. */
+
+ibool
+fil_validate(void)
+/*==============*/
+			/* out: TRUE if ok */
+{
+	fil_system_t*	system		= fil_system;
+	fil_space_t*	space;
+	fil_node_t*	node;
+	ulint		n_pending_files	= 0;
+	ulint		cell;
+
+	mutex_enter(&(system->mutex));
+
+	/* Go through all spaces in the hash table, cell by cell, and count
+	the files with pending i/o */
+
+	for (cell = 0; cell < hash_get_n_cells(system->spaces); cell++) {
+
+		for (space = HASH_GET_FIRST(system->spaces, cell);
+		     space != NULL;
+		     space = HASH_GET_NEXT(hash, space)) {
+
+			UT_LIST_VALIDATE(chain, fil_node_t, space->chain);
+
+			for (node = UT_LIST_GET_FIRST(space->chain);
+			     node != NULL;
+			     node = UT_LIST_GET_NEXT(chain, node)) {
+
+				if (node->n_pending > 0) {
+
+					n_pending_files++;
+					ut_a(node->open);
+				}
+			}
+		}
+	}
+
+	ut_a(n_pending_files == system->n_open_pending);
+
+	/* Then check the LRU list */
+
+	UT_LIST_VALIDATE(LRU, fil_node_t, system->LRU);
+
+	for (node = UT_LIST_GET_FIRST(system->LRU);
+	     node != NULL;
+	     node = UT_LIST_GET_NEXT(LRU, node)) {
+
+		ut_a(node->n_pending == 0);
+		ut_a(node->open);
+	}
+
+	mutex_exit(&(system->mutex));
+
+	return(TRUE);
+}
+
+/************************************************************************
+Tells if a file address is undefined, that is, if its page field is
+FIL_NULL. */
+ibool
+fil_addr_is_null(
+/*=============*/
+				/* out: TRUE if undefined */
+	fil_addr_t	addr)	/* in: address */
+{
+	return((addr.page == FIL_NULL) ? TRUE : FALSE);
+}
+
+/************************************************************************
+Accessor functions for a file page: fil_page_get_prev returns the 4-byte
+value stored at the offset FIL_PAGE_PREV of the page. */
+
+ulint
+fil_page_get_prev(byte*	page)
+{
+	byte*	field	= page + FIL_PAGE_PREV;
+
+	return(mach_read_from_4(field));
+}
+
+ulint
+fil_page_get_next(byte*	page)
+{
+	/* The value is the 4-byte field at the offset FIL_PAGE_NEXT */
+	byte*	field	= page + FIL_PAGE_NEXT;
+
+	return(mach_read_from_4(field));
+}
+
+/*************************************************************************
+Sets the file page type: writes the type as a 2-byte value to the offset
+FIL_PAGE_TYPE of the page. */
+
+void
+fil_page_set_type(
+/*==============*/
+	byte* 	page,	/* in: file page */
+	ulint	type)	/* in: type */
+{
+	ut_ad(page);
+
+	/* NOTE(review): the original assertion compared type to
+	FIL_PAGE_INDEX twice, joined with ||; the second operand was
+	presumably meant to be another page type - confirm which one and
+	add it here. Until then the redundant operand is dropped, which
+	leaves the accepted values unchanged. */
+	ut_ad(type == FIL_PAGE_INDEX);
+
+	mach_write_to_2(page + FIL_PAGE_TYPE, type);
+}
+
+/*************************************************************************
+Gets the file page type: reads the 2-byte value at the offset FIL_PAGE_TYPE
+of the page. */
+
+ulint
+fil_page_get_type(
+/*==============*/
+			/* out: type; NOTE that if the type has not been
+			written to page, the return value not defined */
+	byte* 	page)	/* in: file page */
+{
+	byte*	field;
+
+	ut_ad(page);
+
+	field = page + FIL_PAGE_TYPE;
+
+	return(mach_read_from_2(field));
+}