summaryrefslogtreecommitdiff
path: root/innobase/fil/fil0fil.c
diff options
context:
space:
mode:
Diffstat (limited to 'innobase/fil/fil0fil.c')
-rw-r--r--innobase/fil/fil0fil.c1326
1 files changed, 1326 insertions, 0 deletions
diff --git a/innobase/fil/fil0fil.c b/innobase/fil/fil0fil.c
new file mode 100644
index 00000000000..dcb9698aa27
--- /dev/null
+++ b/innobase/fil/fil0fil.c
@@ -0,0 +1,1326 @@
+/******************************************************
+The low-level file system
+
+(c) 1995 Innobase Oy
+
+Created 10/25/1995 Heikki Tuuri
+*******************************************************/
+
+#include "fil0fil.h"
+
+#include "mem0mem.h"
+#include "sync0sync.h"
+#include "hash0hash.h"
+#include "os0file.h"
+#include "os0sync.h"
+#include "mach0data.h"
+#include "ibuf0ibuf.h"
+#include "buf0buf.h"
+#include "log0log.h"
+#include "log0recv.h"
+#include "fsp0fsp.h"
+
+/*
+ IMPLEMENTATION OF THE LOW-LEVEL FILE SYSTEM
+ ===========================================
+
+The file system is responsible for providing fast read/write access to
+tablespaces and logs of the database. File creation and deletion is done
+in other modules which know more of the logic of the operation, however.
+
+A tablespace consists of a chain of files. The size of the files does not
+have to be divisible by the database block size, because we may just leave
+the last incomplete block unused. When a new file is appended to the
+tablespace, the maximum size of the file is also specified. At the moment,
+we think that it is best to extend the file to its maximum size already at
+the creation of the file, because then we can avoid dynamically extending
+the file when more space is needed for the tablespace.
+
+A block's position in the tablespace is specified with a 32-bit unsigned
+integer. The files in the chain are thought to be catenated, and the block
+corresponding to an address n is the nth block in the catenated file (where
+the first block is named the 0th block, and the incomplete block fragments
+at the end of files are not taken into account). A tablespace can be extended
+by appending a new file at the end of the chain.
+
+Our tablespace concept is similar to the one of Oracle.
+
+To acquire more speed in disk transfers, a technique called disk striping is
+sometimes used. This means that logical block addresses are divided in a
+round-robin fashion across several disks. Windows NT supports disk striping,
+so there we do not need to support it in the database. Disk striping is
+implemented in hardware in RAID disks. We conclude that it is not necessary
+to implement it in the database. Oracle 7 does not support disk striping,
+either.
+
+Another trick used at some database sites is replacing tablespace files by
+raw disks, that is, the whole physical disk drive, or a partition of it, is
+opened as a single file, and it is accessed through byte offsets calculated
+from the start of the disk or the partition. This is recommended in some
+books on database tuning to achieve more speed in i/o. Using raw disk
+certainly prevents the OS from fragmenting disk space, but it is not clear
+if it really adds speed. We measured on the Pentium 100 MHz + NT + NTFS file
+system + EIDE Conner disk only a negligible difference in speed when reading
+from a file, versus reading from a raw disk.
+
+To have fast access to a tablespace or a log file, we put the data structures
+in a hash table. Each tablespace and log file is given a unique 32-bit
+identifier.
+
+Some operating systems do not support many open files at the same time,
+though NT seems to tolerate at least 900 open files. Therefore, we put the
+open files in an LRU-list. If we need to open another file, we may close the
+file at the end of the LRU-list. When an i/o-operation is pending on a file,
+the file cannot be closed. We take the file nodes with pending i/o-operations
+out of the LRU-list and keep a count of pending operations. When an operation
+completes, we decrement the count and return the file node to the LRU-list if
+the count drops to zero. */
+
+/* Null file address: fil_addr_is_null() treats any address whose page field
+equals FIL_NULL as null */
+fil_addr_t fil_addr_null = {FIL_NULL, 0};
+
+/* File system file node data structure: one node describes one file in the
+file chain of a space */
+typedef struct fil_node_struct fil_node_t;
+struct fil_node_struct {
+ char* name; /* the file name or path; a private copy allocated
+ in fil_node_create and freed in fil_node_free */
+ ibool open; /* TRUE if file open */
+ os_file_t handle; /* OS handle to the file, if file open */
+ ulint size; /* size of the file in database blocks
+ (where the possible last incomplete block
+ is ignored) */
+ ulint n_pending;
+ /* count of pending i/o-ops on this file */
+ UT_LIST_NODE_T(fil_node_t) chain;
+ /* link field for the file chain */
+ UT_LIST_NODE_T(fil_node_t) LRU;
+ /* link field for the LRU list; the node is in
+ the LRU list only while it is open and has no
+ pending i/o-ops */
+ ulint magic_n;
+ /* set to FIL_NODE_MAGIC_N in fil_node_create
+ and asserted in fil_node_free */
+};
+
+/* Value stored in fil_node_struct.magic_n to detect corrupt node objects */
+#define FIL_NODE_MAGIC_N 89389
+
+/* File system tablespace or log data structure: let us call them by a common
+name space */
+struct fil_space_struct {
+ char* name; /* space name; a private copy allocated in
+ fil_space_create and freed in fil_space_free */
+ ulint id; /* space id */
+ ulint purpose;/* FIL_TABLESPACE, FIL_LOG, or FIL_ARCH_LOG */
+ UT_LIST_BASE_NODE_T(fil_node_t) chain;
+ /* base node for the file chain */
+ ulint size; /* space size in pages: the sum of the sizes of
+ the file nodes in the chain */
+ ulint n_reserved_extents;
+ /* number of reserved free extents for
+ ongoing operations like B-tree page split */
+ hash_node_t hash; /* hash chain node */
+ rw_lock_t latch; /* latch protecting the file space storage
+ allocation */
+ UT_LIST_NODE_T(fil_space_t) space_list;
+ /* list of all spaces */
+ ibuf_data_t* ibuf_data;
+ /* insert buffer data; NULL until
+ fil_ibuf_init_at_db_start sets it for a
+ space whose purpose is FIL_TABLESPACE */
+ ulint magic_n;
+ /* set to FIL_SPACE_MAGIC_N in fil_space_create
+ and asserted in fil_space_free */
+};
+
+/* Value stored in fil_space_struct.magic_n to detect corrupt space objects */
+#define FIL_SPACE_MAGIC_N 89472
+
+/* The file system data structure */
+
+typedef struct fil_system_struct fil_system_t;
+struct fil_system_struct {
+ mutex_t mutex; /* The mutex protecting the system */
+ hash_table_t* spaces; /* The hash table of spaces in the
+ system */
+ UT_LIST_BASE_NODE_T(fil_node_t) LRU;
+ /* base node for the LRU list of the
+ most recently used open files; it
+ holds the open files which have no
+ pending i/o-ops: a file is added to
+ the start of the list when its last
+ pending i/o-op completes, and the file
+ at the end of the list is the one
+ closed when room is needed */
+ ulint n_open_pending; /* current number of open files with
+ pending i/o-ops on them */
+ ulint max_n_open; /* maximum allowed open files;
+ fil_reserve_right_to_open decrements
+ and fil_release_right_to_open
+ increments this value */
+ os_event_t can_open; /* this event is set to the signaled
+ state when the system is capable of
+ opening a new file, i.e.,
+ n_open_pending < max_n_open */
+ UT_LIST_BASE_NODE_T(fil_space_t) space_list;
+ /* list of all file spaces */
+};
+
+/* The file system. This variable is NULL before the module is initialized;
+fil_init creates the object and stores it here. */
+fil_system_t* fil_system = NULL;
+
+/* The file system hash table size: fil_init passes this value to
+fil_system_create as the size of the hash table of spaces */
+#define FIL_SYSTEM_HASH_SIZE 500
+
+
+/***********************************************************************
+Reserves a right to open a single file by lowering the limit of open files
+by one. If the number of files with pending i/o currently equals the limit,
+waits on the can_open event and retries. The right must be released with
+fil_release_right_to_open. */
+
+void
+fil_reserve_right_to_open(void)
+/*===========================*/
+{
+	for (;;) {
+		mutex_enter(&(fil_system->mutex));
+
+		if (fil_system->n_open_pending != fil_system->max_n_open) {
+			/* There is room: leave the loop holding the mutex */
+			break;
+		}
+
+		/* It is not sure we can open the file if it is closed:
+		reset the event while still holding the mutex, then wait
+		for it without the mutex and check again */
+
+		os_event_reset(fil_system->can_open);
+		mutex_exit(&(fil_system->mutex));
+		os_event_wait(fil_system->can_open);
+	}
+
+	fil_system->max_n_open--;
+
+	mutex_exit(&(fil_system->mutex));
+}
+
+/***********************************************************************
+Releases a right to open a single file: raises the limit of open files by
+one and signals the can_open event if the limit had been reached. */
+
+void
+fil_release_right_to_open(void)
+/*===========================*/
+{
+	ibool	limit_was_reached;
+
+	mutex_enter(&(fil_system->mutex));
+
+	/* Waiting is only done when the pending count equals the limit, so
+	that is the only situation where a wakeup is needed */
+	limit_was_reached = (fil_system->n_open_pending
+					== fil_system->max_n_open);
+	fil_system->max_n_open++;
+
+	if (limit_was_reached) {
+		os_event_set(fil_system->can_open);
+	}
+
+	mutex_exit(&(fil_system->mutex));
+}
+
+/***********************************************************************
+Returns the latch of a file space. The space must exist: an unknown id
+fails an assertion instead of dereferencing a NULL pointer. */
+
+rw_lock_t*
+fil_space_get_latch(
+/*================*/
+			/* out: latch protecting storage allocation */
+	ulint	id)	/* in: space id */
+{
+	fil_space_t*	space;
+	fil_system_t*	system	= fil_system;
+
+	ut_ad(system);
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	mutex_exit(&(system->mutex));
+
+	/* Same check as fil_io and fil_flush make after their lookups */
+	ut_a(space);
+
+	/* Only the address of the latch is taken here; no field of the
+	space is read outside the mutex */
+	return(&(space->latch));
+}
+
+/***********************************************************************
+Returns the type of a file space. The space must exist: an unknown id fails
+an assertion instead of dereferencing a NULL pointer. */
+
+ulint
+fil_space_get_type(
+/*===============*/
+			/* out: FIL_TABLESPACE or FIL_LOG */
+	ulint	id)	/* in: space id */
+{
+	fil_space_t*	space;
+	fil_system_t*	system	= fil_system;
+	ulint		purpose;
+
+	ut_ad(system);
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	/* Same check as fil_io and fil_flush make after their lookups */
+	ut_a(space);
+
+	/* Read the field while the mutex still protects the space object */
+	purpose = space->purpose;
+
+	mutex_exit(&(system->mutex));
+
+	return(purpose);
+}
+
+/***********************************************************************
+Returns the ibuf data of a file space. The space must exist: an unknown id
+fails an assertion instead of dereferencing a NULL pointer. */
+
+ibuf_data_t*
+fil_space_get_ibuf_data(
+/*====================*/
+			/* out: ibuf data for this space; NULL if
+			it has not been set for the space */
+	ulint	id)	/* in: space id */
+{
+	fil_space_t*	space;
+	fil_system_t*	system	= fil_system;
+	ibuf_data_t*	ibuf_data;
+
+	ut_ad(system);
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	/* Same check as fil_io and fil_flush make after their lookups */
+	ut_a(space);
+
+	/* Read the field while the mutex still protects the space object */
+	ibuf_data = space->ibuf_data;
+
+	mutex_exit(&(system->mutex));
+
+	return(ibuf_data);
+}
+
+/***********************************************************************
+Appends a new file to the chain of files of a space. File must be closed.
+The space must already exist in the file system: an unknown id fails an
+assertion. The space size grows by the size of the new file. */
+
+void
+fil_node_create(
+/*============*/
+	char*	name,	/* in: file name (file must be closed); the name
+			is copied, the caller keeps its own string */
+	ulint	size,	/* in: file size in database blocks, rounded downwards
+			to an integer */
+	ulint	id)	/* in: space id where to append */
+{
+	fil_node_t*	node;
+	fil_space_t*	space;
+	char*		name2;
+	fil_system_t*	system	= fil_system;
+
+	ut_a(system);
+	ut_a(name);
+	ut_a(size > 0);
+
+	mutex_enter(&(system->mutex));
+
+	/* Find the target space first, so that nothing is allocated for a
+	space that does not exist */
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	ut_a(space);
+
+	node = mem_alloc(sizeof(fil_node_t));
+
+	name2 = mem_alloc(ut_strlen(name) + 1);
+
+	ut_strcpy(name2, name);
+
+	/* The node starts closed with no pending i/o: the handle is only
+	assigned when fil_node_prepare_for_io opens the file */
+	node->name = name2;
+	node->open = FALSE;
+	node->size = size;
+	node->magic_n = FIL_NODE_MAGIC_N;
+	node->n_pending = 0;
+
+	space->size += size;
+
+	UT_LIST_ADD_LAST(chain, space->chain, node);
+
+	mutex_exit(&(system->mutex));
+}
+
+/**************************************************************************
+Closes the OS file of a node and takes the node off the LRU list. The node
+must be open and must not have pending i/o; closing must succeed. The caller
+is expected to hold the file system mutex (checked in debug builds). */
+static
+void
+fil_node_close(
+/*===========*/
+	fil_node_t*	node,	/* in: file node */
+	fil_system_t*	system)	/* in: file system */
+{
+	ibool	closed;
+
+	ut_ad(node && system);
+	ut_ad(mutex_own(&(system->mutex)));
+	ut_a(node->open);
+	ut_a(node->n_pending == 0);
+
+	closed = os_file_close(node->handle);
+	ut_a(closed);
+
+	/* An open node without pending i/o is in the LRU list: mark it
+	closed and unlink it */
+	node->open = FALSE;
+
+	UT_LIST_REMOVE(LRU, system->LRU, node);
+}
+
+/***********************************************************************
+Frees a file node object from a file system: closes the file if it is open,
+unlinks the node from the file chain of the space, reduces the space size by
+the node size, and frees the name copy and the node itself. */
+static
+void
+fil_node_free(
+/*==========*/
+	fil_node_t*	node,	/* in, own: file node */
+	fil_system_t*	system,	/* in: file system */
+	fil_space_t*	space)	/* in: space where the file node is chained */
+{
+	ut_ad(node && system && space);
+	ut_ad(mutex_own(&(system->mutex)));
+	ut_a(node->magic_n == FIL_NODE_MAGIC_N);
+
+	if (node->open) {
+		/* fil_node_close asserts that there is no pending i/o */
+		fil_node_close(node, system);
+	}
+
+	UT_LIST_REMOVE(chain, space->chain, node);
+
+	space->size -= node->size;
+
+	mem_free(node->name);
+	mem_free(node);
+}
+
+/********************************************************************
+Drops files from the start of a file space, so that its size is cut by
+the amount given. The amount is compared against node sizes multiplied by
+UNIV_PAGE_SIZE, i.e., it is given in bytes. */
+
+void
+fil_space_truncate_start(
+/*=====================*/
+	ulint	id,		/* in: space id */
+	ulint	trunc_len)	/* in: truncate by this much; it is an error
+				if this does not equal to the combined size of
+				some initial files in the space */
+{
+	fil_node_t*	node;
+	fil_space_t*	space;
+	fil_system_t*	system	= fil_system;
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	ut_a(space);
+
+	while (trunc_len > 0) {
+
+		node = UT_LIST_GET_FIRST(space->chain);
+
+		/* Running out of files means trunc_len was larger than the
+		whole space */
+		ut_a(node);
+
+		/* Each dropped file must fit entirely in what remains to be
+		truncated: this lets trunc_len span several files and keeps
+		the unsigned subtraction below from wrapping around when
+		trunc_len ends in the middle of a file */
+		ut_a(node->size * UNIV_PAGE_SIZE <= trunc_len);
+
+		trunc_len -= node->size * UNIV_PAGE_SIZE;
+
+		fil_node_free(node, system, space);
+	}
+
+	mutex_exit(&(system->mutex));
+}
+
+/********************************************************************
+Creates a file system object: an empty hash table of spaces, empty LRU and
+space lists, no pending files, and the given limit of open files. */
+static
+fil_system_t*
+fil_system_create(
+/*==============*/
+				/* out, own: file system object */
+	ulint	hash_size,	/* in: hash table size */
+	ulint	max_n_open)	/* in: maximum number of open files */
+{
+	fil_system_t*	new_system;
+
+	ut_a(hash_size > 0);
+	ut_a(max_n_open > 0);
+
+	new_system = mem_alloc(sizeof(fil_system_t));
+
+	/* Synchronization objects */
+	mutex_create(&(new_system->mutex));
+	mutex_set_level(&(new_system->mutex), SYNC_ANY_LATCH);
+
+	/* Containers */
+	new_system->spaces = hash_create(hash_size);
+	UT_LIST_INIT(new_system->LRU);
+
+	/* Bookkeeping of open files */
+	new_system->n_open_pending = 0;
+	new_system->max_n_open = max_n_open;
+	new_system->can_open = os_event_create(NULL);
+
+	UT_LIST_INIT(new_system->space_list);
+
+	return(new_system);
+}
+
+/********************************************************************
+Initializes the file system of this module: creates the global file system
+object with a hash table of FIL_SYSTEM_HASH_SIZE cells. May be called only
+while the global object does not exist yet. */
+
+void
+fil_init(
+/*=====*/
+	ulint	max_n_open)	/* in: max number of open files */
+{
+	/* A second initialization would lose the existing object */
+	ut_a(fil_system == NULL);
+
+	fil_system = fil_system_create(FIL_SYSTEM_HASH_SIZE, max_n_open);
+}
+
+/********************************************************************
+Initializes the ibuf data of each tablespace: goes through the list of all
+file spaces and stores the result of ibuf_data_init_for_space in every space
+whose purpose is FIL_TABLESPACE. The file system mutex is not acquired
+here. */
+
+void
+fil_ibuf_init_at_db_start(void)
+/*===========================*/
+{
+	fil_space_t*	space;
+
+	for (space = UT_LIST_GET_FIRST(fil_system->space_list);
+	     space;
+	     space = UT_LIST_GET_NEXT(space_list, space)) {
+
+		if (space->purpose != FIL_TABLESPACE) {
+			/* Other kinds of spaces get no ibuf data */
+			continue;
+		}
+
+		space->ibuf_data = ibuf_data_init_for_space(space->id);
+	}
+}
+
+/********************************************************************
+Writes the flushed lsn and the latest archived log number to the page
+header of the first page of a data file. The page is read synchronously,
+the two fields are overwritten in the buffer, and the page is written back
+synchronously. */
+static
+ulint
+fil_write_lsn_and_arch_no_to_file(
+/*==============================*/
+				/* out: DB_SUCCESS; no other value is
+				currently returned */
+	ulint	space_id,	/* in: space number */
+	ulint	sum_of_sizes,	/* in: combined size of previous files in space,
+				in database pages */
+	dulint	lsn,		/* in: lsn to write */
+	ulint	arch_log_no)	/* in: archived log number to write */
+{
+	byte*	buf1;
+	byte*	buf;
+
+	/* Allocate two pages so that a page-aligned page fits inside */
+	buf1 = mem_alloc(2 * UNIV_PAGE_SIZE);
+	buf = ut_align(buf1, UNIV_PAGE_SIZE);
+
+	fil_read(TRUE, space_id, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL);
+
+	mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn);
+	mach_write_to_4(buf + FIL_PAGE_ARCH_LOG_NO, arch_log_no);
+
+	fil_write(TRUE, space_id, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL);
+
+	/* Both i/o calls above were synchronous, so the buffer is no longer
+	in use: free the original allocation, not the aligned pointer */
+	mem_free(buf1);
+
+	return(DB_SUCCESS);
+}
+
+/********************************************************************
+Writes the flushed lsn and the latest archived log number to the page
+header of the first page of each data file, i.e., of each file in each space
+whose purpose is FIL_TABLESPACE. Stops at the first file for which the write
+reports an error. */
+
+ulint
+fil_write_flushed_lsn_to_data_files(
+/*================================*/
+				/* out: DB_SUCCESS or error number */
+	dulint	lsn,		/* in: lsn to write */
+	ulint	arch_log_no)	/* in: latest archived log file number */
+{
+	fil_space_t*	space;
+	fil_node_t*	node;
+	ulint		sum_of_sizes;
+	ulint		space_id;
+	ulint		err;
+
+	mutex_enter(&(fil_system->mutex));
+
+	space = UT_LIST_GET_FIRST(fil_system->space_list);
+
+	while (space) {
+		if (space->purpose == FIL_TABLESPACE) {
+			sum_of_sizes = 0;
+
+			/* Copy the id while the mutex is held: it is needed
+			below after the mutex has been released */
+			space_id = space->id;
+
+			node = UT_LIST_GET_FIRST(space->chain);
+
+			while (node) {
+				/* The write goes through fil_io, which
+				acquires the file system mutex itself:
+				release the mutex around the call */
+				mutex_exit(&(fil_system->mutex));
+
+				err = fil_write_lsn_and_arch_no_to_file(
+							space_id,
+							sum_of_sizes,
+							lsn, arch_log_no);
+				if (err != DB_SUCCESS) {
+					/* The mutex is not held here */
+
+					return(err);
+				}
+
+				mutex_enter(&(fil_system->mutex));
+
+				/* The first page of the next file lies right
+				after the pages of the files handled so far */
+				sum_of_sizes += node->size;
+
+				node = UT_LIST_GET_NEXT(chain, node);
+			}
+		}
+
+		space = UT_LIST_GET_NEXT(space_list, space);
+	}
+
+	mutex_exit(&(fil_system->mutex));
+
+	/* Every data file was handled without an error */
+	return(DB_SUCCESS);
+}
+
+/***********************************************************************
+Reads the flushed lsn and arch no fields from the first page of a data file
+at database startup, and folds them into running minimum and maximum values.
+When one_read_already is FALSE, the four output values are simply set to the
+values read from this file. */
+
+void
+fil_read_flushed_lsn_and_arch_log_no(
+/*=================================*/
+	os_file_t	data_file,	/* in: open data file */
+	ibool		one_read_already,
+					/* in: TRUE if min and max parameters
+					below already contain sensible data */
+	dulint*		min_flushed_lsn,/* in/out: smallest lsn seen */
+	ulint*		min_arch_log_no,/* in/out: smallest arch no seen */
+	dulint*		max_flushed_lsn,/* in/out: largest lsn seen */
+	ulint*		max_arch_log_no)/* in/out: largest arch no seen */
+{
+	byte*	page;
+	dulint	lsn_in_file;
+	ulint	arch_no_in_file;
+
+	page = ut_malloc(UNIV_PAGE_SIZE);
+
+	/* Read one page from offset 0 of the file */
+	os_file_read(data_file, page, 0, 0, UNIV_PAGE_SIZE);
+
+	lsn_in_file = mach_read_from_8(page + FIL_PAGE_FILE_FLUSH_LSN);
+	arch_no_in_file = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO);
+
+	ut_free(page);
+
+	/* In each test below the old value is looked at only when
+	one_read_already is TRUE, i.e., when it holds sensible data */
+
+	if (!one_read_already
+	    || ut_dulint_cmp(*min_flushed_lsn, lsn_in_file) > 0) {
+		*min_flushed_lsn = lsn_in_file;
+	}
+
+	if (!one_read_already
+	    || ut_dulint_cmp(*max_flushed_lsn, lsn_in_file) < 0) {
+		*max_flushed_lsn = lsn_in_file;
+	}
+
+	if (!one_read_already || *min_arch_log_no > arch_no_in_file) {
+		*min_arch_log_no = arch_no_in_file;
+	}
+
+	if (!one_read_already || *max_arch_log_no < arch_no_in_file) {
+		*max_arch_log_no = arch_no_in_file;
+	}
+}
+
+/***********************************************************************
+Creates a space object and puts it to the file system: the new space has an
+empty file chain, size 0, no reserved extents and no ibuf data, and it is
+added both to the hash table of spaces and to the list of all spaces. */
+
+void
+fil_space_create(
+/*=============*/
+	char*	name,	/* in: space name; the name is copied */
+	ulint	id,	/* in: space id */
+	ulint	purpose)/* in: FIL_TABLESPACE, or FIL_LOG if log */
+{
+	fil_space_t*	new_space;
+	char*		name_copy;
+	fil_system_t*	system	= fil_system;
+
+	ut_a(system);
+	ut_a(name);
+
+#ifndef UNIV_BASIC_LOG_DEBUG
+	/* Spaces with an odd id number are reserved to replicate spaces
+	used in log debugging */
+
+	ut_a((purpose == FIL_LOG) || (id % 2 == 0));
+#endif
+	mutex_enter(&(system->mutex));
+
+	new_space = mem_alloc(sizeof(fil_space_t));
+
+	name_copy = mem_alloc(ut_strlen(name) + 1);
+	ut_strcpy(name_copy, name);
+
+	/* Identity of the space */
+	new_space->magic_n = FIL_SPACE_MAGIC_N;
+	new_space->name = name_copy;
+	new_space->id = id;
+	new_space->purpose = purpose;
+
+	/* The space is empty until files are appended to it */
+	new_space->size = 0;
+	new_space->n_reserved_extents = 0;
+	new_space->ibuf_data = NULL;
+	UT_LIST_INIT(new_space->chain);
+
+	rw_lock_create(&(new_space->latch));
+	rw_lock_set_level(&(new_space->latch), SYNC_FSP);
+
+	/* Publish the space in the file system */
+	HASH_INSERT(fil_space_t, hash, system->spaces, id, new_space);
+	UT_LIST_ADD_LAST(space_list, system->space_list, new_space);
+
+	mutex_exit(&(system->mutex));
+}
+
+/***********************************************************************
+Frees a space object from a file system. Closes the files in the chain
+but does not delete them. The space must exist: an unknown id fails an
+assertion before anything is modified. */
+
+void
+fil_space_free(
+/*===========*/
+	ulint	id)	/* in: space id */
+{
+	fil_space_t*	space;
+	fil_node_t*	fil_node;
+	fil_system_t*	system	= fil_system;
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	/* Validate the object before it is unlinked from the hash table and
+	the space list, not after */
+	ut_a(space);
+	ut_a(space->magic_n == FIL_SPACE_MAGIC_N);
+
+	HASH_DELETE(fil_space_t, hash, system->spaces, id, space);
+
+	UT_LIST_REMOVE(space_list, system->space_list, space);
+
+	fil_node = UT_LIST_GET_FIRST(space->chain);
+
+	ut_d(UT_LIST_VALIDATE(chain, fil_node_t, space->chain));
+
+	/* fil_node_free unlinks the node from the chain, so the first node
+	of the chain is fetched again on every round */
+	while (fil_node != NULL) {
+		fil_node_free(fil_node, system, space);
+
+		fil_node = UT_LIST_GET_FIRST(space->chain);
+	}
+
+	ut_d(UT_LIST_VALIDATE(chain, fil_node_t, space->chain));
+	ut_ad(0 == UT_LIST_GET_LEN(space->chain));
+
+	mutex_exit(&(system->mutex));
+
+	/* The space was already removed from the hash table and the list
+	above, so it is freed without holding the mutex */
+	mem_free(space->name);
+	mem_free(space);
+}
+
+/***********************************************************************
+Returns the size of the space in pages. The space must exist: an unknown id
+fails an assertion instead of dereferencing a NULL pointer. */
+
+ulint
+fil_space_get_size(
+/*===============*/
+			/* out: space size */
+	ulint	id)	/* in: space id */
+{
+	fil_space_t*	space;
+	fil_system_t*	system	= fil_system;
+	ulint		size;
+
+	ut_ad(system);
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	/* Same check as fil_io and fil_flush make after their lookups */
+	ut_a(space);
+
+	size = space->size;
+
+	mutex_exit(&(system->mutex));
+
+	return(size);
+}
+
+/***********************************************************************
+Tries to reserve free extents in a file space. The reservation succeeds if
+the extents already reserved plus the requested ones do not exceed the
+number of extents free now; on failure nothing is changed. The space must
+exist: an unknown id fails an assertion. */
+
+ibool
+fil_space_reserve_free_extents(
+/*===========================*/
+				/* out: TRUE if succeed */
+	ulint	id,		/* in: space id */
+	ulint	n_free_now,	/* in: number of free extents now */
+	ulint	n_to_reserve)	/* in: how many one wants to reserve */
+{
+	fil_space_t*	space;
+	fil_system_t*	system	= fil_system;
+	ibool		success;
+
+	ut_ad(system);
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	/* Same check as fil_io and fil_flush make after their lookups */
+	ut_a(space);
+
+	if (space->n_reserved_extents + n_to_reserve > n_free_now) {
+		success = FALSE;
+	} else {
+		space->n_reserved_extents += n_to_reserve;
+		success = TRUE;
+	}
+
+	mutex_exit(&(system->mutex));
+
+	return(success);
+}
+
+/***********************************************************************
+Releases free extents in a file space. It is an error to release more
+extents than are currently reserved in the space. The space must exist: an
+unknown id fails an assertion. */
+
+void
+fil_space_release_free_extents(
+/*===========================*/
+	ulint	id,		/* in: space id */
+	ulint	n_reserved)	/* in: how many one reserved */
+{
+	fil_space_t*	space;
+	fil_system_t*	system	= fil_system;
+
+	ut_ad(system);
+
+	mutex_enter(&(system->mutex));
+
+	HASH_SEARCH(hash, system->spaces, id, space, space->id == id);
+
+	/* Same check as fil_io and fil_flush make after their lookups */
+	ut_a(space);
+
+	/* Guards the unsigned subtraction below against wrapping around */
+	ut_a(space->n_reserved_extents >= n_reserved);
+
+	space->n_reserved_extents -= n_reserved;
+
+	mutex_exit(&(system->mutex));
+}
+
+/************************************************************************
+Prepares a file node for i/o. Opens the file if it is closed. Updates the
+pending i/o's field in the node and the system appropriately. Takes the node
+off the LRU list if it is in the LRU list. The caller is expected to hold
+the file system mutex (checked in debug builds). On return the node is open
+and its pending count has grown by one. */
+static
+void
+fil_node_prepare_for_io(
+/*====================*/
+ fil_node_t* node, /* in: file node */
+ fil_system_t* system, /* in: file system */
+ fil_space_t* space) /* in: space */
+{
+ ibool ret;
+ fil_node_t* last_node;
+
+ ut_ad(node && system && space);
+ ut_ad(mutex_own(&(system->mutex)));
+
+ if (node->open == FALSE) {
+ /* File is closed */
+ ut_a(node->n_pending == 0);
+
+ /* If too many files are open, close one */
+
+ /* The open files are those with pending i/o plus those in the
+ LRU list. NOTE(review): the limit is tested with ==, while
+ fil_reserve_right_to_open lowers max_n_open; if the sum can
+ then exceed the limit, no file is closed here -- confirm
+ whether that is intended */
+ if (system->n_open_pending + UT_LIST_GET_LEN(system->LRU)
+ == system->max_n_open) {
+
+ ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
+
+ /* Files are added to the start of the LRU list when their
+ i/o completes, so the last node has been idle longest */
+ last_node = UT_LIST_GET_LAST(system->LRU);
+
+ fil_node_close(last_node, system);
+ }
+
+ node->handle = os_file_create(node->name, OS_FILE_OPEN,
+ OS_FILE_AIO, &ret);
+ ut_a(ret);
+
+ node->open = TRUE;
+
+ system->n_open_pending++;
+ node->n_pending = 1;
+
+ /* File was closed: the node was not in the LRU list */
+
+ return;
+ }
+
+ /* File is open */
+ if (node->n_pending == 0) {
+ /* The node is in the LRU list, remove it */
+
+ UT_LIST_REMOVE(LRU, system->LRU, node);
+
+ system->n_open_pending++;
+ node->n_pending = 1;
+ } else {
+ /* There is already a pending i/o-op on the file: the node is
+ not in the LRU list */
+
+ node->n_pending++;
+ }
+}
+
+/************************************************************************
+Updates the data structures when an i/o operation finishes: decrements the
+pending count of the node, and when the count drops to zero, puts the node
+to the start of the LRU list, decrements the count of files with pending
+i/o, and signals the can_open event if that count just dropped below the
+limit of open files. */
+static
+void
+fil_node_complete_io(
+/*=================*/
+	fil_node_t*	node,	/* in: file node */
+	fil_system_t*	system)	/* in: file system */
+{
+	ut_ad(node);
+	ut_ad(system);
+	ut_ad(mutex_own(&(system->mutex)));
+	ut_a(node->n_pending > 0);
+
+	node->n_pending--;
+
+	if (node->n_pending != 0) {
+		/* Other i/os are still pending on the file: the node stays
+		outside the LRU list */
+		return;
+	}
+
+	/* The node must be put back to the LRU list */
+	UT_LIST_ADD_FIRST(LRU, system->LRU, node);
+
+	ut_a(system->n_open_pending > 0);
+	system->n_open_pending--;
+
+	if (system->n_open_pending == system->max_n_open - 1) {
+		/* Before the decrement the limit was reached, so threads
+		may be waiting for the event */
+		os_event_set(system->can_open);
+	}
+}
+
+/************************************************************************
+Reads or writes data. This operation is asynchronous (aio). If sync is TRUE,
+the i/o has completed when this function returns and the pending i/o count
+of the file node is released here; otherwise the count is released in
+fil_aio_wait when the request completes. If the number of files with pending
+i/o equals the limit of open files, waits until the can_open event is
+signaled. */
+
+void
+fil_io(
+/*===*/
+ ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE,
+ ORed to OS_FILE_LOG, if a log i/o
+ and ORed to OS_AIO_SIMULATED_WAKE_LATER
+ if simulated aio and we want to post a
+ batch of i/os; NOTE that a simulated batch
+ may introduce hidden chances of deadlocks,
+ because i/os are not actually handled until
+ all have been posted: use with great
+ caution! */
+ ibool sync, /* in: TRUE if synchronous aio is desired */
+ ulint space_id, /* in: space id */
+ ulint block_offset, /* in: offset in number of blocks */
+ ulint byte_offset, /* in: remainder of offset in bytes; in
+ aio this must be divisible by the OS block
+ size */
+ ulint len, /* in: how many bytes to read; this must
+ not cross a file boundary; in aio this must
+ be a block size multiple */
+ void* buf, /* in/out: buffer where to store read data
+ or from where to write; in aio this must be
+ appropriately aligned */
+ void* message) /* in: message for aio handler if non-sync
+ aio used, else ignored */
+{
+ ulint mode;
+ fil_space_t* space;
+ fil_node_t* node;
+ ulint offset_high;
+ ulint offset_low;
+ fil_system_t* system;
+ os_event_t event;
+ ibool ret;
+ ulint is_log;
+ ulint wake_later;
+
+ /* Split the flag bits off from type, so that type can be compared
+ with OS_FILE_READ and OS_FILE_WRITE below */
+ is_log = type & OS_FILE_LOG;
+ type = type & ~OS_FILE_LOG;
+
+ wake_later = type & OS_AIO_SIMULATED_WAKE_LATER;
+ type = type & ~OS_AIO_SIMULATED_WAKE_LATER;
+
+ ut_ad(byte_offset < UNIV_PAGE_SIZE);
+ ut_ad(buf);
+ ut_ad(len > 0);
+ ut_ad((1 << UNIV_PAGE_SIZE_SHIFT) == UNIV_PAGE_SIZE);
+ ut_ad(fil_validate());
+#ifndef UNIV_LOG_DEBUG
+ /* ibuf bitmap pages must be read in the sync aio mode: */
+ ut_ad(recv_no_ibuf_operations || (type == OS_FILE_WRITE)
+ || !ibuf_bitmap_page(block_offset) || sync || is_log);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!ibuf_inside() || is_log || (type == OS_FILE_WRITE)
+ || ibuf_page(space_id, block_offset));
+#endif
+#endif
+ /* Choose the aio mode: a sync request always uses OS_AIO_SYNC;
+ otherwise a non-log read of an ibuf page uses OS_AIO_IBUF, a log i/o
+ uses OS_AIO_LOG, and everything else uses OS_AIO_NORMAL */
+ if (sync) {
+ mode = OS_AIO_SYNC;
+ } else if ((type == OS_FILE_READ) && !is_log
+ && ibuf_page(space_id, block_offset)) {
+ mode = OS_AIO_IBUF;
+ } else if (is_log) {
+ mode = OS_AIO_LOG;
+ } else {
+ mode = OS_AIO_NORMAL;
+ }
+
+ system = fil_system;
+loop:
+ mutex_enter(&(system->mutex));
+
+ if (system->n_open_pending == system->max_n_open) {
+
+ /* It is not sure we can open the file if it is closed: wait */
+
+ /* The event is reset while the mutex is still held, and waited
+ for only after the mutex has been released */
+ event = system->can_open;
+ os_event_reset(event);
+
+ mutex_exit(&(system->mutex));
+
+ os_event_wait(event);
+
+ goto loop;
+ }
+
+ HASH_SEARCH(hash, system->spaces, space_id, space,
+ space->id == space_id);
+ ut_a(space);
+
+ ut_ad((mode != OS_AIO_IBUF) || (space->purpose == FIL_TABLESPACE));
+
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ /* Walk the file chain to find the file which contains the block;
+ after the loop block_offset is relative to the start of that file.
+ Running off the end of the chain fails the assertion. */
+ for (;;) {
+ ut_a(node);
+
+ if (node->size > block_offset) {
+ /* Found! */
+ break;
+ } else {
+ block_offset -= node->size;
+ node = UT_LIST_GET_NEXT(chain, node);
+ }
+ }
+
+ /* Open file if closed */
+ fil_node_prepare_for_io(node, system, space);
+
+ /* Now we have made the changes in the data structures of system */
+ mutex_exit(&(system->mutex));
+
+ /* Calculate the low 32 bits and the high 32 bits of the file offset */
+
+ offset_high = (block_offset >> (32 - UNIV_PAGE_SIZE_SHIFT));
+ offset_low = ((block_offset << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFF)
+ + byte_offset;
+
+ /* The request, rounded up to whole pages, must fit in the rest of
+ the file: it may not cross a file boundary */
+ ut_a(node->size - block_offset >=
+ (byte_offset + len + (UNIV_PAGE_SIZE - 1)) / UNIV_PAGE_SIZE);
+
+ /* Do aio */
+
+ /* These two checks are made for every request, not only for log
+ i/o */
+ ut_a(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_a((len % OS_FILE_LOG_BLOCK_SIZE) == 0);
+
+ /* Queue the aio request */
+ /* The node is passed along with the request; fil_aio_wait gets it
+ back from the aio handler when the request completes */
+ ret = os_aio(type, mode | wake_later, node->name, node->handle, buf,
+ offset_low, offset_high, len, node, message);
+ ut_a(ret);
+
+ if (mode == OS_AIO_SYNC) {
+ /* The i/o operation is already completed when we return from
+ os_aio: */
+
+ mutex_enter(&(system->mutex));
+
+ fil_node_complete_io(node, system);
+
+ mutex_exit(&(system->mutex));
+
+ ut_ad(fil_validate());
+ }
+}
+
+/************************************************************************
+Reads data from a space to a buffer: a wrapper which calls fil_io with the
+type OS_FILE_READ. Remember that the possible incomplete blocks at the end
+of file are ignored: they are not taken into account when calculating the
+byte offset within a space. */
+
+void
+fil_read(
+/*=====*/
+	ibool	sync,		/* in: TRUE if synchronous aio is desired */
+	ulint	space_id,	/* in: space id */
+	ulint	block_offset,	/* in: offset in number of blocks */
+	ulint	byte_offset,	/* in: remainder of offset in bytes; in aio
+				this must be divisible by the OS block size */
+	ulint	len,		/* in: how many bytes to read; this must not
+				cross a file boundary; in aio this must be a
+				block size multiple */
+	void*	buf,		/* in/out: buffer where to store data read;
+				in aio this must be appropriately aligned */
+	void*	message)	/* in: message for aio handler if non-sync
+				aio used, else ignored */
+{
+	/* All the arguments are passed on unchanged */
+	fil_io(OS_FILE_READ, sync, space_id,
+	       block_offset, byte_offset, len,
+	       buf, message);
+}
+
+/************************************************************************
+Writes data to a space from a buffer: a wrapper which calls fil_io with the
+type OS_FILE_WRITE. Remember that the possible incomplete blocks at the end
+of file are ignored: they are not taken into account when calculating the
+byte offset within a space. */
+
+void
+fil_write(
+/*======*/
+	ibool	sync,		/* in: TRUE if synchronous aio is desired */
+	ulint	space_id,	/* in: space id */
+	ulint	block_offset,	/* in: offset in number of blocks */
+	ulint	byte_offset,	/* in: remainder of offset in bytes; in aio
+				this must be divisible by the OS block size */
+	ulint	len,		/* in: how many bytes to write; this must
+				not cross a file boundary; in aio this must
+				be a block size multiple */
+	void*	buf,		/* in: buffer from which to write; in aio
+				this must be appropriately aligned */
+	void*	message)	/* in: message for aio handler if non-sync
+				aio used, else ignored */
+{
+	/* All the arguments are passed on unchanged */
+	fil_io(OS_FILE_WRITE, sync, space_id,
+	       block_offset, byte_offset, len,
+	       buf, message);
+}
+
+/**************************************************************************
+Waits for an aio operation to complete. This function is used to write the
+handler for completed requests. The aio array of pending requests is divided
+into segments (see os0file.c for more info). The thread specifies which
+segment it wants to wait for. When a request has completed, the pending i/o
+count of its file node is released and the completion routine of the buffer
+pool or of the log is called, depending on the message of the request. */
+
+void
+fil_aio_wait(
+/*=========*/
+ ulint segment) /* in: the number of the segment in the aio
+ array to wait for */
+{
+ ibool ret;
+ fil_node_t* fil_node;
+ fil_system_t* system = fil_system;
+ void* message;
+
+ ut_ad(fil_validate());
+
+ /* The handler gives back the file node and the message which were
+ passed to os_aio when the request was queued in fil_io */
+ if (os_aio_use_native_aio) {
+#ifdef WIN_ASYNC_IO
+ ret = os_aio_windows_handle(segment, 0, &fil_node, &message);
+#elif defined(POSIX_ASYNC_IO)
+ ret = os_aio_posix_handle(segment, &fil_node, &message);
+#else
+ /* Native aio was requested, but no native aio implementation
+ was compiled in */
+ ut_a(0);
+#endif
+ } else {
+ ret = os_aio_simulated_handle(segment, (void**) &fil_node,
+ &message);
+ }
+
+ ut_a(ret);
+
+ mutex_enter(&(system->mutex));
+
+ fil_node_complete_io(fil_node, fil_system);
+
+ mutex_exit(&(system->mutex));
+
+ ut_ad(fil_validate());
+
+ /* Do the i/o handling */
+
+ /* A message which is a buffer pool block belongs to a page i/o;
+ any other message is handed to the log i/o completion routine */
+ if (buf_pool_is_block(message)) {
+
+ buf_page_io_complete(message);
+ } else {
+ log_io_complete(message);
+ }
+}
+
+/**************************************************************************
+Flushes to disk possible writes cached by the OS: calls os_file_flush on
+every file of the space which is open at the moment. Files which are closed
+are skipped. The space must exist. */
+
+void
+fil_flush(
+/*======*/
+ ulint space_id) /* in: file space id (this can be a group of
+ log files or a tablespace of the database) */
+{
+ fil_system_t* system = fil_system;
+ fil_space_t* space;
+ fil_node_t* node;
+ os_file_t file;
+
+ mutex_enter(&(system->mutex));
+
+ HASH_SEARCH(hash, system->spaces, space_id, space,
+ space->id == space_id);
+ ut_a(space);
+
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ while (node) {
+ if (node->open) {
+ /* Copy the handle while the mutex is held, so that the
+ flush can be done without holding the mutex */
+ file = node->handle;
+
+ mutex_exit(&(system->mutex));
+
+ /* Note that it is not certain, when we have
+ released the mutex above, that the file of the
+ handle is still open: we assume that the OS
+ will not crash or trap even if we pass a handle
+ to a closed file below in os_file_flush! */
+
+ os_file_flush(file);
+
+ mutex_enter(&(system->mutex));
+ }
+
+ /* NOTE(review): node is used here after the mutex may have been
+ released above; this presumably relies on the node not being
+ freed in the meantime -- confirm */
+ node = UT_LIST_GET_NEXT(chain, node);
+ }
+
+ mutex_exit(&(system->mutex));
+}
+
+/**************************************************************************
+Flushes to disk writes in file spaces of the given type possibly cached by
+the OS: calls fil_flush for every space in the list of all spaces whose
+purpose equals the given one. */
+
+void
+fil_flush_file_spaces(
+/*==================*/
+	ulint	purpose)	/* in: FIL_TABLESPACE, FIL_LOG */
+{
+	fil_system_t*	system	= fil_system;
+	fil_space_t*	space;
+
+	mutex_enter(&(system->mutex));
+
+	for (space = UT_LIST_GET_FIRST(system->space_list);
+	     space;
+	     space = UT_LIST_GET_NEXT(space_list, space)) {
+
+		if (space->purpose != purpose) {
+			continue;
+		}
+
+		/* fil_flush acquires the mutex itself: release it for the
+		duration of the call */
+		mutex_exit(&(system->mutex));
+
+		fil_flush(space->id);
+
+		mutex_enter(&(system->mutex));
+	}
+
+	mutex_exit(&(system->mutex));
+}
+
+/**********************************************************************
+Checks the consistency of the file system: every file with pending i/o must
+be open, the number of such files must equal the count kept in the system,
+and every file in the LRU list must be open and without pending i/o. A
+violation fails an assertion. */
+
+ibool
+fil_validate(void)
+/*==============*/
+			/* out: TRUE if ok */
+{
+	fil_system_t*	system	= fil_system;
+	fil_space_t*	space;
+	fil_node_t*	fil_node;
+	ulint		n_pending_files	= 0;
+	ulint		cell;
+
+	mutex_enter(&(system->mutex));
+
+	/* Look for spaces in the hash table, one hash cell at a time */
+
+	for (cell = 0; cell < hash_get_n_cells(system->spaces); cell++) {
+
+		for (space = HASH_GET_FIRST(system->spaces, cell);
+		     space != NULL;
+		     space = HASH_GET_NEXT(hash, space)) {
+
+			UT_LIST_VALIDATE(chain, fil_node_t, space->chain);
+
+			for (fil_node = UT_LIST_GET_FIRST(space->chain);
+			     fil_node != NULL;
+			     fil_node = UT_LIST_GET_NEXT(chain, fil_node)) {
+
+				if (fil_node->n_pending > 0) {
+					/* Count files, not i/o-ops */
+					n_pending_files++;
+					ut_a(fil_node->open);
+				}
+			}
+		}
+	}
+
+	ut_a(n_pending_files == system->n_open_pending);
+
+	UT_LIST_VALIDATE(LRU, fil_node_t, system->LRU);
+
+	for (fil_node = UT_LIST_GET_FIRST(system->LRU);
+	     fil_node != NULL;
+	     fil_node = UT_LIST_GET_NEXT(LRU, fil_node)) {
+
+		ut_a(fil_node->n_pending == 0);
+		ut_a(fil_node->open);
+	}
+
+	mutex_exit(&(system->mutex));
+
+	return(TRUE);
+}
+
+/************************************************************************
+Returns TRUE if file address is undefined, i.e., if its page field is
+FIL_NULL. The other fields of the address are not looked at. */
+ibool
+fil_addr_is_null(
+/*=============*/
+				/* out: TRUE if undefined */
+	fil_addr_t	addr)	/* in: address */
+{
+	return((addr.page == FIL_NULL) ? TRUE : FALSE);
+}
+
+/************************************************************************
+Accessor functions for a file page */
+
+/* Returns the 4-byte field stored at the offset FIL_PAGE_PREV of a page */
+ulint
+fil_page_get_prev(byte* page)
+{
+	ulint	prev_field;
+
+	prev_field = mach_read_from_4(page + FIL_PAGE_PREV);
+
+	return(prev_field);
+}
+
+/* Returns the 4-byte field stored at the offset FIL_PAGE_NEXT of a page */
+ulint
+fil_page_get_next(byte* page)
+{
+	ulint	next_field;
+
+	next_field = mach_read_from_4(page + FIL_PAGE_NEXT);
+
+	return(next_field);
+}
+
+/*************************************************************************
+Sets the file page type: writes the type as a 2-byte field at the offset
+FIL_PAGE_TYPE of the page. */
+
+void
+fil_page_set_type(
+/*==============*/
+	byte*	page,	/* in: file page */
+	ulint	type)	/* in: type */
+{
+	ut_ad(page);
+
+	/* NOTE(review): this debug assertion used to test FIL_PAGE_INDEX
+	twice, ORed together; the second operand was presumably meant to
+	name another page type. The duplicate is dropped here, which keeps
+	the check equivalent; add the intended type once it is confirmed. */
+	ut_ad(type == FIL_PAGE_INDEX);
+
+	mach_write_to_2(page + FIL_PAGE_TYPE, type);
+}
+
+/*************************************************************************
+Gets the file page type: reads the 2-byte field at the offset FIL_PAGE_TYPE
+of the page. */
+
+ulint
+fil_page_get_type(
+/*==============*/
+			/* out: type; NOTE that if the type has not been
+			written to page, the return value not defined */
+	byte*	page)	/* in: file page */
+{
+	ulint	type_field;
+
+	ut_ad(page);
+
+	type_field = mach_read_from_2(page + FIL_PAGE_TYPE);
+
+	return(type_field);
+}