diff options
Diffstat (limited to 'storage/innobase/os/os0file.c')
-rw-r--r-- | storage/innobase/os/os0file.c | 4276 |
1 files changed, 4276 insertions, 0 deletions
diff --git a/storage/innobase/os/os0file.c b/storage/innobase/os/os0file.c new file mode 100644 index 00000000000..d3bd6465f5f --- /dev/null +++ b/storage/innobase/os/os0file.c @@ -0,0 +1,4276 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ +/*********************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2009, Percona Inc. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +***********************************************************************/ + +/**************************************************//** +@file os/os0file.c +The interface to the operating system file i/o primitives + +Created 10/21/1995 Heikki Tuuri +*******************************************************/ + +#include "os0file.h" +#include "ut0mem.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "fil0fil.h" +#include "buf0buf.h" +#ifndef UNIV_HOTBACKUP +# include "os0sync.h" +# include "os0thread.h" +#else /* !UNIV_HOTBACKUP */ +# ifdef __WIN__ +/* Add includes for the _stat() call to compile on Windows */ +# include <sys/types.h> +# include <sys/stat.h> +# include <errno.h> +# endif /* __WIN__ */ +#endif /* !UNIV_HOTBACKUP */ + +/* This specifies the file permissions InnoDB uses when it creates files in +Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to +my_umask */ + +#ifndef __WIN__ +/** Umask for creating files */ +UNIV_INTERN ulint os_innodb_umask + = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; +#else +/** Umask for creating files */ +UNIV_INTERN ulint os_innodb_umask = 0; +#endif + +#ifdef UNIV_DO_FLUSH +/* If the following is set to TRUE, we do not call os_file_flush in every +os_file_write. We can set this TRUE when the doublewrite buffer is used. */ +UNIV_INTERN ibool os_do_not_call_flush_at_each_write = FALSE; +#else +/* We do not call os_file_flush in every os_file_write. 
*/ +#endif /* UNIV_DO_FLUSH */ + +#ifndef UNIV_HOTBACKUP +/* We use these mutexes to protect lseek + file i/o operation, if the +OS does not provide an atomic pread or pwrite, or similar */ +#define OS_FILE_N_SEEK_MUTEXES 16 +UNIV_INTERN os_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES]; + +/* In simulated aio, merge at most this many consecutive i/os */ +#define OS_AIO_MERGE_N_CONSECUTIVE 64 + +/** If this flag is TRUE, then we will use the native aio of the +OS (provided we compiled Innobase with it in), otherwise we will +use simulated aio we build below with threads */ + +UNIV_INTERN ibool os_aio_use_native_aio = FALSE; + +/** Flag: enable debug printout for asynchronous i/o */ +UNIV_INTERN ibool os_aio_print_debug = FALSE; + +/** The asynchronous i/o array slot structure */ +typedef struct os_aio_slot_struct os_aio_slot_t; + +/** The asynchronous i/o array slot structure */ +struct os_aio_slot_struct{ + ibool is_read; /*!< TRUE if a read operation */ + ulint pos; /*!< index of the slot in the aio + array */ + ibool reserved; /*!< TRUE if this slot is reserved */ + time_t reservation_time;/*!< time when reserved */ + ulint len; /*!< length of the block to read or + write */ + byte* buf; /*!< buffer used in i/o */ + ulint type; /*!< OS_FILE_READ or OS_FILE_WRITE */ + ulint offset; /*!< 32 low bits of file offset in + bytes */ + ulint offset_high; /*!< 32 high bits of file offset */ + os_file_t file; /*!< file where to read or write */ + const char* name; /*!< file name or path */ + ibool io_already_done;/*!< used only in simulated aio: + TRUE if the physical i/o already + made and only the slot message + needs to be passed to the caller + of os_aio_simulated_handle */ + fil_node_t* message1; /*!< message which is given by the */ + void* message2; /*!< the requester of an aio operation + and which can be used to identify + which pending aio operation was + completed */ +#ifdef WIN_ASYNC_IO + os_event_t event; /*!< event object we need in the + OVERLAPPED 
struct */ + OVERLAPPED control; /*!< Windows control block for the + aio request */ +#endif +}; + +/** The asynchronous i/o array structure */ +typedef struct os_aio_array_struct os_aio_array_t; + +/** The asynchronous i/o array structure */ +struct os_aio_array_struct{ + os_mutex_t mutex; /*!< the mutex protecting the aio array */ + os_event_t not_full; + /*!< The event which is set to the + signaled state when there is space in + the aio outside the ibuf segment */ + os_event_t is_empty; + /*!< The event which is set to the + signaled state when there are no + pending i/os in this array */ + ulint n_slots;/*!< Total number of slots in the aio + array. This must be divisible by + n_threads. */ + ulint n_segments; + /*!< Number of segments in the aio + array of pending aio requests. A + thread can wait separately for any one + of the segments. */ + ulint n_reserved; + /*!< Number of reserved slots in the + aio array outside the ibuf segment */ + os_aio_slot_t* slots; /*!< Pointer to the slots in the array */ +#ifdef __WIN__ + os_native_event_t* native_events; + /*!< Pointer to an array of OS native + event handles where we copied the + handles from slots, in the same + order. This can be used in + WaitForMultipleObjects; used only in + Windows */ +#endif +}; + +/** Array of events used in simulated aio */ +static os_event_t* os_aio_segment_wait_events = NULL; + +/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These +are NULL when the module has not yet been initialized. @{ */ +static os_aio_array_t* os_aio_read_array = NULL; /*!< Reads */ +static os_aio_array_t* os_aio_write_array = NULL; /*!< Writes */ +static os_aio_array_t* os_aio_ibuf_array = NULL; /*!< Insert buffer */ +static os_aio_array_t* os_aio_log_array = NULL; /*!< Redo log */ +static os_aio_array_t* os_aio_sync_array = NULL; /*!< Synchronous I/O */ +/* @} */ + +/** Number of asynchronous I/O segments. Set by os_aio_init(). 
*/ +static ulint os_aio_n_segments = ULINT_UNDEFINED; + +/** If the following is TRUE, read i/o handler threads try to +wait until a batch of new read requests have been posted */ +static ibool os_aio_recommend_sleep_for_read_threads = FALSE; +#endif /* !UNIV_HOTBACKUP */ + +UNIV_INTERN ulint os_n_file_reads = 0; +UNIV_INTERN ulint os_bytes_read_since_printout = 0; +UNIV_INTERN ulint os_n_file_writes = 0; +UNIV_INTERN ulint os_n_fsyncs = 0; +UNIV_INTERN ulint os_n_file_reads_old = 0; +UNIV_INTERN ulint os_n_file_writes_old = 0; +UNIV_INTERN ulint os_n_fsyncs_old = 0; +UNIV_INTERN time_t os_last_printout; + +UNIV_INTERN ibool os_has_said_disk_full = FALSE; + +#ifndef UNIV_HOTBACKUP +/** The mutex protecting the following counts of pending I/O operations */ +static os_mutex_t os_file_count_mutex; +#endif /* !UNIV_HOTBACKUP */ +/** Number of pending os_file_pread() operations */ +UNIV_INTERN ulint os_file_n_pending_preads = 0; +/** Number of pending os_file_pwrite() operations */ +UNIV_INTERN ulint os_file_n_pending_pwrites = 0; +/** Number of pending write operations */ +UNIV_INTERN ulint os_n_pending_writes = 0; +/** Number of pending read operations */ +UNIV_INTERN ulint os_n_pending_reads = 0; + +/***********************************************************************//** +Gets the operating system version. Currently works only on Windows. 
+@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */ +UNIV_INTERN +ulint +os_get_os_version(void) +/*===================*/ +{ +#ifdef __WIN__ + OSVERSIONINFO os_info; + + os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO); + + ut_a(GetVersionEx(&os_info)); + + if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) { + return(OS_WIN31); + } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) { + return(OS_WIN95); + } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) { + if (os_info.dwMajorVersion <= 4) { + return(OS_WINNT); + } else { + return(OS_WIN2000); + } + } else { + ut_error; + return(0); + } +#else + ut_error; + + return(0); +#endif +} + +/***********************************************************************//** +Retrieves the last error number if an error occurs in a file io function. +The number should be retrieved before any other OS calls (because they may +overwrite the error number). If the number is not known to this program, +the OS error number + 100 is returned. 
+@return error number, or OS error number + 100 */ +UNIV_INTERN +ulint +os_file_get_last_error( +/*===================*/ + ibool report_all_errors) /*!< in: TRUE if we want an error message + printed of all errors */ +{ + ulint err; + +#ifdef __WIN__ + + err = (ulint) GetLastError(); + + if (report_all_errors + || (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS)) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Operating system error number %lu" + " in a file operation.\n", (ulong) err); + + if (err == ERROR_PATH_NOT_FOUND) { + fprintf(stderr, + "InnoDB: The error means the system" + " cannot find the path specified.\n"); + + if (srv_is_being_started) { + fprintf(stderr, + "InnoDB: If you are installing InnoDB," + " remember that you must create\n" + "InnoDB: directories yourself, InnoDB" + " does not create them.\n"); + } + } else if (err == ERROR_ACCESS_DENIED) { + fprintf(stderr, + "InnoDB: The error means mysqld does not have" + " the access rights to\n" + "InnoDB: the directory. It may also be" + " you have created a subdirectory\n" + "InnoDB: of the same name as a data file.\n"); + } else if (err == ERROR_SHARING_VIOLATION + || err == ERROR_LOCK_VIOLATION) { + fprintf(stderr, + "InnoDB: The error means that another program" + " is using InnoDB's files.\n" + "InnoDB: This might be a backup or antivirus" + " software or another instance\n" + "InnoDB: of MySQL." 
+ " Please close it to get rid of this error.\n"); + } else { + fprintf(stderr, + "InnoDB: Some operating system error numbers" + " are described at\n" + "InnoDB: " + REFMAN + "operating-system-error-codes.html\n"); + } + } + + fflush(stderr); + + if (err == ERROR_FILE_NOT_FOUND) { + return(OS_FILE_NOT_FOUND); + } else if (err == ERROR_DISK_FULL) { + return(OS_FILE_DISK_FULL); + } else if (err == ERROR_FILE_EXISTS) { + return(OS_FILE_ALREADY_EXISTS); + } else if (err == ERROR_SHARING_VIOLATION + || err == ERROR_LOCK_VIOLATION) { + return(OS_FILE_SHARING_VIOLATION); + } else { + return(100 + err); + } +#else + err = (ulint) errno; + + if (report_all_errors + || (err != ENOSPC && err != EEXIST)) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Operating system error number %lu" + " in a file operation.\n", (ulong) err); + + if (err == ENOENT) { + fprintf(stderr, + "InnoDB: The error means the system" + " cannot find the path specified.\n"); + + if (srv_is_being_started) { + fprintf(stderr, + "InnoDB: If you are installing InnoDB," + " remember that you must create\n" + "InnoDB: directories yourself, InnoDB" + " does not create them.\n"); + } + } else if (err == EACCES) { + fprintf(stderr, + "InnoDB: The error means mysqld does not have" + " the access rights to\n" + "InnoDB: the directory.\n"); + } else { + if (strerror((int)err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %lu" + " means '%s'.\n", + err, strerror((int)err)); + } + + fprintf(stderr, + "InnoDB: Some operating system" + " error numbers are described at\n" + "InnoDB: " + REFMAN + "operating-system-error-codes.html\n"); + } + } + + fflush(stderr); + + if (err == ENOSPC) { + return(OS_FILE_DISK_FULL); + } else if (err == ENOENT) { + return(OS_FILE_NOT_FOUND); + } else if (err == EEXIST) { + return(OS_FILE_ALREADY_EXISTS); + } else if (err == EXDEV || err == ENOTDIR || err == EISDIR) { + return(OS_FILE_PATH_ERROR); + } else { + return(100 + err); + } +#endif +} + 
+/****************************************************************//** +Does error handling when a file operation fails. +Conditionally exits (calling exit(3)) based on should_exit value and the +error type +@return TRUE if we should retry the operation */ +static +ibool +os_file_handle_error_cond_exit( +/*===========================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation, /*!< in: operation */ + ibool should_exit) /*!< in: call exit(3) if unknown error + and this parameter is TRUE */ +{ + ulint err; + + err = os_file_get_last_error(FALSE); + + if (err == OS_FILE_DISK_FULL) { + /* We only print a warning about disk full once */ + + if (os_has_said_disk_full) { + + return(FALSE); + } + + if (name) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Encountered a problem with" + " file %s\n", name); + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Disk is full. Try to clean the disk" + " to free space.\n"); + + os_has_said_disk_full = TRUE; + + fflush(stderr); + + return(FALSE); + } else if (err == OS_FILE_AIO_RESOURCES_RESERVED) { + + return(TRUE); + } else if (err == OS_FILE_ALREADY_EXISTS + || err == OS_FILE_PATH_ERROR) { + + return(FALSE); + } else if (err == OS_FILE_SHARING_VIOLATION) { + + os_thread_sleep(10000000); /* 10 sec */ + return(TRUE); + } else { + if (name) { + fprintf(stderr, "InnoDB: File name %s\n", name); + } + + fprintf(stderr, "InnoDB: File operation call: '%s'.\n", + operation); + + if (should_exit) { + fprintf(stderr, "InnoDB: Cannot continue operation.\n"); + + fflush(stderr); + + exit(1); + } + } + + return(FALSE); +} + +/****************************************************************//** +Does error handling when a file operation fails. 
+@return TRUE if we should retry the operation */ +static +ibool +os_file_handle_error( +/*=================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation)/*!< in: operation */ +{ + /* exit in case of unknown error */ + return(os_file_handle_error_cond_exit(name, operation, TRUE)); +} + +/****************************************************************//** +Does error handling when a file operation fails. +@return TRUE if we should retry the operation */ +static +ibool +os_file_handle_error_no_exit( +/*=========================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation)/*!< in: operation */ +{ + /* don't exit in case of unknown error */ + return(os_file_handle_error_cond_exit(name, operation, FALSE)); +} + +#undef USE_FILE_LOCK +#define USE_FILE_LOCK +#if defined(UNIV_HOTBACKUP) || defined(__WIN__) || defined(__NETWARE__) +/* InnoDB Hot Backup does not lock the data files. + * On Windows, mandatory locking is used. + */ +# undef USE_FILE_LOCK +#endif +#ifdef USE_FILE_LOCK +/****************************************************************//** +Obtain an exclusive lock on a file. +@return 0 on success */ +static +int +os_file_lock( +/*=========*/ + int fd, /*!< in: file descriptor */ + const char* name) /*!< in: file name */ +{ + struct flock lk; + lk.l_type = F_WRLCK; + lk.l_whence = SEEK_SET; + lk.l_start = lk.l_len = 0; + if (fcntl(fd, F_SETLK, &lk) == -1) { + fprintf(stderr, + "InnoDB: Unable to lock %s, error: %d\n", name, errno); + + if (errno == EAGAIN || errno == EACCES) { + fprintf(stderr, + "InnoDB: Check that you do not already have" + " another mysqld process\n" + "InnoDB: using the same InnoDB data" + " or log files.\n"); + } + + return(-1); + } + + return(0); +} +#endif /* USE_FILE_LOCK */ + +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Creates the seek mutexes used in positioned reads and writes. 
*/ +UNIV_INTERN +void +os_io_init_simple(void) +/*===================*/ +{ + ulint i; + + os_file_count_mutex = os_mutex_create(NULL); + + for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) { + os_file_seek_mutexes[i] = os_mutex_create(NULL); + } +} + +/***********************************************************************//** +Creates a temporary file. This function is like tmpfile(3), but +the temporary file is created in the MySQL temporary directory. +On Netware, this function is like tmpfile(3), because the C run-time +library of Netware does not expose the delete-on-close flag. +@return temporary file handle, or NULL on error */ +UNIV_INTERN +FILE* +os_file_create_tmpfile(void) +/*========================*/ +{ +#ifdef __NETWARE__ + FILE* file = tmpfile(); +#else /* __NETWARE__ */ + FILE* file = NULL; + int fd = innobase_mysql_tmpfile(); + + if (fd >= 0) { + file = fdopen(fd, "w+b"); + } +#endif /* __NETWARE__ */ + + if (!file) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: unable to create temporary file;" + " errno: %d\n", errno); +#ifndef __NETWARE__ + if (fd >= 0) { + close(fd); + } +#endif /* !__NETWARE__ */ + } + + return(file); +} +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************************//** +The os_file_opendir() function opens a directory stream corresponding to the +directory named by the dirname argument. The directory stream is positioned +at the first entry. In both Unix and Windows we automatically skip the '.' +and '..' items at the start of the directory listing. 
+@return directory stream, NULL if error */ +UNIV_INTERN +os_file_dir_t +os_file_opendir( +/*============*/ + const char* dirname, /*!< in: directory name; it must not + contain a trailing '\' or '/' */ + ibool error_is_fatal) /*!< in: TRUE if we should treat an + error as a fatal error; if we try to + open symlinks then we do not wish a + fatal error if it happens not to be + a directory */ +{ + os_file_dir_t dir; +#ifdef __WIN__ + LPWIN32_FIND_DATA lpFindFileData; + char path[OS_FILE_MAX_PATH + 3]; + + ut_a(strlen(dirname) < OS_FILE_MAX_PATH); + + strcpy(path, dirname); + strcpy(path + strlen(path), "\\*"); + + /* Note that in Windows opening the 'directory stream' also retrieves + the first entry in the directory. Since it is '.', that is no problem, + as we will skip over the '.' and '..' entries anyway. */ + + lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA)); + + dir = FindFirstFile((LPCTSTR) path, lpFindFileData); + + ut_free(lpFindFileData); + + if (dir == INVALID_HANDLE_VALUE) { + + if (error_is_fatal) { + os_file_handle_error(dirname, "opendir"); + } + + return(NULL); + } + + return(dir); +#else + dir = opendir(dirname); + + if (dir == NULL && error_is_fatal) { + os_file_handle_error(dirname, "opendir"); + } + + return(dir); +#endif +} + +/***********************************************************************//** +Closes a directory stream. +@return 0 if success, -1 if failure */ +UNIV_INTERN +int +os_file_closedir( +/*=============*/ + os_file_dir_t dir) /*!< in: directory stream */ +{ +#ifdef __WIN__ + BOOL ret; + + ret = FindClose(dir); + + if (!ret) { + os_file_handle_error_no_exit(NULL, "closedir"); + + return(-1); + } + + return(0); +#else + int ret; + + ret = closedir(dir); + + if (ret) { + os_file_handle_error_no_exit(NULL, "closedir"); + } + + return(ret); +#endif +} + +/***********************************************************************//** +This function returns information of the next file in the directory. We jump +over the '.' 
and '..' entries in the directory. +@return 0 if ok, -1 if error, 1 if at the end of the directory */ +UNIV_INTERN +int +os_file_readdir_next_file( +/*======================*/ + const char* dirname,/*!< in: directory name or path */ + os_file_dir_t dir, /*!< in: directory stream */ + os_file_stat_t* info) /*!< in/out: buffer where the info is returned */ +{ +#ifdef __WIN__ + LPWIN32_FIND_DATA lpFindFileData; + BOOL ret; + + lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA)); +next_file: + ret = FindNextFile(dir, lpFindFileData); + + if (ret) { + ut_a(strlen((char *) lpFindFileData->cFileName) + < OS_FILE_MAX_PATH); + + if (strcmp((char *) lpFindFileData->cFileName, ".") == 0 + || strcmp((char *) lpFindFileData->cFileName, "..") == 0) { + + goto next_file; + } + + strcpy(info->name, (char *) lpFindFileData->cFileName); + + info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow) + + (((ib_int64_t)(lpFindFileData->nFileSizeHigh)) + << 32); + + if (lpFindFileData->dwFileAttributes + & FILE_ATTRIBUTE_REPARSE_POINT) { + /* TODO: test Windows symlinks */ + /* TODO: MySQL has apparently its own symlink + implementation in Windows, dbname.sym can + redirect a database directory: + REFMAN "windows-symbolic-links.html" */ + info->type = OS_FILE_TYPE_LINK; + } else if (lpFindFileData->dwFileAttributes + & FILE_ATTRIBUTE_DIRECTORY) { + info->type = OS_FILE_TYPE_DIR; + } else { + /* It is probably safest to assume that all other + file types are normal. Better to check them rather + than blindly skip them. 
*/ + + info->type = OS_FILE_TYPE_FILE; + } + } + + ut_free(lpFindFileData); + + if (ret) { + return(0); + } else if (GetLastError() == ERROR_NO_MORE_FILES) { + + return(1); + } else { + os_file_handle_error_no_exit(dirname, + "readdir_next_file"); + return(-1); + } +#else + struct dirent* ent; + char* full_path; + int ret; + struct stat statinfo; +#ifdef HAVE_READDIR_R + char dirent_buf[sizeof(struct dirent) + + _POSIX_PATH_MAX + 100]; + /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as + the max file name len; but in most standards, the + length is NAME_MAX; we add 100 to be even safer */ +#endif + +next_file: + +#ifdef HAVE_READDIR_R + ret = readdir_r(dir, (struct dirent*)dirent_buf, &ent); + + if (ret != 0) { + fprintf(stderr, + "InnoDB: cannot read directory %s, error %lu\n", + dirname, (ulong)ret); + + return(-1); + } + + if (ent == NULL) { + /* End of directory */ + + return(1); + } + + ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1); +#else + ent = readdir(dir); + + if (ent == NULL) { + + return(1); + } +#endif + ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH); + + if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) { + + goto next_file; + } + + strcpy(info->name, ent->d_name); + + full_path = ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10); + + sprintf(full_path, "%s/%s", dirname, ent->d_name); + + ret = stat(full_path, &statinfo); + + if (ret) { + os_file_handle_error_no_exit(full_path, "stat"); + + ut_free(full_path); + + return(-1); + } + + info->size = (ib_int64_t)statinfo.st_size; + + if (S_ISDIR(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_DIR; + } else if (S_ISLNK(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_LINK; + } else if (S_ISREG(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_FILE; + } else { + info->type = OS_FILE_TYPE_UNKNOWN; + } + + ut_free(full_path); + + return(0); +#endif +} + +/*****************************************************************//** +This function attempts to create a directory named 
pathname. The new directory +gets default permissions. On Unix the permissions are (0770 & ~umask). If the +directory exists already, nothing is done and the call succeeds, unless the +fail_if_exists arguments is true. +@return TRUE if call succeeds, FALSE on error */ +UNIV_INTERN +ibool +os_file_create_directory( +/*=====================*/ + const char* pathname, /*!< in: directory name as + null-terminated string */ + ibool fail_if_exists) /*!< in: if TRUE, pre-existing directory + is treated as an error. */ +{ +#ifdef __WIN__ + BOOL rcode; + + rcode = CreateDirectory((LPCTSTR) pathname, NULL); + if (!(rcode != 0 + || (GetLastError() == ERROR_ALREADY_EXISTS + && !fail_if_exists))) { + /* failure */ + os_file_handle_error(pathname, "CreateDirectory"); + + return(FALSE); + } + + return (TRUE); +#else + int rcode; + + rcode = mkdir(pathname, 0770); + + if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) { + /* failure */ + os_file_handle_error(pathname, "mkdir"); + + return(FALSE); + } + + return (TRUE); +#endif +} + +/****************************************************************//** +A simple function to open or create a file. 
+@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INTERN +os_file_t +os_file_create_simple( +/*==================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file is + opened (if does not exist, error), or + OS_FILE_CREATE if a new file is created + (if exists, error), or + OS_FILE_CREATE_PATH if new file + (if exists, error) and subdirectories along + its path are created (if needed)*/ + ulint access_type,/*!< in: OS_FILE_READ_ONLY or + OS_FILE_READ_WRITE */ + ibool* success)/*!< out: TRUE if succeed, FALSE if error */ +{ +#ifdef __WIN__ + os_file_t file; + DWORD create_flag; + DWORD access; + DWORD attributes = 0; + ibool retry; + +try_again: + ut_a(name); + + if (create_mode == OS_FILE_OPEN) { + create_flag = OPEN_EXISTING; + } else if (create_mode == OS_FILE_CREATE) { + create_flag = CREATE_NEW; + } else if (create_mode == OS_FILE_CREATE_PATH) { + /* create subdirs along the path if needed */ + *success = os_file_create_subdirs_if_needed(name); + if (!*success) { + ut_error; + } + create_flag = CREATE_NEW; + create_mode = OS_FILE_CREATE; + } else { + create_flag = 0; + ut_error; + } + + if (access_type == OS_FILE_READ_ONLY) { + access = GENERIC_READ; + } else if (access_type == OS_FILE_READ_WRITE) { + access = GENERIC_READ | GENERIC_WRITE; + } else { + access = 0; + ut_error; + } + + file = CreateFile((LPCTSTR) name, + access, + FILE_SHARE_READ | FILE_SHARE_WRITE, + /* file can be read and written also + by other processes */ + NULL, /* default security attributes */ + create_flag, + attributes, + NULL); /*!< no template file */ + + if (file == INVALID_HANDLE_VALUE) { + *success = FALSE; + + retry = os_file_handle_error(name, + create_mode == OS_FILE_OPEN ? 
+ "open" : "create"); + if (retry) { + goto try_again; + } + } else { + *success = TRUE; + } + + return(file); +#else /* __WIN__ */ + os_file_t file; + int create_flag; + ibool retry; + +try_again: + ut_a(name); + + if (create_mode == OS_FILE_OPEN) { + if (access_type == OS_FILE_READ_ONLY) { + create_flag = O_RDONLY; + } else { + create_flag = O_RDWR; + } + } else if (create_mode == OS_FILE_CREATE) { + create_flag = O_RDWR | O_CREAT | O_EXCL; + } else if (create_mode == OS_FILE_CREATE_PATH) { + /* create subdirs along the path if needed */ + *success = os_file_create_subdirs_if_needed(name); + if (!*success) { + return (-1); + } + create_flag = O_RDWR | O_CREAT | O_EXCL; + create_mode = OS_FILE_CREATE; + } else { + create_flag = 0; + ut_error; + } + + if (create_mode == OS_FILE_CREATE) { + file = open(name, create_flag, S_IRUSR | S_IWUSR + | S_IRGRP | S_IWGRP); + } else { + file = open(name, create_flag); + } + + if (file == -1) { + *success = FALSE; + + retry = os_file_handle_error(name, + create_mode == OS_FILE_OPEN ? + "open" : "create"); + if (retry) { + goto try_again; + } +#ifdef USE_FILE_LOCK + } else if (access_type == OS_FILE_READ_WRITE + && os_file_lock(file, name)) { + *success = FALSE; + close(file); + file = -1; +#endif + } else { + *success = TRUE; + } + + return(file); +#endif /* __WIN__ */ +} + +/****************************************************************//** +A simple function to open or create a file. 
+@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INTERN +os_file_t +os_file_create_simple_no_error_handling( +/*====================================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file + is opened (if does not exist, error), or + OS_FILE_CREATE if a new file is created + (if exists, error) */ + ulint access_type,/*!< in: OS_FILE_READ_ONLY, + OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option is + used by a backup program reading the file */ + ibool* success)/*!< out: TRUE if succeed, FALSE if error */ +{ +#ifdef __WIN__ + os_file_t file; + DWORD create_flag; + DWORD access; + DWORD attributes = 0; + DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE; + + ut_a(name); + + if (create_mode == OS_FILE_OPEN) { + create_flag = OPEN_EXISTING; + } else if (create_mode == OS_FILE_CREATE) { + create_flag = CREATE_NEW; + } else { + create_flag = 0; + ut_error; + } + + if (access_type == OS_FILE_READ_ONLY) { + access = GENERIC_READ; + } else if (access_type == OS_FILE_READ_WRITE) { + access = GENERIC_READ | GENERIC_WRITE; + } else if (access_type == OS_FILE_READ_ALLOW_DELETE) { + access = GENERIC_READ; + share_mode = FILE_SHARE_DELETE | FILE_SHARE_READ + | FILE_SHARE_WRITE; /*!< A backup program has to give + mysqld the maximum freedom to + do what it likes with the + file */ + } else { + access = 0; + ut_error; + } + + file = CreateFile((LPCTSTR) name, + access, + share_mode, + NULL, /* default security attributes */ + create_flag, + attributes, + NULL); /*!< no template file */ + + if (file == INVALID_HANDLE_VALUE) { + *success = FALSE; + } else { + *success = TRUE; + } + + return(file); +#else /* __WIN__ */ + os_file_t file; + int create_flag; + + ut_a(name); + + if (create_mode == OS_FILE_OPEN) { + if (access_type == OS_FILE_READ_ONLY) { + create_flag = O_RDONLY; + } 
else { + create_flag = O_RDWR; + } + } else if (create_mode == OS_FILE_CREATE) { + create_flag = O_RDWR | O_CREAT | O_EXCL; + } else { + create_flag = 0; + ut_error; + } + + if (create_mode == OS_FILE_CREATE) { + file = open(name, create_flag, S_IRUSR | S_IWUSR + | S_IRGRP | S_IWGRP); + } else { + file = open(name, create_flag); + } + + if (file == -1) { + *success = FALSE; +#ifdef USE_FILE_LOCK + } else if (access_type == OS_FILE_READ_WRITE + && os_file_lock(file, name)) { + *success = FALSE; + close(file); + file = -1; +#endif + } else { + *success = TRUE; + } + + return(file); +#endif /* __WIN__ */ +} + +/****************************************************************//** +Tries to disable OS caching on an opened file descriptor. */ +UNIV_INTERN +void +os_file_set_nocache( +/*================*/ + int fd, /*!< in: file descriptor to alter */ + const char* file_name, /*!< in: file name, used in the + diagnostic message */ + const char* operation_name) /*!< in: "open" or "create"; used in the + diagnostic message */ +{ + /* some versions of Solaris may not have DIRECTIO_ON */ +#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) + if (directio(fd, DIRECTIO_ON) == -1) { + int errno_save; + errno_save = (int)errno; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Failed to set DIRECTIO_ON " + "on file %s: %s: %s, continuing anyway\n", + file_name, operation_name, strerror(errno_save)); + } +#elif defined(O_DIRECT) + if (fcntl(fd, F_SETFL, O_DIRECT) == -1) { + int errno_save; + errno_save = (int)errno; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Failed to set O_DIRECT " + "on file %s: %s: %s, continuing anyway\n", + file_name, operation_name, strerror(errno_save)); + if (errno_save == EINVAL) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: O_DIRECT is known to result in " + "'Invalid argument' on Linux on tmpfs, " + "see MySQL Bug#26662\n"); + } + } +#endif +} + 
+/****************************************************************//** +Opens an existing file or creates a new. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INTERN +os_file_t +os_file_create( +/*===========*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file + is opened (if does not exist, error), or + OS_FILE_CREATE if a new file is created + (if exists, error), + OS_FILE_OVERWRITE if a new file is created + or an old overwritten; + OS_FILE_OPEN_RAW, if a raw device or disk + partition should be opened */ + ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous, + non-buffered i/o is desired, + OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. variables whether we really use + async i/o or unbuffered i/o: look in the + function source code for the exact rules */ + ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ + ibool* success)/*!< out: TRUE if succeed, FALSE if error */ +{ +#ifdef __WIN__ + os_file_t file; + DWORD share_mode = FILE_SHARE_READ; + DWORD create_flag; + DWORD attributes; + ibool retry; +try_again: + ut_a(name); + + if (create_mode == OS_FILE_OPEN_RAW) { + create_flag = OPEN_EXISTING; + share_mode = FILE_SHARE_WRITE; + } else if (create_mode == OS_FILE_OPEN + || create_mode == OS_FILE_OPEN_RETRY) { + create_flag = OPEN_EXISTING; + } else if (create_mode == OS_FILE_CREATE) { + create_flag = CREATE_NEW; + } else if (create_mode == OS_FILE_OVERWRITE) { + create_flag = CREATE_ALWAYS; + } else { + create_flag = 0; + ut_error; + } + + if (purpose == OS_FILE_AIO) { + /* If specified, use asynchronous (overlapped) io and no + buffering of writes in the OS */ + attributes = 0; +#ifdef WIN_ASYNC_IO + if (os_aio_use_native_aio) { + attributes = attributes | FILE_FLAG_OVERLAPPED; + } +#endif +#ifdef UNIV_NON_BUFFERED_IO + if (type == 
OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) { + /* Do not use unbuffered i/o to log files because + value 2 denotes that we do not flush the log at every + commit, but only once per second */ + } else if (srv_win_file_flush_method + == SRV_WIN_IO_UNBUFFERED) { + attributes = attributes | FILE_FLAG_NO_BUFFERING; + } +#endif + } else if (purpose == OS_FILE_NORMAL) { + attributes = 0; +#ifdef UNIV_NON_BUFFERED_IO + if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) { + /* Do not use unbuffered i/o to log files because + value 2 denotes that we do not flush the log at every + commit, but only once per second */ + } else if (srv_win_file_flush_method + == SRV_WIN_IO_UNBUFFERED) { + attributes = attributes | FILE_FLAG_NO_BUFFERING; + } +#endif + } else { + attributes = 0; + ut_error; + } + + file = CreateFile((LPCTSTR) name, + GENERIC_READ | GENERIC_WRITE, /* read and write + access */ + share_mode, /* File can be read also by other + processes; we must give the read + permission because of ibbackup. We do + not give the write permission to + others because if one would succeed to + start 2 instances of mysqld on the + SAME files, that could cause severe + database corruption! When opening + raw disk partitions, Microsoft manuals + say that we must give also the write + permission. */ + NULL, /* default security attributes */ + create_flag, + attributes, + NULL); /*!< no template file */ + + if (file == INVALID_HANDLE_VALUE) { + *success = FALSE; + + /* When srv_file_per_table is on, file creation failure may not + be critical to the whole instance. Do not crash the server in + case of unknown errors. */ + if (srv_file_per_table) { + retry = os_file_handle_error_no_exit(name, + create_mode == OS_FILE_CREATE ? + "create" : "open"); + } else { + retry = os_file_handle_error(name, + create_mode == OS_FILE_CREATE ? 
+ "create" : "open"); + } + + if (retry) { + goto try_again; + } + } else { + *success = TRUE; + } + + return(file); +#else /* __WIN__ */ + os_file_t file; + int create_flag; + ibool retry; + const char* mode_str = NULL; + const char* type_str = NULL; + const char* purpose_str = NULL; + +try_again: + ut_a(name); + + if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW + || create_mode == OS_FILE_OPEN_RETRY) { + mode_str = "OPEN"; + create_flag = O_RDWR; + } else if (create_mode == OS_FILE_CREATE) { + mode_str = "CREATE"; + create_flag = O_RDWR | O_CREAT | O_EXCL; + } else if (create_mode == OS_FILE_OVERWRITE) { + mode_str = "OVERWRITE"; + create_flag = O_RDWR | O_CREAT | O_TRUNC; + } else { + create_flag = 0; + ut_error; + } + + if (type == OS_LOG_FILE) { + type_str = "LOG"; + } else if (type == OS_DATA_FILE) { + type_str = "DATA"; + } else { + ut_error; + } + + if (purpose == OS_FILE_AIO) { + purpose_str = "AIO"; + } else if (purpose == OS_FILE_NORMAL) { + purpose_str = "NORMAL"; + } else { + ut_error; + } + +#if 0 + fprintf(stderr, "Opening file %s, mode %s, type %s, purpose %s\n", + name, mode_str, type_str, purpose_str); +#endif +#ifdef O_SYNC + /* We let O_SYNC only affect log files; note that we map O_DSYNC to + O_SYNC because the datasync options seemed to corrupt files in 2001 + in both Linux and Solaris */ + if (type == OS_LOG_FILE + && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) { + +# if 0 + fprintf(stderr, "Using O_SYNC for file %s\n", name); +# endif + + create_flag = create_flag | O_SYNC; + } +#endif /* O_SYNC */ + + file = open(name, create_flag, os_innodb_umask); + + if (file == -1) { + *success = FALSE; + + /* When srv_file_per_table is on, file creation failure may not + be critical to the whole instance. Do not crash the server in + case of unknown errors. */ + if (srv_file_per_table) { + retry = os_file_handle_error_no_exit(name, + create_mode == OS_FILE_CREATE ? 
+ "create" : "open"); + } else { + retry = os_file_handle_error(name, + create_mode == OS_FILE_CREATE ? + "create" : "open"); + } + + if (retry) { + goto try_again; + } else { + return(file /* -1 */); + } + } + /* else */ + + *success = TRUE; + + /* We disable OS caching (O_DIRECT) only on data files */ + if (type != OS_LOG_FILE + && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) { + + os_file_set_nocache(file, name, mode_str); + } + +#ifdef USE_FILE_LOCK + if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) { + + if (create_mode == OS_FILE_OPEN_RETRY) { + int i; + ut_print_timestamp(stderr); + fputs(" InnoDB: Retrying to lock" + " the first data file\n", + stderr); + for (i = 0; i < 100; i++) { + os_thread_sleep(1000000); + if (!os_file_lock(file, name)) { + *success = TRUE; + return(file); + } + } + ut_print_timestamp(stderr); + fputs(" InnoDB: Unable to open the first data file\n", + stderr); + } + + *success = FALSE; + close(file); + file = -1; + } +#endif /* USE_FILE_LOCK */ + + return(file); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Deletes a file if it exists. The file has to be closed before calling this. 
+@return TRUE if success */ +UNIV_INTERN +ibool +os_file_delete_if_exists( +/*=====================*/ + const char* name) /*!< in: file path as a null-terminated string */ +{ +#ifdef __WIN__ + BOOL ret; + ulint count = 0; +loop: + /* In Windows, deleting an .ibd file may fail if ibbackup is copying + it */ + + ret = DeleteFile((LPCTSTR)name); + + if (ret) { + return(TRUE); + } + + if (GetLastError() == ERROR_FILE_NOT_FOUND) { + /* the file does not exist, this not an error */ + + return(TRUE); + } + + count++; + + if (count > 100 && 0 == (count % 10)) { + fprintf(stderr, + "InnoDB: Warning: cannot delete file %s\n" + "InnoDB: Are you running ibbackup" + " to back up the file?\n", name); + + os_file_get_last_error(TRUE); /* print error information */ + } + + os_thread_sleep(1000000); /* sleep for a second */ + + if (count > 2000) { + + return(FALSE); + } + + goto loop; +#else + int ret; + + ret = unlink(name); + + if (ret != 0 && errno != ENOENT) { + os_file_handle_error_no_exit(name, "delete"); + + return(FALSE); + } + + return(TRUE); +#endif +} + +/***********************************************************************//** +Deletes a file. The file has to be closed before calling this. 
+@return TRUE if success */ +UNIV_INTERN +ibool +os_file_delete( +/*===========*/ + const char* name) /*!< in: file path as a null-terminated string */ +{ +#ifdef __WIN__ + BOOL ret; + ulint count = 0; +loop: + /* In Windows, deleting an .ibd file may fail if ibbackup is copying + it */ + + ret = DeleteFile((LPCTSTR)name); + + if (ret) { + return(TRUE); + } + + if (GetLastError() == ERROR_FILE_NOT_FOUND) { + /* If the file does not exist, we classify this as a 'mild' + error and return */ + + return(FALSE); + } + + count++; + + if (count > 100 && 0 == (count % 10)) { + fprintf(stderr, + "InnoDB: Warning: cannot delete file %s\n" + "InnoDB: Are you running ibbackup" + " to back up the file?\n", name); + + os_file_get_last_error(TRUE); /* print error information */ + } + + os_thread_sleep(1000000); /* sleep for a second */ + + if (count > 2000) { + + return(FALSE); + } + + goto loop; +#else + int ret; + + ret = unlink(name); + + if (ret != 0) { + os_file_handle_error_no_exit(name, "delete"); + + return(FALSE); + } + + return(TRUE); +#endif +} + +/***********************************************************************//** +Renames a file (can also move it to another directory). It is safest that the +file is closed before calling this function. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_rename( +/*===========*/ + const char* oldpath,/*!< in: old file path as a null-terminated + string */ + const char* newpath)/*!< in: new file path */ +{ +#ifdef __WIN__ + BOOL ret; + + ret = MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath); + + if (ret) { + return(TRUE); + } + + os_file_handle_error_no_exit(oldpath, "rename"); + + return(FALSE); +#else + int ret; + + ret = rename(oldpath, newpath); + + if (ret != 0) { + os_file_handle_error_no_exit(oldpath, "rename"); + + return(FALSE); + } + + return(TRUE); +#endif +} + +/***********************************************************************//** +Closes a file handle. 
In case of error, error number can be retrieved with +os_file_get_last_error. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_close( +/*==========*/ + os_file_t file) /*!< in, own: handle to a file */ +{ +#ifdef __WIN__ + BOOL ret; + + ut_a(file); + + ret = CloseHandle(file); + + if (ret) { + return(TRUE); + } + + os_file_handle_error(NULL, "close"); + + return(FALSE); +#else + int ret; + + ret = close(file); + + if (ret == -1) { + os_file_handle_error(NULL, "close"); + + return(FALSE); + } + + return(TRUE); +#endif +} + +#ifdef UNIV_HOTBACKUP +/***********************************************************************//** +Closes a file handle. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_close_no_error_handling( +/*============================*/ + os_file_t file) /*!< in, own: handle to a file */ +{ +#ifdef __WIN__ + BOOL ret; + + ut_a(file); + + ret = CloseHandle(file); + + if (ret) { + return(TRUE); + } + + return(FALSE); +#else + int ret; + + ret = close(file); + + if (ret == -1) { + + return(FALSE); + } + + return(TRUE); +#endif +} +#endif /* UNIV_HOTBACKUP */ + +/***********************************************************************//** +Gets a file size. 
+@return TRUE if success */ +UNIV_INTERN +ibool +os_file_get_size( +/*=============*/ + os_file_t file, /*!< in: handle to a file */ + ulint* size, /*!< out: least significant 32 bits of file + size */ + ulint* size_high)/*!< out: most significant 32 bits of size */ +{ +#ifdef __WIN__ + DWORD high; + DWORD low; + + low = GetFileSize(file, &high); + + if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) { + return(FALSE); + } + + *size = low; + *size_high = high; + + return(TRUE); +#else + off_t offs; + + offs = lseek(file, 0, SEEK_END); + + if (offs == ((off_t)-1)) { + + return(FALSE); + } + + if (sizeof(off_t) > 4) { + *size = (ulint)(offs & 0xFFFFFFFFUL); + *size_high = (ulint)(offs >> 32); + } else { + *size = (ulint) offs; + *size_high = 0; + } + + return(TRUE); +#endif +} + +/***********************************************************************//** +Gets file size as a 64-bit integer ib_int64_t. +@return size in bytes, -1 if error */ +UNIV_INTERN +ib_int64_t +os_file_get_size_as_iblonglong( +/*===========================*/ + os_file_t file) /*!< in: handle to a file */ +{ + ulint size; + ulint size_high; + ibool success; + + success = os_file_get_size(file, &size, &size_high); + + if (!success) { + + return(-1); + } + + return((((ib_int64_t)size_high) << 32) + (ib_int64_t)size); +} + +/***********************************************************************//** +Write the specified number of zeros to a newly created file. 
+@return TRUE if success */ +UNIV_INTERN +ibool +os_file_set_size( +/*=============*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + ulint size, /*!< in: least significant 32 bits of file + size */ + ulint size_high)/*!< in: most significant 32 bits of size */ +{ + ib_int64_t current_size; + ib_int64_t desired_size; + ibool ret; + byte* buf; + byte* buf2; + ulint buf_size; + + ut_a(size == (size & 0xFFFFFFFF)); + + current_size = 0; + desired_size = (ib_int64_t)size + (((ib_int64_t)size_high) << 32); + + /* Write up to 1 megabyte at a time. */ + buf_size = ut_min(64, (ulint) (desired_size / UNIV_PAGE_SIZE)) + * UNIV_PAGE_SIZE; + buf2 = ut_malloc(buf_size + UNIV_PAGE_SIZE); + + /* Align the buffer for possible raw i/o */ + buf = ut_align(buf2, UNIV_PAGE_SIZE); + + /* Write buffer full of zeros */ + memset(buf, 0, buf_size); + + if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) { + + fprintf(stderr, "InnoDB: Progress in MB:"); + } + + while (current_size < desired_size) { + ulint n_bytes; + + if (desired_size - current_size < (ib_int64_t) buf_size) { + n_bytes = (ulint) (desired_size - current_size); + } else { + n_bytes = buf_size; + } + + ret = os_file_write(name, file, buf, + (ulint)(current_size & 0xFFFFFFFF), + (ulint)(current_size >> 32), + n_bytes); + if (!ret) { + ut_free(buf2); + goto error_handling; + } + + /* Print about progress for each 100 MB written */ + if ((ib_int64_t) (current_size + n_bytes) / (ib_int64_t)(100 * 1024 * 1024) + != current_size / (ib_int64_t)(100 * 1024 * 1024)) { + + fprintf(stderr, " %lu00", + (ulong) ((current_size + n_bytes) + / (ib_int64_t)(100 * 1024 * 1024))); + } + + current_size += n_bytes; + } + + if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) { + + fprintf(stderr, "\n"); + } + + ut_free(buf2); + + ret = os_file_flush(file); + + if (ret) { + return(TRUE); + } + +error_handling: + return(FALSE); +} + 
+/***********************************************************************//** +Truncates a file at its current position. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_set_eof( +/*============*/ + FILE* file) /*!< in: file to be truncated */ +{ +#ifdef __WIN__ + HANDLE h = (HANDLE) _get_osfhandle(fileno(file)); + return(SetEndOfFile(h)); +#else /* __WIN__ */ + return(!ftruncate(fileno(file), ftell(file))); +#endif /* __WIN__ */ +} + +#ifndef __WIN__ +/***********************************************************************//** +Wrapper to fsync(2) that retries the call on some errors. +Returns the value 0 if successful; otherwise the value -1 is returned and +the global variable errno is set to indicate the error. +@return 0 if success, -1 otherwise */ + +static +int +os_file_fsync( +/*==========*/ + os_file_t file) /*!< in: handle to a file */ +{ + int ret; + int failures; + ibool retry; + + failures = 0; + + do { + ret = fsync(file); + + os_n_fsyncs++; + + if (ret == -1 && errno == ENOLCK) { + + if (failures % 100 == 0) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: fsync(): " + "No locks available; retrying\n"); + } + + os_thread_sleep(200000 /* 0.2 sec */); + + failures++; + + retry = TRUE; + } else { + + retry = FALSE; + } + } while (retry); + + return(ret); +} +#endif /* !__WIN__ */ + +/***********************************************************************//** +Flushes the write buffers of a given file to the disk. 
+@return TRUE if success */ +UNIV_INTERN +ibool +os_file_flush( +/*==========*/ + os_file_t file) /*!< in, own: handle to a file */ +{ +#ifdef __WIN__ + BOOL ret; + + ut_a(file); + + os_n_fsyncs++; + + ret = FlushFileBuffers(file); + + if (ret) { + return(TRUE); + } + + /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is + actually a raw device, we choose to ignore that error if we are using + raw disks */ + + if (srv_start_raw_disk_in_use && GetLastError() + == ERROR_INVALID_FUNCTION) { + return(TRUE); + } + + os_file_handle_error(NULL, "flush"); + + /* It is a fatal error if a file flush does not succeed, because then + the database can get corrupt on disk */ + ut_error; + + return(FALSE); +#else + int ret; + +#if defined(HAVE_DARWIN_THREADS) +# ifndef F_FULLFSYNC + /* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */ +# define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */ +# elif F_FULLFSYNC != 51 +# error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3" +# endif + /* Apple has disabled fsync() for internal disk drives in OS X. That + caused corruption for a user when he tested a power outage. Let us in + OS X use a nonstandard flush method recommended by an Apple + engineer. */ + + if (!srv_have_fullfsync) { + /* If we are not on an operating system that supports this, + then fall back to a plain fsync. */ + + ret = os_file_fsync(file); + } else { + ret = fcntl(file, F_FULLFSYNC, NULL); + + if (ret) { + /* If we are not on a file system that supports this, + then fall back to a plain fsync. 
*/ + ret = os_file_fsync(file); + } + } +#else + ret = os_file_fsync(file); +#endif + + if (ret == 0) { + return(TRUE); + } + + /* Since Linux returns EINVAL if the 'file' is actually a raw device, + we choose to ignore that error if we are using raw disks */ + + if (srv_start_raw_disk_in_use && errno == EINVAL) { + + return(TRUE); + } + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: the OS said file flush did not succeed\n"); + + os_file_handle_error(NULL, "flush"); + + /* It is a fatal error if a file flush does not succeed, because then + the database can get corrupt on disk */ + ut_error; + + return(FALSE); +#endif +} + +#ifndef __WIN__ +/*******************************************************************//** +Does a synchronous read operation in Posix. +@return number of bytes read, -1 if error */ +static +ssize_t +os_file_pread( +/*==========*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + ulint n, /*!< in: number of bytes to read */ + ulint offset, /*!< in: least significant 32 bits of file + offset from where to read */ + ulint offset_high) /*!< in: most significant 32 bits of + offset */ +{ + off_t offs; + ssize_t n_bytes; + + ut_a((offset & 0xFFFFFFFFUL) == offset); + + /* If off_t is > 4 bytes in size, then we assume we can pass a + 64-bit address */ + + if (sizeof(off_t) > 4) { + offs = (off_t)offset + (((off_t)offset_high) << 32); + + } else { + offs = (off_t)offset; + + if (offset_high > 0) { + fprintf(stderr, + "InnoDB: Error: file read at offset > 4 GB\n"); + } + } + + os_n_file_reads++; + +#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD) + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_preads++; + os_n_pending_reads++; + os_mutex_exit(os_file_count_mutex); + + n_bytes = pread(file, buf, (ssize_t)n, offs); + + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_preads--; + os_n_pending_reads--; + os_mutex_exit(os_file_count_mutex); + + return(n_bytes); +#else 
+ { + off_t ret_offset; + ssize_t ret; + ulint i; + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads++; + os_mutex_exit(os_file_count_mutex); + + /* Protect the seek / read operation with a mutex */ + i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); + + ret_offset = lseek(file, offs, SEEK_SET); + + if (ret_offset < 0) { + ret = -1; + } else { + ret = read(file, buf, (ssize_t)n); + } + + os_mutex_exit(os_file_seek_mutexes[i]); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads--; + os_mutex_exit(os_file_count_mutex); + + return(ret); + } +#endif +} + +/*******************************************************************//** +Does a synchronous write operation in Posix. +@return number of bytes written, -1 if error */ +static +ssize_t +os_file_pwrite( +/*===========*/ + os_file_t file, /*!< in: handle to a file */ + const void* buf, /*!< in: buffer from where to write */ + ulint n, /*!< in: number of bytes to write */ + ulint offset, /*!< in: least significant 32 bits of file + offset where to write */ + ulint offset_high) /*!< in: most significant 32 bits of + offset */ +{ + ssize_t ret; + off_t offs; + + ut_a((offset & 0xFFFFFFFFUL) == offset); + + /* If off_t is > 4 bytes in size, then we assume we can pass a + 64-bit address */ + + if (sizeof(off_t) > 4) { + offs = (off_t)offset + (((off_t)offset_high) << 32); + } else { + offs = (off_t)offset; + + if (offset_high > 0) { + fprintf(stderr, + "InnoDB: Error: file write" + " at offset > 4 GB\n"); + } + } + + os_n_file_writes++; + +#if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD) + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_pwrites++; + os_n_pending_writes++; + os_mutex_exit(os_file_count_mutex); + + ret = pwrite(file, buf, (ssize_t)n, offs); + + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_pwrites--; + os_n_pending_writes--; + os_mutex_exit(os_file_count_mutex); + +# ifdef UNIV_DO_FLUSH + if (srv_unix_file_flush_method != 
SRV_UNIX_LITTLESYNC + && srv_unix_file_flush_method != SRV_UNIX_NOSYNC + && !os_do_not_call_flush_at_each_write) { + + /* Always do fsync to reduce the probability that when + the OS crashes, a database page is only partially + physically written to disk. */ + + ut_a(TRUE == os_file_flush(file)); + } +# endif /* UNIV_DO_FLUSH */ + + return(ret); +#else + { + off_t ret_offset; + ulint i; + + os_mutex_enter(os_file_count_mutex); + os_n_pending_writes++; + os_mutex_exit(os_file_count_mutex); + + /* Protect the seek / write operation with a mutex */ + i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); + + ret_offset = lseek(file, offs, SEEK_SET); + + if (ret_offset < 0) { + ret = -1; + + goto func_exit; + } + + ret = write(file, buf, (ssize_t)n); + +# ifdef UNIV_DO_FLUSH + if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC + && srv_unix_file_flush_method != SRV_UNIX_NOSYNC + && !os_do_not_call_flush_at_each_write) { + + /* Always do fsync to reduce the probability that when + the OS crashes, a database page is only partially + physically written to disk. */ + + ut_a(TRUE == os_file_flush(file)); + } +# endif /* UNIV_DO_FLUSH */ + +func_exit: + os_mutex_exit(os_file_seek_mutexes[i]); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_writes--; + os_mutex_exit(os_file_count_mutex); + + return(ret); + } +#endif +} +#endif + +/*******************************************************************//** +Requests a synchronous positioned read operation. 
+@return TRUE if request was successful, FALSE if fail */ +UNIV_INTERN +ibool +os_file_read( +/*=========*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + ulint offset, /*!< in: least significant 32 bits of file + offset where to read */ + ulint offset_high, /*!< in: most significant 32 bits of + offset */ + ulint n) /*!< in: number of bytes to read */ +{ +#ifdef __WIN__ + BOOL ret; + DWORD len; + DWORD ret2; + DWORD low; + DWORD high; + ibool retry; + ulint i; + + ut_a((offset & 0xFFFFFFFFUL) == offset); + + os_n_file_reads++; + os_bytes_read_since_printout += n; + +try_again: + ut_ad(file); + ut_ad(buf); + ut_ad(n > 0); + + low = (DWORD) offset; + high = (DWORD) offset_high; + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads++; + os_mutex_exit(os_file_count_mutex); + + /* Protect the seek / read operation with a mutex */ + i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); + + ret2 = SetFilePointer(file, low, &high, FILE_BEGIN); + + if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) { + + os_mutex_exit(os_file_seek_mutexes[i]); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads--; + os_mutex_exit(os_file_count_mutex); + + goto error_handling; + } + + ret = ReadFile(file, buf, (DWORD) n, &len, NULL); + + os_mutex_exit(os_file_seek_mutexes[i]); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads--; + os_mutex_exit(os_file_count_mutex); + + if (ret && len == n) { + return(TRUE); + } +#else + ibool retry; + ssize_t ret; + + os_bytes_read_since_printout += n; + +try_again: + ret = os_file_pread(file, buf, n, offset, offset_high); + + if ((ulint)ret == n) { + + return(TRUE); + } + + fprintf(stderr, + "InnoDB: Error: tried to read %lu bytes at offset %lu %lu.\n" + "InnoDB: Was only able to read %ld.\n", + (ulong)n, (ulong)offset_high, + (ulong)offset, (long)ret); +#endif +#ifdef __WIN__ +error_handling: +#endif + retry = os_file_handle_error(NULL, 
"read"); + + if (retry) { + goto try_again; + } + + fprintf(stderr, + "InnoDB: Fatal error: cannot read from file." + " OS error number %lu.\n", +#ifdef __WIN__ + (ulong) GetLastError() +#else + (ulong) errno +#endif + ); + fflush(stderr); + + ut_error; + + return(FALSE); +} + +/*******************************************************************//** +Requests a synchronous positioned read operation. This function does not do +any error handling. In case of error it returns FALSE. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INTERN +ibool +os_file_read_no_error_handling( +/*===========================*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + ulint offset, /*!< in: least significant 32 bits of file + offset where to read */ + ulint offset_high, /*!< in: most significant 32 bits of + offset */ + ulint n) /*!< in: number of bytes to read */ +{ +#ifdef __WIN__ + BOOL ret; + DWORD len; + DWORD ret2; + DWORD low; + DWORD high; + ibool retry; + ulint i; + + ut_a((offset & 0xFFFFFFFFUL) == offset); + + os_n_file_reads++; + os_bytes_read_since_printout += n; + +try_again: + ut_ad(file); + ut_ad(buf); + ut_ad(n > 0); + + low = (DWORD) offset; + high = (DWORD) offset_high; + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads++; + os_mutex_exit(os_file_count_mutex); + + /* Protect the seek / read operation with a mutex */ + i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); + + ret2 = SetFilePointer(file, low, &high, FILE_BEGIN); + + if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) { + + os_mutex_exit(os_file_seek_mutexes[i]); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads--; + os_mutex_exit(os_file_count_mutex); + + goto error_handling; + } + + ret = ReadFile(file, buf, (DWORD) n, &len, NULL); + + os_mutex_exit(os_file_seek_mutexes[i]); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads--; + 
os_mutex_exit(os_file_count_mutex); + + if (ret && len == n) { + return(TRUE); + } +#else + ibool retry; + ssize_t ret; + + os_bytes_read_since_printout += n; + +try_again: + ret = os_file_pread(file, buf, n, offset, offset_high); + + if ((ulint)ret == n) { + + return(TRUE); + } +#endif +#ifdef __WIN__ +error_handling: +#endif + retry = os_file_handle_error_no_exit(NULL, "read"); + + if (retry) { + goto try_again; + } + + return(FALSE); +} + +/*******************************************************************//** +Rewind file to its start, read at most size - 1 bytes from it to str, and +NUL-terminate str. All errors are silently ignored. This function is +mostly meant to be used with temporary files. */ +UNIV_INTERN +void +os_file_read_string( +/*================*/ + FILE* file, /*!< in: file to read from */ + char* str, /*!< in: buffer where to read */ + ulint size) /*!< in: size of buffer */ +{ + size_t flen; + + if (size == 0) { + return; + } + + rewind(file); + flen = fread(str, 1, size - 1, file); + str[flen] = '\0'; +} + +/*******************************************************************//** +Requests a synchronous write operation. 
+@return TRUE if request was successful, FALSE if fail */ +UNIV_INTERN +ibool +os_file_write( +/*==========*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + const void* buf, /*!< in: buffer from which to write */ + ulint offset, /*!< in: least significant 32 bits of file + offset where to write */ + ulint offset_high, /*!< in: most significant 32 bits of + offset */ + ulint n) /*!< in: number of bytes to write */ +{ +#ifdef __WIN__ + BOOL ret; + DWORD len; + DWORD ret2; + DWORD low; + DWORD high; + ulint i; + ulint n_retries = 0; + ulint err; + + ut_a((offset & 0xFFFFFFFF) == offset); + + os_n_file_writes++; + + ut_ad(file); + ut_ad(buf); + ut_ad(n > 0); +retry: + low = (DWORD) offset; + high = (DWORD) offset_high; + + os_mutex_enter(os_file_count_mutex); + os_n_pending_writes++; + os_mutex_exit(os_file_count_mutex); + + /* Protect the seek / write operation with a mutex */ + i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); + + ret2 = SetFilePointer(file, low, &high, FILE_BEGIN); + + if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) { + + os_mutex_exit(os_file_seek_mutexes[i]); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_writes--; + os_mutex_exit(os_file_count_mutex); + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: File pointer positioning to" + " file %s failed at\n" + "InnoDB: offset %lu %lu. Operating system" + " error number %lu.\n" + "InnoDB: Some operating system error numbers" + " are described at\n" + "InnoDB: " + REFMAN "operating-system-error-codes.html\n", + name, (ulong) offset_high, (ulong) offset, + (ulong) GetLastError()); + + return(FALSE); + } + + ret = WriteFile(file, buf, (DWORD) n, &len, NULL); + + /* Always do fsync to reduce the probability that when the OS crashes, + a database page is only partially physically written to disk. 
*/ + +# ifdef UNIV_DO_FLUSH + if (!os_do_not_call_flush_at_each_write) { + ut_a(TRUE == os_file_flush(file)); + } +# endif /* UNIV_DO_FLUSH */ + + os_mutex_exit(os_file_seek_mutexes[i]); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_writes--; + os_mutex_exit(os_file_count_mutex); + + if (ret && len == n) { + + return(TRUE); + } + + /* If some background file system backup tool is running, then, at + least in Windows 2000, we may get here a specific error. Let us + retry the operation 100 times, with 1 second waits. */ + + if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) { + + os_thread_sleep(1000000); + + n_retries++; + + goto retry; + } + + if (!os_has_said_disk_full) { + + err = (ulint)GetLastError(); + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: Write to file %s failed" + " at offset %lu %lu.\n" + "InnoDB: %lu bytes should have been written," + " only %lu were written.\n" + "InnoDB: Operating system error number %lu.\n" + "InnoDB: Check that your OS and file system" + " support files of this size.\n" + "InnoDB: Check also that the disk is not full" + " or a disk quota exceeded.\n", + name, (ulong) offset_high, (ulong) offset, + (ulong) n, (ulong) len, (ulong) err); + + if (strerror((int)err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %lu means '%s'.\n", + (ulong) err, strerror((int)err)); + } + + fprintf(stderr, + "InnoDB: Some operating system error numbers" + " are described at\n" + "InnoDB: " + REFMAN "operating-system-error-codes.html\n"); + + os_has_said_disk_full = TRUE; + } + + return(FALSE); +#else + ssize_t ret; + + ret = os_file_pwrite(file, buf, n, offset, offset_high); + + if ((ulint)ret == n) { + + return(TRUE); + } + + if (!os_has_said_disk_full) { + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: Write to file %s failed" + " at offset %lu %lu.\n" + "InnoDB: %lu bytes should have been written," + " only %ld were written.\n" + "InnoDB: Operating system error number 
%lu.\n" + "InnoDB: Check that your OS and file system" + " support files of this size.\n" + "InnoDB: Check also that the disk is not full" + " or a disk quota exceeded.\n", + name, offset_high, offset, n, (long int)ret, + (ulint)errno); + if (strerror(errno) != NULL) { + fprintf(stderr, + "InnoDB: Error number %lu means '%s'.\n", + (ulint)errno, strerror(errno)); + } + + fprintf(stderr, + "InnoDB: Some operating system error numbers" + " are described at\n" + "InnoDB: " + REFMAN "operating-system-error-codes.html\n"); + + os_has_said_disk_full = TRUE; + } + + return(FALSE); +#endif +} + +/*******************************************************************//** +Check the existence and type of the given file. +@return TRUE if call succeeded */ +UNIV_INTERN +ibool +os_file_status( +/*===========*/ + const char* path, /*!< in: pathname of the file */ + ibool* exists, /*!< out: TRUE if file exists */ + os_file_type_t* type) /*!< out: type of the file (if it exists) */ +{ +#ifdef __WIN__ + int ret; + struct _stat statinfo; + + ret = _stat(path, &statinfo); + if (ret && (errno == ENOENT || errno == ENOTDIR)) { + /* file does not exist */ + *exists = FALSE; + return(TRUE); + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat"); + + return(FALSE); + } + + if (_S_IFDIR & statinfo.st_mode) { + *type = OS_FILE_TYPE_DIR; + } else if (_S_IFREG & statinfo.st_mode) { + *type = OS_FILE_TYPE_FILE; + } else { + *type = OS_FILE_TYPE_UNKNOWN; + } + + *exists = TRUE; + + return(TRUE); +#else + int ret; + struct stat statinfo; + + ret = stat(path, &statinfo); + if (ret && (errno == ENOENT || errno == ENOTDIR)) { + /* file does not exist */ + *exists = FALSE; + return(TRUE); + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat"); + + return(FALSE); + } + + if (S_ISDIR(statinfo.st_mode)) { + *type = OS_FILE_TYPE_DIR; + } else if (S_ISLNK(statinfo.st_mode)) { + *type = 
OS_FILE_TYPE_LINK; + } else if (S_ISREG(statinfo.st_mode)) { + *type = OS_FILE_TYPE_FILE; + } else { + *type = OS_FILE_TYPE_UNKNOWN; + } + + *exists = TRUE; + + return(TRUE); +#endif +} + +/*******************************************************************//** +This function returns information about the specified file +@return TRUE if stat information found */ +UNIV_INTERN +ibool +os_file_get_status( +/*===============*/ + const char* path, /*!< in: pathname of the file */ + os_file_stat_t* stat_info) /*!< information of a file in a + directory */ +{ +#ifdef __WIN__ + int ret; + struct _stat statinfo; + + ret = _stat(path, &statinfo); + if (ret && (errno == ENOENT || errno == ENOTDIR)) { + /* file does not exist */ + + return(FALSE); + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat"); + + return(FALSE); + } + if (_S_IFDIR & statinfo.st_mode) { + stat_info->type = OS_FILE_TYPE_DIR; + } else if (_S_IFREG & statinfo.st_mode) { + stat_info->type = OS_FILE_TYPE_FILE; + } else { + stat_info->type = OS_FILE_TYPE_UNKNOWN; + } + + stat_info->ctime = statinfo.st_ctime; + stat_info->atime = statinfo.st_atime; + stat_info->mtime = statinfo.st_mtime; + stat_info->size = statinfo.st_size; + + return(TRUE); +#else + int ret; + struct stat statinfo; + + ret = stat(path, &statinfo); + + if (ret && (errno == ENOENT || errno == ENOTDIR)) { + /* file does not exist */ + + return(FALSE); + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat"); + + return(FALSE); + } + + if (S_ISDIR(statinfo.st_mode)) { + stat_info->type = OS_FILE_TYPE_DIR; + } else if (S_ISLNK(statinfo.st_mode)) { + stat_info->type = OS_FILE_TYPE_LINK; + } else if (S_ISREG(statinfo.st_mode)) { + stat_info->type = OS_FILE_TYPE_FILE; + } else { + stat_info->type = OS_FILE_TYPE_UNKNOWN; + } + + stat_info->ctime = statinfo.st_ctime; + stat_info->atime = statinfo.st_atime; + stat_info->mtime = statinfo.st_mtime; 
+ stat_info->size = statinfo.st_size; + + return(TRUE); +#endif +} + +/* path name separator character */ +#ifdef __WIN__ +# define OS_FILE_PATH_SEPARATOR '\\' +#else +# define OS_FILE_PATH_SEPARATOR '/' +#endif + +/****************************************************************//** +The function os_file_dirname returns a directory component of a +null-terminated pathname string. In the usual case, dirname returns +the string up to, but not including, the final '/', and basename +is the component following the final '/'. Trailing '/' charac +ters are not counted as part of the pathname. + +If path does not contain a slash, dirname returns the string ".". + +Concatenating the string returned by dirname, a "/", and the basename +yields a complete pathname. + +The return value is a copy of the directory component of the pathname. +The copy is allocated from heap. It is the caller responsibility +to free it after it is no longer needed. + +The following list of examples (taken from SUSv2) shows the strings +returned by dirname and basename for different paths: + + path dirname basename + "/usr/lib" "/usr" "lib" + "/usr/" "/" "usr" + "usr" "." "usr" + "/" "/" "/" + "." "." "." + ".." "." ".." + +@return own: directory component of the pathname */ +UNIV_INTERN +char* +os_file_dirname( +/*============*/ + const char* path) /*!< in: pathname */ +{ + /* Find the offset of the last slash */ + const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR); + if (!last_slash) { + /* No slash in the path, return "." */ + + return(mem_strdup(".")); + } + + /* Ok, there is a slash */ + + if (last_slash == path) { + /* last slash is the first char of the path */ + + return(mem_strdup("/")); + } + + /* Non-trivial directory component */ + + return(mem_strdupl(path, last_slash - path)); +} + +/****************************************************************//** +Creates all missing subdirectories along the given path. 
+@return TRUE if call succeeded FALSE otherwise */ +UNIV_INTERN +ibool +os_file_create_subdirs_if_needed( +/*=============================*/ + const char* path) /*!< in: path name */ +{ + char* subdir; + ibool success, subdir_exists; + os_file_type_t type; + + subdir = os_file_dirname(path); + if (strlen(subdir) == 1 + && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) { + /* subdir is root or cwd, nothing to do */ + mem_free(subdir); + + return(TRUE); + } + + /* Test if subdir exists */ + success = os_file_status(subdir, &subdir_exists, &type); + if (success && !subdir_exists) { + /* subdir does not exist, create it */ + success = os_file_create_subdirs_if_needed(subdir); + if (!success) { + mem_free(subdir); + + return(FALSE); + } + success = os_file_create_directory(subdir, FALSE); + } + + mem_free(subdir); + + return(success); +} + +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Returns a pointer to the nth slot in the aio array. +@return pointer to slot */ +static +os_aio_slot_t* +os_aio_array_get_nth_slot( +/*======================*/ + os_aio_array_t* array, /*!< in: aio array */ + ulint index) /*!< in: index of the slot */ +{ + ut_a(index < array->n_slots); + + return((array->slots) + index); +} + +/************************************************************************//** +Creates an aio wait array. 
+@return own: aio array */ +static +os_aio_array_t* +os_aio_array_create( +/*================*/ + ulint n, /*!< in: maximum number of pending aio operations + allowed; n must be divisible by n_segments */ + ulint n_segments) /*!< in: number of segments in the aio array */ +{ + os_aio_array_t* array; + ulint i; + os_aio_slot_t* slot; +#ifdef WIN_ASYNC_IO + OVERLAPPED* over; +#endif + ut_a(n > 0); + ut_a(n_segments > 0); + + array = ut_malloc(sizeof(os_aio_array_t)); + + array->mutex = os_mutex_create(NULL); + array->not_full = os_event_create(NULL); + array->is_empty = os_event_create(NULL); + + os_event_set(array->is_empty); + + array->n_slots = n; + array->n_segments = n_segments; + array->n_reserved = 0; + array->slots = ut_malloc(n * sizeof(os_aio_slot_t)); +#ifdef __WIN__ + array->native_events = ut_malloc(n * sizeof(os_native_event_t)); +#endif + for (i = 0; i < n; i++) { + slot = os_aio_array_get_nth_slot(array, i); + + slot->pos = i; + slot->reserved = FALSE; +#ifdef WIN_ASYNC_IO + slot->event = os_event_create(NULL); + + over = &(slot->control); + + over->hEvent = slot->event->handle; + + *((array->native_events) + i) = over->hEvent; +#endif + } + + return(array); +} + +/*********************************************************************** +Initializes the asynchronous io system. Creates one array each for ibuf +and log i/o. Also creates one array each for read and write where each +array is divided logically into n_read_segs and n_write_segs +respectively. The caller must create an i/o handler thread for each +segment in these arrays. This function also creates the sync array. +No i/o handler thread needs to be created for that */ +UNIV_INTERN +void +os_aio_init( +/*========*/ + ulint n_per_seg, /*<! in: maximum number of pending aio + operations allowed per segment */ + ulint n_read_segs, /*<! in: number of reader threads */ + ulint n_write_segs, /*<! in: number of writer threads */ + ulint n_slots_sync) /*<! 
in: number of slots in the sync aio + array */ +{ + ulint i; + ulint n_segments = 2 + n_read_segs + n_write_segs; + + ut_ad(n_segments >= 4); + + os_io_init_simple(); + + for (i = 0; i < n_segments; i++) { + srv_set_io_thread_op_info(i, "not started yet"); + } + + + /* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */ + + os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1); + + srv_io_thread_function[0] = "insert buffer thread"; + + os_aio_log_array = os_aio_array_create(n_per_seg, 1); + + srv_io_thread_function[1] = "log thread"; + + os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg, + n_read_segs); + for (i = 2; i < 2 + n_read_segs; i++) { + ut_a(i < SRV_MAX_N_IO_THREADS); + srv_io_thread_function[i] = "read thread"; + } + + os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg, + n_write_segs); + for (i = 2 + n_read_segs; i < n_segments; i++) { + ut_a(i < SRV_MAX_N_IO_THREADS); + srv_io_thread_function[i] = "write thread"; + } + + os_aio_sync_array = os_aio_array_create(n_slots_sync, 1); + + os_aio_n_segments = n_segments; + + os_aio_validate(); + + os_aio_segment_wait_events = ut_malloc(n_segments * sizeof(void*)); + + for (i = 0; i < n_segments; i++) { + os_aio_segment_wait_events[i] = os_event_create(NULL); + } + + os_last_printout = time(NULL); + +} + +#ifdef WIN_ASYNC_IO +/************************************************************************//** +Wakes up all async i/o threads in the array in Windows async i/o at +shutdown. */ +static +void +os_aio_array_wake_win_aio_at_shutdown( +/*==================================*/ + os_aio_array_t* array) /*!< in: aio array */ +{ + ulint i; + + for (i = 0; i < array->n_slots; i++) { + + os_event_set((array->slots + i)->event); + } +} +#endif + +/************************************************************************//** +Wakes up all async i/o threads so that they know to exit themselves in +shutdown. 
*/ +UNIV_INTERN +void +os_aio_wake_all_threads_at_shutdown(void) +/*=====================================*/ +{ + ulint i; + +#ifdef WIN_ASYNC_IO + /* This code wakes up all ai/o threads in Windows native aio */ + os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array); + os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array); + os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array); + os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array); +#endif + /* This loop wakes up all simulated ai/o threads */ + + for (i = 0; i < os_aio_n_segments; i++) { + + os_event_set(os_aio_segment_wait_events[i]); + } +} + +/************************************************************************//** +Waits until there are no pending writes in os_aio_write_array. There can +be other, synchronous, pending writes. */ +UNIV_INTERN +void +os_aio_wait_until_no_pending_writes(void) +/*=====================================*/ +{ + os_event_wait(os_aio_write_array->is_empty); +} + +/**********************************************************************//** +Calculates segment number for a slot. 
+@return segment number (which is the number used by, for example, +i/o-handler threads) */ +static +ulint +os_aio_get_segment_no_from_slot( +/*============================*/ + os_aio_array_t* array, /*!< in: aio wait array */ + os_aio_slot_t* slot) /*!< in: slot in this array */ +{ + ulint segment; + ulint seg_len; + + if (array == os_aio_ibuf_array) { + segment = 0; + + } else if (array == os_aio_log_array) { + segment = 1; + + } else if (array == os_aio_read_array) { + seg_len = os_aio_read_array->n_slots + / os_aio_read_array->n_segments; + + segment = 2 + slot->pos / seg_len; + } else { + ut_a(array == os_aio_write_array); + seg_len = os_aio_write_array->n_slots + / os_aio_write_array->n_segments; + + segment = os_aio_read_array->n_segments + 2 + + slot->pos / seg_len; + } + + return(segment); +} + +/**********************************************************************//** +Calculates local segment number and aio array from global segment number. +@return local segment number within the aio array */ +static +ulint +os_aio_get_array_and_local_segment( +/*===============================*/ + os_aio_array_t** array, /*!< out: aio wait array */ + ulint global_segment)/*!< in: global segment number */ +{ + ulint segment; + + ut_a(global_segment < os_aio_n_segments); + + if (global_segment == 0) { + *array = os_aio_ibuf_array; + segment = 0; + + } else if (global_segment == 1) { + *array = os_aio_log_array; + segment = 0; + + } else if (global_segment < os_aio_read_array->n_segments + 2) { + *array = os_aio_read_array; + + segment = global_segment - 2; + } else { + *array = os_aio_write_array; + + segment = global_segment - (os_aio_read_array->n_segments + 2); + } + + return(segment); +} + +/*******************************************************************//** +Requests for a slot in the aio array. If no slot is available, waits until +not_full-event becomes signaled. 
@return pointer to slot */
static
os_aio_slot_t*
os_aio_array_reserve_slot(
/*======================*/
	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
	os_aio_array_t*	array,	/*!< in: aio array */
	fil_node_t*	message1,/*!< in: message to be passed along with
				the aio operation */
	void*		message2,/*!< in: message to be passed along with
				the aio operation */
	os_file_t	file,	/*!< in: file handle */
	const char*	name,	/*!< in: name of the file or path as a
				null-terminated string */
	void*		buf,	/*!< in: buffer where to read or from which
				to write */
	ulint		offset,	/*!< in: least significant 32 bits of file
				offset */
	ulint		offset_high, /*!< in: most significant 32 bits of
				offset */
	ulint		len)	/*!< in: length of the block to read or write */
{
	os_aio_slot_t*	slot;
#ifdef WIN_ASYNC_IO
	OVERLAPPED*	control;
#endif
	ulint		i;
	ulint		slots_per_seg;
	ulint		local_seg;

	/* No need of a mutex. Only reading constant fields */
	slots_per_seg = array->n_slots / array->n_segments;

	/* We attempt to keep adjacent blocks in the same local
	segment. This can help in merging IO requests when we are
	doing simulated AIO */
	local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
		% array->n_segments;

loop:
	os_mutex_enter(array->mutex);

	if (array->n_reserved == array->n_slots) {
		/* The array is full: release the mutex, wait for the
		not_full event and then check again from the start */
		os_mutex_exit(array->mutex);

		if (!os_aio_use_native_aio) {
			/* If the handler threads are suspended, wake them
			so that we get more slots */

			os_aio_simulated_wake_handler_threads();
		}

		os_event_wait(array->not_full);

		goto loop;
	}

	/* First try to find a slot in the preferred local segment.
	Note that the scan starts at the first slot of that segment
	but continues up to the end of the whole array. */
	for (i = local_seg * slots_per_seg; i < array->n_slots; i++) {
		slot = os_aio_array_get_nth_slot(array, i);

		if (slot->reserved == FALSE) {
			goto found;
		}
	}

	/* Fall back to a full scan. We are guaranteed to find a slot,
	because n_reserved < n_slots was checked above while holding the
	mutex; hence the loop has no upper bound of its own. */
	for (i = 0;; i++) {
		slot = os_aio_array_get_nth_slot(array, i);

		if (slot->reserved == FALSE) {
			goto found;
		}
	}

found:
	ut_a(slot->reserved == FALSE);
	array->n_reserved++;

	/* Keep the two events in step with the reservation count */
	if (array->n_reserved == 1) {
		os_event_reset(array->is_empty);
	}

	if (array->n_reserved == array->n_slots) {
		os_event_reset(array->not_full);
	}

	/* Record the request in the slot */
	slot->reserved = TRUE;
	slot->reservation_time = time(NULL);
	slot->message1 = message1;
	slot->message2 = message2;
	slot->file     = file;
	slot->name     = name;
	slot->len      = len;
	slot->type     = type;
	slot->buf      = buf;
	slot->offset   = offset;
	slot->offset_high = offset_high;
	slot->io_already_done = FALSE;

#ifdef WIN_ASYNC_IO
	/* Store the file offset in the slot's OVERLAPPED struct and
	reset the slot's event */
	control = &(slot->control);
	control->Offset = (DWORD)offset;
	control->OffsetHigh = (DWORD)offset_high;
	os_event_reset(slot->event);
#endif

	os_mutex_exit(array->mutex);

	return(slot);
}

/*******************************************************************//**
Frees a slot in the aio array. The slot is marked unreserved under the
array mutex, and the not_full and is_empty events are set when the
reservation count drops below the number of slots or to zero. */
static
void
os_aio_array_free_slot(
/*===================*/
	os_aio_array_t*	array,	/*!< in: aio array */
	os_aio_slot_t*	slot)	/*!< in: pointer to slot */
{
	ut_ad(array);
	ut_ad(slot);

	os_mutex_enter(array->mutex);

	ut_ad(slot->reserved);

	slot->reserved = FALSE;

	array->n_reserved--;

	if (array->n_reserved == array->n_slots - 1) {
		/* The array was full before this slot was freed */
		os_event_set(array->not_full);
	}

	if (array->n_reserved == 0) {
		os_event_set(array->is_empty);
	}

#ifdef WIN_ASYNC_IO
	os_event_reset(slot->event);
#endif
	os_mutex_exit(array->mutex);
}

/**********************************************************************//**
Wakes up a simulated aio i/o-handler thread if it has something to do.
*/ +static +void +os_aio_simulated_wake_handler_thread( +/*=================================*/ + ulint global_segment) /*!< in: the number of the segment in the aio + arrays */ +{ + os_aio_array_t* array; + os_aio_slot_t* slot; + ulint segment; + ulint n; + ulint i; + + ut_ad(!os_aio_use_native_aio); + + segment = os_aio_get_array_and_local_segment(&array, global_segment); + + n = array->n_slots / array->n_segments; + + /* Look through n slots after the segment * n'th slot */ + + os_mutex_enter(array->mutex); + + for (i = 0; i < n; i++) { + slot = os_aio_array_get_nth_slot(array, i + segment * n); + + if (slot->reserved) { + /* Found an i/o request */ + + break; + } + } + + os_mutex_exit(array->mutex); + + if (i < n) { + os_event_set(os_aio_segment_wait_events[global_segment]); + } +} + +/**********************************************************************//** +Wakes up simulated aio i/o-handler threads if they have something to do. */ +UNIV_INTERN +void +os_aio_simulated_wake_handler_threads(void) +/*=======================================*/ +{ + ulint i; + + if (os_aio_use_native_aio) { + /* We do not use simulated aio: do nothing */ + + return; + } + + os_aio_recommend_sleep_for_read_threads = FALSE; + + for (i = 0; i < os_aio_n_segments; i++) { + os_aio_simulated_wake_handler_thread(i); + } +} + +/**********************************************************************//** +This function can be called if one wants to post a batch of reads and +prefers an i/o-handler thread to handle them all at once later. You must +call os_aio_simulated_wake_handler_threads later to ensure the threads +are not left sleeping! 
*/
UNIV_INTERN
void
os_aio_simulated_put_read_threads_to_sleep(void)
/*============================================*/
{
	os_aio_array_t*	array;
	ulint		g;

	os_aio_recommend_sleep_for_read_threads	= TRUE;

	/* Reset the wait event of every global segment that maps to
	the read array */
	for (g = 0; g < os_aio_n_segments; g++) {
		os_aio_get_array_and_local_segment(&array, g);

		if (array == os_aio_read_array) {

			os_event_reset(os_aio_segment_wait_events[g]);
		}
	}
}

/*******************************************************************//**
Requests an asynchronous i/o operation. A request in mode OS_AIO_SYNC is
executed directly with os_file_read() or os_file_write(), except when
Windows native aio is in use. Any other request reserves a slot in the
aio array that matches the mode, and is then either submitted with
ReadFile()/WriteFile() (native aio) or left for a simulated aio
i/o-handler thread.
@return TRUE if request was queued successfully, FALSE if fail */
UNIV_INTERN
ibool
os_aio(
/*===*/
	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
	ulint		mode,	/*!< in: OS_AIO_NORMAL, ..., possibly ORed
				to OS_AIO_SIMULATED_WAKE_LATER: the
				last flag advises this function not to wake
				i/o-handler threads, but the caller will
				do the waking explicitly later, in this
				way the caller can post several requests in
				a batch; NOTE that the batch must not be
				so big that it exhausts the slots in aio
				arrays! NOTE that a simulated batch
				may introduce hidden chances of deadlocks,
				because i/os are not actually handled until
				all have been posted: use with great
				caution! */
	const char*	name,	/*!< in: name of the file or path as a
				null-terminated string */
	os_file_t	file,	/*!< in: handle to a file */
	void*		buf,	/*!< in: buffer where to read or from which
				to write */
	ulint		offset,	/*!< in: least significant 32 bits of file
				offset where to read or write */
	ulint		offset_high, /*!< in: most significant 32 bits of
				offset */
	ulint		n,	/*!< in: number of bytes to read or write */
	fil_node_t*	message1,/*!< in: message for the aio handler
				(can be used to identify a completed
				aio operation); ignored if mode is
				OS_AIO_SYNC */
	void*		message2)/*!< in: message for the aio handler
				(can be used to identify a completed
				aio operation); ignored if mode is
				OS_AIO_SYNC */
{
	os_aio_array_t*	array;
	os_aio_slot_t*	slot;
#ifdef WIN_ASYNC_IO
	ibool		retval;
	BOOL		ret		= TRUE;
	DWORD		len		= (DWORD) n;
	struct fil_node_struct * dummy_mess1;
	void*		dummy_mess2;
	ulint		dummy_type;
#endif
	ulint		err		= 0;
	ibool		retry;
	ulint		wake_later;

	ut_ad(file);
	ut_ad(buf);
	ut_ad(n > 0);
	ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
	ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
	ut_ad(os_aio_validate());

	/* Split the wake-later flag off from the actual mode */
	wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
	mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);

	if (mode == OS_AIO_SYNC
#ifdef WIN_ASYNC_IO
	    && !os_aio_use_native_aio
#endif
	    ) {
		/* This is actually an ordinary synchronous read or write:
		no need to use an i/o-handler thread. NOTE that if we use
		Windows async i/o, Windows does not allow us to use
		ordinary synchronous os_file_read etc. on the same file,
		therefore we have built a special mechanism for synchronous
		wait in the Windows case. */

		if (type == OS_FILE_READ) {
			return(os_file_read(file, buf, offset,
					    offset_high, n));
		}

		ut_a(type == OS_FILE_WRITE);

		return(os_file_write(name, file, buf, offset, offset_high, n));
	}

try_again:
	/* Choose the aio array that serves this mode */
	if (mode == OS_AIO_NORMAL) {
		if (type == OS_FILE_READ) {
			array = os_aio_read_array;
		} else {
			array = os_aio_write_array;
		}
	} else if (mode == OS_AIO_IBUF) {
		ut_ad(type == OS_FILE_READ);
		/* Reduce probability of deadlock bugs in connection with ibuf:
		do not let the ibuf i/o handler sleep */

		wake_later = FALSE;

		array = os_aio_ibuf_array;
	} else if (mode == OS_AIO_LOG) {

		array = os_aio_log_array;
	} else if (mode == OS_AIO_SYNC) {
		array = os_aio_sync_array;
	} else {
		array = NULL; /* Eliminate compiler warning */
		ut_error;
	}

	slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
					 name, buf, offset, offset_high, n);
	if (type == OS_FILE_READ) {
		if (os_aio_use_native_aio) {
#ifdef WIN_ASYNC_IO
			os_n_file_reads++;
			os_bytes_read_since_printout += len;

			ret = ReadFile(file, buf, (DWORD)n, &len,
				       &(slot->control));
#endif
		} else {
			/* Simulated aio: wake the handler thread of the
			slot's segment unless the caller asked to do the
			waking itself later */
			if (!wake_later) {
				os_aio_simulated_wake_handler_thread(
					os_aio_get_segment_no_from_slot(
						array, slot));
			}
		}
	} else if (type == OS_FILE_WRITE) {
		if (os_aio_use_native_aio) {
#ifdef WIN_ASYNC_IO
			os_n_file_writes++;
			ret = WriteFile(file, buf, (DWORD)n, &len,
					&(slot->control));
#endif
		} else {
			if (!wake_later) {
				os_aio_simulated_wake_handler_thread(
					os_aio_get_segment_no_from_slot(
						array, slot));
			}
		}
	} else {
		ut_error;
	}

#ifdef WIN_ASYNC_IO
	if (os_aio_use_native_aio) {
		/* The request counts as queued if it either completed at
		once with the full length or is reported as pending */
		if ((ret && len == n)
		    || (!ret && GetLastError() == ERROR_IO_PENDING)) {
			/* aio was queued successfully! */

			if (mode == OS_AIO_SYNC) {
				/* We want a synchronous i/o operation on a
				file where we also use async i/o: in Windows
				we must use the same wait mechanism as for
				async i/o */

				retval = os_aio_windows_handle(ULINT_UNDEFINED,
							       slot->pos,
							       &dummy_mess1,
							       &dummy_mess2,
							       &dummy_type);

				return(retval);
			}

			return(TRUE);
		}

		err = 1; /* Fall through the next if */
	}
#endif
	/* err is nonzero only if a native aio submission failed above */
	if (err == 0) {
		/* aio was queued successfully! */

		return(TRUE);
	}

	/* The submission failed: give the slot back before asking
	os_file_handle_error() whether to retry */
	os_aio_array_free_slot(array, slot);

	retry = os_file_handle_error(name,
				     type == OS_FILE_READ
				     ? "aio read" : "aio write");
	if (retry) {

		goto try_again;
	}

	return(FALSE);
}

#ifdef WIN_ASYNC_IO
/**********************************************************************//**
This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait for
the completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!
@return TRUE if the aio operation succeeded */
UNIV_INTERN
ibool
os_aio_windows_handle(
/*==================*/
	ulint	segment,	/*!< in: the number of the segment in the aio
				arrays to wait for; segment 0 is the ibuf
				i/o thread, segment 1 the log i/o thread,
				then follow the non-ibuf read threads, and as
				the last are the non-ibuf write threads; if
				this is ULINT_UNDEFINED, then it means that
				sync aio is used, and this parameter is
				ignored */
	ulint	pos,		/*!< this parameter is used only in sync aio:
				wait for the aio slot at this position */
	fil_node_t**message1,	/*!< out: the messages passed with the aio
				request; note that also in the case where
				the aio operation failed, these output
				parameters are valid and can be used to
				restart the operation, for example */
	void**	message2,
	ulint*	type)		/*!< out: OS_FILE_WRITE or ..._READ */
{
	ulint		orig_seg	= segment;
	os_aio_array_t*	array;
	os_aio_slot_t*	slot;
	ulint		n;
	ulint		i;
	ibool		ret_val;
	BOOL		ret;
	DWORD		len;

	/* From here on 'segment' is the local segment number within
	'array'; orig_seg keeps the value passed by the caller */
	if (segment == ULINT_UNDEFINED) {
		array = os_aio_sync_array;
		segment = 0;
	} else {
		segment = os_aio_get_array_and_local_segment(&array, segment);
	}

	/* NOTE! We only access constant fields in os_aio_array. Therefore
	we do not have to acquire the protecting mutex yet */

	ut_ad(os_aio_validate());
	ut_ad(segment < array->n_segments);

	/* Number of slots in one segment */
	n = array->n_slots / array->n_segments;

	if (array == os_aio_sync_array) {
		/* Sync aio: wait for the one slot the caller named */
		os_event_wait(os_aio_array_get_nth_slot(array, pos)->event);
		i = pos;
	} else {
		/* Wait on the native events of this segment's slots; the
		returned i is used below as an index relative to the
		start of the segment */
		srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
		i = os_event_wait_multiple(n,
					   (array->native_events)
					   + segment * n);
	}

	os_mutex_enter(array->mutex);

	slot = os_aio_array_get_nth_slot(array, i + segment * n);

	ut_a(slot->reserved);

	if (orig_seg != ULINT_UNDEFINED) {
		srv_set_io_thread_op_info(orig_seg,
					  "get windows aio return value");
	}

	ret = GetOverlappedResult(slot->file, &(slot->control), &len, TRUE);

	/* The messages and the type are returned also on failure */
	*message1 = slot->message1;
	*message2 = slot->message2;

	*type = slot->type;

	if (ret && len == slot->len) {
		ret_val = TRUE;

#ifdef UNIV_DO_FLUSH
		if (slot->type == OS_FILE_WRITE
		    && !os_do_not_call_flush_at_each_write) {
			ut_a(TRUE == os_file_flush(slot->file));
		}
#endif /* UNIV_DO_FLUSH */
	} else {
		/* NOTE: os_file_handle_error() is called while the array
		mutex is still held */
		os_file_handle_error(slot->name, "Windows aio");

		ret_val = FALSE;
	}

	os_mutex_exit(array->mutex);

	os_aio_array_free_slot(array, slot);

	return(ret_val);
}
#endif

/**********************************************************************//**
Does simulated aio. This function should be called by an i/o-handler
thread.
+@return TRUE if the aio operation succeeded */ +UNIV_INTERN +ibool +os_aio_simulated_handle( +/*====================*/ + ulint global_segment, /*!< in: the number of the segment in the aio + arrays to wait for; segment 0 is the ibuf + i/o thread, segment 1 the log i/o thread, + then follow the non-ibuf read threads, and as + the last are the non-ibuf write threads */ + fil_node_t**message1, /*!< out: the messages passed with the aio + request; note that also in the case where + the aio operation failed, these output + parameters are valid and can be used to + restart the operation, for example */ + void** message2, + ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */ +{ + os_aio_array_t* array; + ulint segment; + os_aio_slot_t* slot; + os_aio_slot_t* slot2; + os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE]; + ulint n_consecutive; + ulint total_len; + ulint offs; + ulint lowest_offset; + ulint biggest_age; + ulint age; + byte* combined_buf; + byte* combined_buf2; + ibool ret; + ulint n; + ulint i; + + segment = os_aio_get_array_and_local_segment(&array, global_segment); + +restart: + /* NOTE! We only access constant fields in os_aio_array. Therefore + we do not have to acquire the protecting mutex yet */ + + srv_set_io_thread_op_info(global_segment, + "looking for i/o requests (a)"); + ut_ad(os_aio_validate()); + ut_ad(segment < array->n_segments); + + n = array->n_slots / array->n_segments; + + /* Look through n slots after the segment * n'th slot */ + + if (array == os_aio_read_array + && os_aio_recommend_sleep_for_read_threads) { + + /* Give other threads chance to add several i/os to the array + at once. 
*/ + + goto recommended_sleep; + } + + os_mutex_enter(array->mutex); + + srv_set_io_thread_op_info(global_segment, + "looking for i/o requests (b)"); + + /* Check if there is a slot for which the i/o has already been + done */ + + for (i = 0; i < n; i++) { + slot = os_aio_array_get_nth_slot(array, i + segment * n); + + if (slot->reserved && slot->io_already_done) { + + if (os_aio_print_debug) { + fprintf(stderr, + "InnoDB: i/o for slot %lu" + " already done, returning\n", + (ulong) i); + } + + ret = TRUE; + + goto slot_io_done; + } + } + + n_consecutive = 0; + + /* If there are at least 2 seconds old requests, then pick the oldest + one to prevent starvation. If several requests have the same age, + then pick the one at the lowest offset. */ + + biggest_age = 0; + lowest_offset = ULINT_MAX; + + for (i = 0; i < n; i++) { + slot = os_aio_array_get_nth_slot(array, i + segment * n); + + if (slot->reserved) { + age = (ulint)difftime(time(NULL), + slot->reservation_time); + + if ((age >= 2 && age > biggest_age) + || (age >= 2 && age == biggest_age + && slot->offset < lowest_offset)) { + + /* Found an i/o request */ + consecutive_ios[0] = slot; + + n_consecutive = 1; + + biggest_age = age; + lowest_offset = slot->offset; + } + } + } + + if (n_consecutive == 0) { + /* There were no old requests. 
Look for an i/o request at the + lowest offset in the array (we ignore the high 32 bits of the + offset in these heuristics) */ + + lowest_offset = ULINT_MAX; + + for (i = 0; i < n; i++) { + slot = os_aio_array_get_nth_slot(array, + i + segment * n); + + if (slot->reserved && slot->offset < lowest_offset) { + + /* Found an i/o request */ + consecutive_ios[0] = slot; + + n_consecutive = 1; + + lowest_offset = slot->offset; + } + } + } + + if (n_consecutive == 0) { + + /* No i/o requested at the moment */ + + goto wait_for_io; + } + + slot = consecutive_ios[0]; + + /* Check if there are several consecutive blocks to read or write */ + +consecutive_loop: + for (i = 0; i < n; i++) { + slot2 = os_aio_array_get_nth_slot(array, i + segment * n); + + if (slot2->reserved && slot2 != slot + && slot2->offset == slot->offset + slot->len + /* check that sum does not wrap over */ + && slot->offset + slot->len > slot->offset + && slot2->offset_high == slot->offset_high + && slot2->type == slot->type + && slot2->file == slot->file) { + + /* Found a consecutive i/o request */ + + consecutive_ios[n_consecutive] = slot2; + n_consecutive++; + + slot = slot2; + + if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) { + + goto consecutive_loop; + } else { + break; + } + } + } + + srv_set_io_thread_op_info(global_segment, "consecutive i/o requests"); + + /* We have now collected n_consecutive i/o requests in the array; + allocate a single buffer which can hold all data, and perform the + i/o */ + + total_len = 0; + slot = consecutive_ios[0]; + + for (i = 0; i < n_consecutive; i++) { + total_len += consecutive_ios[i]->len; + } + + if (n_consecutive == 1) { + /* We can use the buffer of the i/o request */ + combined_buf = slot->buf; + combined_buf2 = NULL; + } else { + combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE); + + ut_a(combined_buf2); + + combined_buf = ut_align(combined_buf2, UNIV_PAGE_SIZE); + } + + /* We release the array mutex for the time of the i/o: NOTE that + this assumes 
that there is just one i/o-handler thread serving + a single segment of slots! */ + + os_mutex_exit(array->mutex); + + if (slot->type == OS_FILE_WRITE && n_consecutive > 1) { + /* Copy the buffers to the combined buffer */ + offs = 0; + + for (i = 0; i < n_consecutive; i++) { + + ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf, + consecutive_ios[i]->len); + offs += consecutive_ios[i]->len; + } + } + + srv_set_io_thread_op_info(global_segment, "doing file i/o"); + + if (os_aio_print_debug) { + fprintf(stderr, + "InnoDB: doing i/o of type %lu at offset %lu %lu," + " length %lu\n", + (ulong) slot->type, (ulong) slot->offset_high, + (ulong) slot->offset, (ulong) total_len); + } + + /* Do the i/o with ordinary, synchronous i/o functions: */ + if (slot->type == OS_FILE_WRITE) { + ret = os_file_write(slot->name, slot->file, combined_buf, + slot->offset, slot->offset_high, + total_len); + } else { + ret = os_file_read(slot->file, combined_buf, + slot->offset, slot->offset_high, total_len); + } + + ut_a(ret); + srv_set_io_thread_op_info(global_segment, "file i/o done"); + +#if 0 + fprintf(stderr, + "aio: %lu consecutive %lu:th segment, first offs %lu blocks\n", + n_consecutive, global_segment, slot->offset / UNIV_PAGE_SIZE); +#endif + + if (slot->type == OS_FILE_READ && n_consecutive > 1) { + /* Copy the combined buffer to individual buffers */ + offs = 0; + + for (i = 0; i < n_consecutive; i++) { + + ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs, + consecutive_ios[i]->len); + offs += consecutive_ios[i]->len; + } + } + + if (combined_buf2) { + ut_free(combined_buf2); + } + + os_mutex_enter(array->mutex); + + /* Mark the i/os done in slots */ + + for (i = 0; i < n_consecutive; i++) { + consecutive_ios[i]->io_already_done = TRUE; + } + + /* We return the messages for the first slot now, and if there were + several slots, the messages will be returned with subsequent calls + of this function */ + +slot_io_done: + + ut_a(slot->reserved); + + *message1 = 
slot->message1; + *message2 = slot->message2; + + *type = slot->type; + + os_mutex_exit(array->mutex); + + os_aio_array_free_slot(array, slot); + + return(ret); + +wait_for_io: + srv_set_io_thread_op_info(global_segment, "resetting wait event"); + + /* We wait here until there again can be i/os in the segment + of this thread */ + + os_event_reset(os_aio_segment_wait_events[global_segment]); + + os_mutex_exit(array->mutex); + +recommended_sleep: + srv_set_io_thread_op_info(global_segment, "waiting for i/o request"); + + os_event_wait(os_aio_segment_wait_events[global_segment]); + + if (os_aio_print_debug) { + fprintf(stderr, + "InnoDB: i/o handler thread for i/o" + " segment %lu wakes up\n", + (ulong) global_segment); + } + + goto restart; +}

/**********************************************************************//**
Checks the internal bookkeeping of one aio array while holding its mutex:
the array must have at least one slot and one segment, every reserved slot
must have a non-zero length, and the number of reserved slots found by
scanning must equal array->n_reserved. Each check is a ut_a() assertion,
so the only value ever returned is TRUE.
@return TRUE if ok */
static
ibool
os_aio_array_validate(
/*==================*/
	os_aio_array_t*	array)	/*!< in: aio wait array */
{
	ulint	n_in_use = 0;
	ulint	slot_no;

	ut_a(array);

	os_mutex_enter(array->mutex);

	ut_a(array->n_slots > 0);
	ut_a(array->n_segments > 0);

	for (slot_no = 0; slot_no < array->n_slots; slot_no++) {
		os_aio_slot_t*	cur;

		cur = os_aio_array_get_nth_slot(array, slot_no);

		if (!cur->reserved) {
			/* Free slots carry no constraints */
			continue;
		}

		/* A reserved slot must describe a non-empty request */
		ut_a(cur->len > 0);
		n_in_use++;
	}

	/* The cached counter must agree with what the scan found */
	ut_a(array->n_reserved == n_in_use);

	os_mutex_exit(array->mutex);

	return(TRUE);
}

/**********************************************************************//**
Validates the consistency of the aio system.
+@return TRUE if ok */ +UNIV_INTERN +ibool +os_aio_validate(void) +/*=================*/ +{ + os_aio_array_validate(os_aio_read_array); + os_aio_array_validate(os_aio_write_array); + os_aio_array_validate(os_aio_ibuf_array); + os_aio_array_validate(os_aio_log_array); + os_aio_array_validate(os_aio_sync_array); + + return(TRUE); +} + +/**********************************************************************//** +Prints info of the aio arrays. */ +UNIV_INTERN +void +os_aio_print( +/*=========*/ + FILE* file) /*!< in: file where to print */ +{ + os_aio_array_t* array; + os_aio_slot_t* slot; + ulint n_reserved; + time_t current_time; + double time_elapsed; + double avg_bytes_read; + ulint i; + + for (i = 0; i < srv_n_file_io_threads; i++) { + fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i, + srv_io_thread_op_info[i], + srv_io_thread_function[i]); + +#ifndef __WIN__ + if (os_aio_segment_wait_events[i]->is_set) { + fprintf(file, " ev set"); + } +#endif + + fprintf(file, "\n"); + } + + fputs("Pending normal aio reads:", file); + + array = os_aio_read_array; +loop: + ut_a(array); + + os_mutex_enter(array->mutex); + + ut_a(array->n_slots > 0); + ut_a(array->n_segments > 0); + + n_reserved = 0; + + for (i = 0; i < array->n_slots; i++) { + slot = os_aio_array_get_nth_slot(array, i); + + if (slot->reserved) { + n_reserved++; +#if 0 + fprintf(stderr, "Reserved slot, messages %p %p\n", + (void*) slot->message1, + (void*) slot->message2); +#endif + ut_a(slot->len > 0); + } + } + + ut_a(array->n_reserved == n_reserved); + + fprintf(file, " %lu", (ulong) n_reserved); + + os_mutex_exit(array->mutex); + + if (array == os_aio_read_array) { + fputs(", aio writes:", file); + + array = os_aio_write_array; + + goto loop; + } + + if (array == os_aio_write_array) { + fputs(",\n ibuf aio reads:", file); + array = os_aio_ibuf_array; + + goto loop; + } + + if (array == os_aio_ibuf_array) { + fputs(", log i/o's:", file); + array = os_aio_log_array; + + goto loop; + } + + if (array == 
os_aio_log_array) { + fputs(", sync i/o's:", file); + array = os_aio_sync_array; + + goto loop; + } + + putc('\n', file); + current_time = time(NULL); + time_elapsed = 0.001 + difftime(current_time, os_last_printout); + + fprintf(file, + "Pending flushes (fsync) log: %lu; buffer pool: %lu\n" + "%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n", + (ulong) fil_n_pending_log_flushes, + (ulong) fil_n_pending_tablespace_flushes, + (ulong) os_n_file_reads, (ulong) os_n_file_writes, + (ulong) os_n_fsyncs); + + if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) { + fprintf(file, + "%lu pending preads, %lu pending pwrites\n", + (ulong) os_file_n_pending_preads, + (ulong) os_file_n_pending_pwrites); + } + + if (os_n_file_reads == os_n_file_reads_old) { + avg_bytes_read = 0.0; + } else { + avg_bytes_read = (double) os_bytes_read_since_printout + / (os_n_file_reads - os_n_file_reads_old); + } + + fprintf(file, + "%.2f reads/s, %lu avg bytes/read," + " %.2f writes/s, %.2f fsyncs/s\n", + (os_n_file_reads - os_n_file_reads_old) + / time_elapsed, + (ulong)avg_bytes_read, + (os_n_file_writes - os_n_file_writes_old) + / time_elapsed, + (os_n_fsyncs - os_n_fsyncs_old) + / time_elapsed); + + os_n_file_reads_old = os_n_file_reads; + os_n_file_writes_old = os_n_file_writes; + os_n_fsyncs_old = os_n_fsyncs; + os_bytes_read_since_printout = 0; + + os_last_printout = current_time; +} + +/**********************************************************************//** +Refreshes the statistics used to print per-second averages. 
*/ +UNIV_INTERN +void +os_aio_refresh_stats(void) +/*======================*/ +{ + os_n_file_reads_old = os_n_file_reads; + os_n_file_writes_old = os_n_file_writes; + os_n_fsyncs_old = os_n_fsyncs; + os_bytes_read_since_printout = 0; + + os_last_printout = time(NULL); +} + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Checks that all slots in the system have been freed, that is, there are +no pending io operations. +@return TRUE if all free */ +UNIV_INTERN +ibool +os_aio_all_slots_free(void) +/*=======================*/ +{ + os_aio_array_t* array; + ulint n_res = 0; + + array = os_aio_read_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + array = os_aio_write_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + array = os_aio_ibuf_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + array = os_aio_log_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + array = os_aio_sync_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + if (n_res == 0) { + + return(TRUE); + } + + return(FALSE); +} +#endif /* UNIV_DEBUG */ + +#endif /* !UNIV_HOTBACKUP */ |