diff options
Diffstat (limited to 'storage/xtradb/os/os0file.cc')
-rw-r--r-- | storage/xtradb/os/os0file.cc | 843 |
1 files changed, 772 insertions, 71 deletions
diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index 978f3dc7cc4..28f1b156224 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -2,6 +2,7 @@ Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2015, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Percona Inc.. Those modifications are @@ -42,10 +43,17 @@ Created 10/21/1995 Heikki Tuuri #include "srv0srv.h" #include "srv0start.h" #include "fil0fil.h" +#include "fil0pagecompress.h" #include "buf0buf.h" #include "btr0types.h" #include "trx0trx.h" #include "srv0mon.h" +#include "srv0srv.h" +#ifdef HAVE_POSIX_FALLOCATE +#include "unistd.h" +#include "fcntl.h" +#include "linux/falloc.h" +#endif #ifndef UNIV_HOTBACKUP # include "os0sync.h" # include "os0thread.h" @@ -73,6 +81,31 @@ Created 10/21/1995 Heikki Tuuri # endif #endif +#if defined(UNIV_LINUX) && defined(HAVE_SYS_STATVFS_H) +#include <sys/statvfs.h> +#endif + +#if defined(UNIV_LINUX) && defined(HAVE_LINUX_FALLOC_H) +#include <linux/falloc.h> +#endif + +#if defined(HAVE_FALLOCATE) +#ifndef FALLOC_FL_KEEP_SIZE +#define FALLOC_FL_KEEP_SIZE 0x01 +#endif +#ifndef FALLOC_FL_PUNCH_HOLE +#define FALLOC_FL_PUNCH_HOLE 0x02 +#endif +#endif + +#ifdef HAVE_LZO +#include "lzo/lzo1x.h" +#endif + +#ifdef HAVE_SNAPPY +#include "snappy-c.h" +#endif + /** Insert buffer segment id */ static const ulint IO_IBUF_SEGMENT = 0; @@ -196,11 +229,38 @@ struct os_aio_slot_t{ and which can be used to identify which pending aio operation was completed */ + ulint bitmap; + + byte* page_compression_page; /*!< Memory allocated for + page compressed page and + freed after the write + has been completed */ + + ibool page_compression; + ulint page_compression_level; + + ulint* write_size; /*!< Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ + + byte* page_buf; /*!< Actual page buffer for + page compressed pages, do not + free this */ + + ibool page_compress_success; + /*!< TRUE if page compression was + successfull, false if not */ + + ulint file_block_size;/*!< file block size */ + #ifdef LINUX_NATIVE_AIO struct iocb control; /* Linux control block for aio */ int n_bytes; /* bytes written/read. */ int ret; /* AIO return code */ #endif /* WIN_ASYNC_IO */ + byte *lzo_mem; /* Temporal memory used by LZO */ }; /** The asynchronous i/o array structure */ @@ -301,6 +361,66 @@ UNIV_INTERN ulint os_n_pending_writes = 0; /** Number of pending read operations */ UNIV_INTERN ulint os_n_pending_reads = 0; +/** After first fallocate failure we will disable os_file_trim */ +UNIV_INTERN ibool os_fallocate_failed = FALSE; + +/**********************************************************************//** +Directly manipulate the allocated disk space by deallocating for the file referred to +by fd for the byte range starting at offset and continuing for len bytes. +Within the specified range, partial file system blocks are zeroed, and whole +file system blocks are removed from the file. After a successful call, +subsequent reads from this range will return zeroes. +@return true if success, false if error */ +UNIV_INTERN +ibool +os_file_trim( +/*=========*/ + os_aio_slot_t* slot); /*!< in: slot structure */ + +/**********************************************************************//** +Allocate memory for temporal buffer used for page compression. This +buffer is freed later. */ +UNIV_INTERN +void +os_slot_alloc_page_buf( +/*===================*/ + os_aio_slot_t* slot); /*!< in: slot structure */ + +#ifdef HAVE_LZO +/**********************************************************************//** +Allocate memory for temporal memory used for page compression when +LZO compression method is used */ +UNIV_INTERN +void +os_slot_alloc_lzo_mem( +/*===================*/ + os_aio_slot_t* slot); /*!< in: slot structure */ +#endif + +/****************************************************************//** +Does error handling when a file operation fails. +@return TRUE if we should retry the operation */ +ibool +os_file_handle_error_no_exit( +/*=========================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation, /*!< in: operation */ + ibool on_error_silent,/*!< in: if TRUE then don't print + any message to the log. */ + const char* file, /*!< in: file name */ + const ulint line); /*!< in: line */ + +/****************************************************************//** +Tries to enable the atomic write feature, if available, for the specified file +handle. +@return TRUE if success */ +static __attribute__((warn_unused_result)) +ibool +os_file_set_atomic_writes( +/*======================*/ + const char* name, /*!< in: name of the file */ + os_file_t file); /*!< in: handle to the file */ + #ifdef UNIV_DEBUG # ifndef UNIV_HOTBACKUP /**********************************************************************//** @@ -537,6 +657,19 @@ os_file_get_last_error_low( "InnoDB: because of either a thread exit" " or an application request.\n" "InnoDB: Retry attempt is made.\n"); + } else if (err == ECANCELED || err == ENOTTY) { + if (strerror(err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %d" + " means '%s'.\n", + err, strerror(err)); + } + + if(srv_use_atomic_writes) { + fprintf(stderr, + "InnoDB: Error trying to enable atomic writes on " + "non-supported destination!\n"); + } } else { fprintf(stderr, "InnoDB: Some operating system error numbers" @@ -601,6 +734,20 @@ os_file_get_last_error_low( "InnoDB: The error means mysqld does not have" " the access rights to\n" "InnoDB: the directory.\n"); + } else if (err == ECANCELED || err == ENOTTY) { + if (strerror(err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %d" + " means '%s'.\n", + err, strerror(err)); + } + + + if(srv_use_atomic_writes) { + fprintf(stderr, + "InnoDB: Error trying to enable atomic writes on " + "non-supported destination!\n"); + } } else { if (strerror(err) != NULL) { fprintf(stderr, @@ -639,6 +786,9 @@ os_file_get_last_error_low( return(OS_FILE_AIO_RESOURCES_RESERVED); } break; + case ECANCELED: + case ENOTTY: + return(OS_FILE_OPERATION_NOT_SUPPORTED); case EINTR: if (srv_use_native_aio) { return(OS_FILE_AIO_INTERRUPTED); @@ -672,7 +822,6 @@ Does error handling when a file operation fails. Conditionally exits (calling exit(3)) based on should_exit value and the error type, if should_exit is TRUE then on_error_silent is ignored. @return TRUE if we should retry the operation */ -static ibool os_file_handle_error_cond_exit( /*===========================*/ @@ -680,9 +829,11 @@ os_file_handle_error_cond_exit( const char* operation, /*!< in: operation */ ibool should_exit, /*!< in: call exit(3) if unknown error and this parameter is TRUE */ - ibool on_error_silent)/*!< in: if TRUE then don't print + ibool on_error_silent,/*!< in: if TRUE then don't print any message to the log iff it is an unknown non-fatal error */ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { ulint err; @@ -714,6 +865,9 @@ os_file_handle_error_cond_exit( os_has_said_disk_full = TRUE; + fprintf(stderr, + " InnoDB: at file %s and at line %ld\n", file, line); + fflush(stderr); ut_error; @@ -747,6 +901,9 @@ os_file_handle_error_cond_exit( is better to ignore on_error_silent and print an error message to the log. */ + fprintf(stderr, + " InnoDB: at file %s and at line %ld\n", file, line); + if (should_exit || !on_error_silent) { ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS " "error " ULINTPF ".%s", name ? name : "(unknown)", @@ -770,10 +927,12 @@ ibool os_file_handle_error( /*=================*/ const char* name, /*!< in: name of a file or NULL */ - const char* operation) /*!< in: operation */ + const char* operation, /*!< in: operation */ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { /* exit in case of unknown error */ - return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE)); + return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE, file, line)); } /****************************************************************//** @@ -784,12 +943,14 @@ os_file_handle_error_no_exit( /*=========================*/ const char* name, /*!< in: name of a file or NULL */ const char* operation, /*!< in: operation */ - ibool on_error_silent)/*!< in: if TRUE then don't print + ibool on_error_silent,/*!< in: if TRUE then don't print any message to the log. */ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { /* don't exit in case of unknown error */ return(os_file_handle_error_cond_exit( - name, operation, FALSE, on_error_silent)); + name, operation, FALSE, on_error_silent, file, line)); } #undef USE_FILE_LOCK @@ -932,7 +1093,7 @@ os_file_opendir( if (dir == INVALID_HANDLE_VALUE) { if (error_is_fatal) { - os_file_handle_error(dirname, "opendir"); + os_file_handle_error(dirname, "opendir", __FILE__, __LINE__); } return(NULL); @@ -943,7 +1104,7 @@ os_file_opendir( dir = opendir(dirname); if (dir == NULL && error_is_fatal) { - os_file_handle_error(dirname, "opendir"); + os_file_handle_error(dirname, "opendir", __FILE__, __LINE__); } return(dir); @@ -965,7 +1126,7 @@ os_file_closedir( ret = FindClose(dir); if (!ret) { - os_file_handle_error_no_exit(NULL, "closedir", FALSE); + os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__); return(-1); } @@ -977,7 +1138,7 @@ os_file_closedir( ret = closedir(dir); if (ret) { - os_file_handle_error_no_exit(NULL, "closedir", FALSE); + os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__); } return(ret); @@ -1049,7 +1210,7 @@ next_file: return(1); } else { - os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE); + os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE, __FILE__, __LINE__); return(-1); } #else @@ -1135,7 +1296,7 @@ next_file: goto next_file; } - os_file_handle_error_no_exit(full_path, "stat", FALSE); + os_file_handle_error_no_exit(full_path, "stat", FALSE, __FILE__, __LINE__); ut_free(full_path); @@ -1186,7 +1347,7 @@ os_file_create_directory( && !fail_if_exists))) { os_file_handle_error_no_exit( - pathname, "CreateDirectory", FALSE); + pathname, "CreateDirectory", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -1199,7 +1360,7 @@ os_file_create_directory( if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) { /* failure */ - os_file_handle_error_no_exit(pathname, "mkdir", FALSE); + os_file_handle_error_no_exit(pathname, "mkdir", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -1309,7 +1470,7 @@ os_file_create_simple_func( retry = os_file_handle_error( name, create_mode == OS_FILE_OPEN ? - "open" : "create"); + "open" : "create", __FILE__, __LINE__); } else { *success = TRUE; @@ -1377,7 +1538,7 @@ os_file_create_simple_func( retry = os_file_handle_error( name, create_mode == OS_FILE_OPEN - ? "open" : "create"); + ? "open" : "create", __FILE__, __LINE__); } else { *success = TRUE; retry = false; @@ -1419,9 +1580,12 @@ os_file_create_simple_no_error_handling_func( OS_FILE_READ_WRITE, or OS_FILE_READ_ALLOW_DELETE; the last option is used by a backup program reading the file */ - ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes) /*! in: atomic writes table option + value */ { os_file_t file; + atomic_writes_t awrites = (atomic_writes_t) atomic_writes; *success = FALSE; #ifdef __WIN__ @@ -1482,6 +1646,23 @@ os_file_create_simple_no_error_handling_func( attributes, NULL); // No template file + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != INVALID_HANDLE_VALUE + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + CloseHandle(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = INVALID_HANDLE_VALUE; + } + } + *success = (file != INVALID_HANDLE_VALUE); #else /* __WIN__ */ int create_flag; @@ -1542,6 +1723,23 @@ os_file_create_simple_no_error_handling_func( } #endif /* USE_FILE_LOCK */ + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != -1 + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + close(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = -1; + } + } + #endif /* __WIN__ */ return(file); @@ -1625,15 +1823,16 @@ os_file_set_atomic_writes( if (ioctl(file, DFS_IOCTL_ATOMIC_WRITE_SET, &atomic_option)) { - os_file_handle_error_no_exit(name, "ioctl", FALSE); + fprintf(stderr, "InnoDB: Warning:Trying to enable atomic writes on " + "file %s on non-supported platform!\n", name); + os_file_handle_error_no_exit(name, "ioctl(DFS_IOCTL_ATOMIC_WRITE_SET)", FALSE, __FILE__, __LINE__); return(FALSE); } return(TRUE); #else - ib_logf(IB_LOG_LEVEL_ERROR, - "trying to enable atomic writes on non-supported platform! " - "Please restart with innodb_use_atomic_writes disabled.\n"); + fprintf(stderr, "InnoDB: Error: trying to enable atomic writes on " + "file %s on non-supported platform!\n", name); return(FALSE); #endif } @@ -1659,12 +1858,15 @@ os_file_create_func( async i/o or unbuffered i/o: look in the function source code for the exact rules */ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ - ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes) /*! in: atomic writes table option + value */ { os_file_t file; ibool retry; ibool on_error_no_exit; ibool on_error_silent; + atomic_writes_t awrites = (atomic_writes_t) atomic_writes; #ifdef __WIN__ DBUG_EXECUTE_IF( @@ -1807,9 +2009,9 @@ os_file_create_func( if (on_error_no_exit) { retry = os_file_handle_error_no_exit( - name, operation, on_error_silent); + name, operation, on_error_silent, __FILE__, __LINE__); } else { - retry = os_file_handle_error(name, operation); + retry = os_file_handle_error(name, operation, __FILE__, __LINE__); } } else { *success = TRUE; @@ -1821,11 +2023,21 @@ os_file_create_func( } while (retry); - if (srv_use_atomic_writes && type == OS_DATA_FILE && - !os_file_set_atomic_writes(name, file)) { - CloseHandle(file); + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != INVALID_HANDLE_VALUE && type == OS_DATA_FILE + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + CloseHandle(file); + os_file_delete_if_exists_func(name); *success = FALSE; file = INVALID_HANDLE_VALUE; + } } #else /* __WIN__ */ @@ -1902,9 +2114,9 @@ os_file_create_func( if (on_error_no_exit) { retry = os_file_handle_error_no_exit( - name, operation, on_error_silent); + name, operation, on_error_silent, __FILE__, __LINE__); } else { - retry = os_file_handle_error(name, operation); + retry = os_file_handle_error(name, operation, __FILE__, __LINE__); } } else { *success = TRUE; @@ -1958,14 +2170,24 @@ os_file_create_func( } #endif /* USE_FILE_LOCK */ - if (srv_use_atomic_writes && type == OS_DATA_FILE - && file != -1 && !os_file_set_atomic_writes(name, file)) { - - *success = FALSE; - close(file); - file = -1; + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != -1 && type == OS_DATA_FILE + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + close(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = -1; + } } + #endif /* __WIN__ */ return(file); @@ -2024,7 +2246,7 @@ loop: ret = unlink(name); if (ret != 0 && errno != ENOENT) { - os_file_handle_error_no_exit(name, "delete", FALSE); + os_file_handle_error_no_exit(name, "delete", FALSE, __FILE__, __LINE__); return(false); } @@ -2088,7 +2310,7 @@ loop: ret = unlink(name); if (ret != 0) { - os_file_handle_error_no_exit(name, "delete", FALSE); + os_file_handle_error_no_exit(name, "delete", FALSE, __FILE__, __LINE__); return(false); } @@ -2132,7 +2354,7 @@ os_file_rename_func( return(TRUE); } - os_file_handle_error_no_exit(oldpath, "rename", FALSE); + os_file_handle_error_no_exit(oldpath, "rename", FALSE, __FILE__, __LINE__); return(FALSE); #else @@ -2141,7 +2363,7 @@ os_file_rename_func( ret = rename(oldpath, newpath); if (ret != 0) { - os_file_handle_error_no_exit(oldpath, "rename", FALSE); + os_file_handle_error_no_exit(oldpath, "rename", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -2170,7 +2392,7 @@ os_file_close_func( return(TRUE); } - os_file_handle_error(NULL, "close"); + os_file_handle_error(NULL, "close", __FILE__, __LINE__); return(FALSE); #else @@ -2179,7 +2401,7 @@ os_file_close_func( ret = close(file); if (ret == -1) { - os_file_handle_error(NULL, "close"); + os_file_handle_error(NULL, "close", __FILE__, __LINE__); return(FALSE); } @@ -2269,6 +2491,12 @@ os_file_set_size( current_size = 0; +#ifdef UNIV_DEBUG + fprintf(stderr, "InnoDB: Note: File %s current_size %lu extended_size %lu\n", + name, os_file_get_size(file), size); +#endif + + #ifdef HAVE_POSIX_FALLOCATE if (srv_use_posix_fallocate) { @@ -2279,7 +2507,7 @@ os_file_set_size( INT64PF ", desired size " INT64PF "\n", name, current_size, size); os_file_handle_error_no_exit (name, "posix_fallocate", - FALSE); + FALSE, __FILE__, __LINE__); return(FALSE); } return(TRUE); @@ -2312,6 +2540,7 @@ os_file_set_size( } ret = os_file_write(name, file, buf, current_size, n_bytes); + if (!ret) { ut_free(buf2); goto error_handling; @@ -2466,7 +2695,7 @@ os_file_flush_func( return(TRUE); } - os_file_handle_error(NULL, "flush"); + os_file_handle_error(NULL, "flush", __FILE__, __LINE__); /* It is a fatal error if a file flush does not succeed, because then the database can get corrupt on disk */ @@ -2520,7 +2749,7 @@ os_file_flush_func( ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed"); - os_file_handle_error(NULL, "flush"); + os_file_handle_error(NULL, "flush", __FILE__, __LINE__); /* It is a fatal error if a file flush does not succeed, because then the database can get corrupt on disk */ @@ -2834,7 +3063,9 @@ os_file_read_func( void* buf, /*!< in: buffer where to read */ os_offset_t offset, /*!< in: file offset where to read */ ulint n, /*!< in: number of bytes to read */ - trx_t* trx) + trx_t* trx, + ibool compressed) /*!< in: is this file space + compressed ? */ { #ifdef __WIN__ BOOL ret; @@ -2876,6 +3107,14 @@ try_again: os_mutex_exit(os_file_count_mutex); if (ret && len == n) { + /* Note that InnoDB writes files that are not formated + as file spaces and they do not have FIL_PAGE_TYPE + field, thus we must use here information is the actual + file space compressed. */ + if (fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, len, NULL); + } + return(TRUE); } #else /* __WIN__ */ @@ -2888,6 +3127,14 @@ try_again: ret = os_file_pread(file, buf, n, offset, trx); if ((ulint) ret == n) { + /* Note that InnoDB writes files that are not formated + as file spaces and they do not have FIL_PAGE_TYPE + field, thus we must use here information is the actual + file space compressed. */ + if (fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, n, NULL); + } + return(TRUE); } else if (ret == -1) { ib_logf(IB_LOG_LEVEL_ERROR, @@ -2901,7 +3148,7 @@ try_again: n, offset, (lint) ret); } #endif /* __WIN__ */ - retry = os_file_handle_error(NULL, "read"); + retry = os_file_handle_error(NULL, "read", __FILE__, __LINE__); if (retry) { goto try_again; @@ -2936,7 +3183,9 @@ os_file_read_no_error_handling_func( os_file_t file, /*!< in: handle to a file */ void* buf, /*!< in: buffer where to read */ os_offset_t offset, /*!< in: file offset where to read */ - ulint n) /*!< in: number of bytes to read */ + ulint n, /*!< in: number of bytes to read */ + ibool compressed) /*!< in: is this file space + compressed ? */ { #ifdef __WIN__ BOOL ret; @@ -2980,6 +3229,15 @@ try_again: os_mutex_exit(os_file_count_mutex); if (ret && len == n) { + + /* Note that InnoDB writes files that are not formated + as file spaces and they do not have FIL_PAGE_TYPE + field, thus we must use here information is the actual + file space compressed. */ + if (fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, n, NULL); + } + return(TRUE); } #else /* __WIN__ */ @@ -2992,6 +3250,14 @@ try_again: ret = os_file_pread(file, buf, n, offset, NULL); if ((ulint) ret == n) { + /* Note that InnoDB writes files that are not formated + as file spaces and they do not have FIL_PAGE_TYPE + field, thus we must use here information is the actual + file space compressed. */ + if (fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, n, NULL); + } + return(TRUE); } else if (ret == -1) { ib_logf(IB_LOG_LEVEL_ERROR, @@ -3005,7 +3271,7 @@ try_again: n, offset, (lint) ret); } #endif /* __WIN__ */ - retry = os_file_handle_error_no_exit(NULL, "read", FALSE); + retry = os_file_handle_error_no_exit(NULL, "read", FALSE, __FILE__, __LINE__); if (retry) { goto try_again; @@ -3071,6 +3337,7 @@ os_file_write_func( ut_ad(buf); ut_ad(n > 0); + retry: os_mutex_enter(os_file_count_mutex); @@ -3244,7 +3511,7 @@ os_file_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -3272,7 +3539,7 @@ os_file_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -3321,7 +3588,7 @@ os_file_get_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(DB_FAIL); @@ -3374,7 +3641,7 @@ os_file_get_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(DB_FAIL); @@ -3938,7 +4205,7 @@ os_aio_array_create( array->slots = static_cast<os_aio_slot_t*>( ut_malloc(n * sizeof(*array->slots))); - memset(array->slots, 0x0, sizeof(n * sizeof(*array->slots))); + memset(array->slots, 0x0, n * sizeof(*array->slots)); #if defined(LINUX_NATIVE_AIO) array->aio_ctx = NULL; @@ -4013,6 +4280,8 @@ os_aio_array_free( /*==============*/ os_aio_array_t*& array) /*!< in, own: array to free */ { + ulint i; + os_mutex_free(array->mutex); os_event_free(array->not_full); os_event_free(array->is_empty); @@ -4024,6 +4293,19 @@ os_aio_array_free( } #endif /* LINUX_NATIVE_AIO */ + for (i = 0; i < array->n_slots; i++) { + os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i); + if (slot->page_compression_page) { + ut_free(slot->page_compression_page); + slot->page_compression_page = NULL; + } + + if (slot->lzo_mem) { + ut_free(slot->lzo_mem); + slot->lzo_mem = NULL; + } + } + ut_free(array->slots); ut_free(array); @@ -4368,7 +4650,16 @@ os_aio_array_reserve_slot( to write */ os_offset_t offset, /*!< in: file offset */ ulint len, /*!< in: length of the block to read or write */ - ulint space_id) + ulint space_id, + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level, /*!< page compression + level to be used */ + ulint* write_size)/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { os_aio_slot_t* slot = NULL; #ifdef WIN_ASYNC_IO @@ -4460,6 +4751,59 @@ found: slot->io_already_done = FALSE; slot->space_id = space_id; + slot->page_compress_success = FALSE; + slot->write_size = write_size; + slot->page_compression_level = page_compression_level; + slot->page_compression = page_compression; + + if (message1) { + slot->file_block_size = fil_node_get_block_size(message1); + } + + /* If the space is page compressed and this is write operation + then we compress the page */ + if (message1 && type == OS_FILE_WRITE && page_compression ) { + ulint real_len = len; + byte* tmp = NULL; + + /* Release the array mutex while compressing */ + os_mutex_exit(array->mutex); + + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + os_slot_alloc_page_buf(slot); + +#ifdef HAVE_LZO + if (innodb_compression_algorithm == 3) { + os_slot_alloc_lzo_mem(slot); + } +#endif + + /* Call page compression */ + tmp = fil_compress_page(fil_node_get_space_id(slot->message1), + (byte *)buf, + slot->page_buf, + len, + page_compression_level, + fil_node_get_block_size(slot->message1), + &real_len, + slot->lzo_mem + ); + + /* If compression succeeded, set up the length and buffer */ + if (tmp != buf) { + len = real_len; + buf = slot->page_buf; + slot->len = real_len; + slot->page_compress_success = TRUE; + } else { + slot->page_compress_success = FALSE; + } + + /* Take array mutex back */ + os_mutex_enter(array->mutex); + } + #ifdef WIN_ASYNC_IO control = &slot->control; control->Offset = (DWORD) offset & 0xFFFFFFFF; @@ -4735,7 +5079,16 @@ os_aio_func( aio operation); ignored if mode is OS_AIO_SYNC */ ulint space_id, - trx_t* trx) + trx_t* trx, + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level, /*!< page compression + level to be used */ + ulint* write_size)/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { os_aio_array_t* array; os_aio_slot_t* slot; @@ -4766,7 +5119,8 @@ os_aio_func( no need to use an i/o-handler thread */ if (type == OS_FILE_READ) { - ret = os_file_read_func(file, buf, offset, n, trx); + ret = os_file_read_func(file, buf, offset, n, trx, page_compression); + } else { ut_ad(!srv_read_only_mode); ut_a(type == OS_FILE_WRITE); @@ -4777,14 +5131,10 @@ os_aio_func( os_has_said_disk_full = FALSE; ret = 0; errno = 28;); if (!ret) { - os_file_handle_error_cond_exit(name, "os_file_write_func", TRUE, FALSE); + os_file_handle_error_cond_exit(name, "os_file_write_func", TRUE, FALSE, __FILE__, __LINE__); } } - if (!ret) { - fprintf(stderr, "FAIL"); - } - return ret; } @@ -4835,8 +5185,11 @@ try_again: trx->io_reads++; trx->io_read += n; } + slot = os_aio_array_reserve_slot(type, array, message1, message2, file, - name, buf, offset, n, space_id); + name, buf, offset, n, space_id, + page_compression, page_compression_level, write_size); + if (type == OS_FILE_READ) { if (srv_use_native_aio) { os_n_file_reads++; @@ -4894,7 +5247,7 @@ err_exit: os_aio_array_free_slot(array, slot); if (os_file_handle_error( - name,type == OS_FILE_READ ? "aio read" : "aio write")) { + name,type == OS_FILE_READ ? "aio read" : "aio write", __FILE__, __LINE__)) { goto try_again; } @@ -4994,7 +5347,7 @@ os_aio_windows_handle( if (ret && len == slot->len) { ret_val = TRUE; - } else if (os_file_handle_error(slot->name, "Windows aio")) { + } else if (os_file_handle_error(slot->name, "Windows aio", __FILE__, __LINE__)) { retry = TRUE; } else { @@ -5022,12 +5375,18 @@ os_aio_windows_handle( switch (slot->type) { case OS_FILE_WRITE: - ret_val = os_file_write(slot->name, slot->file, slot->buf, - slot->control.Offset, slot->control.OffsetHigh, slot->len); + if (slot->message1 && slot->page_compression && slot->page_compress_success && slot->page_buf) { + ret_val = os_file_write(slot->name, slot->file, slot->page_buf, + slot->offset, slot->len); + } else { + + ret_val = os_file_write(slot->name, slot->file, slot->buf, + slot->offset, slot->len); + } break; case OS_FILE_READ: - ret_val = os_file_read(slot->file, slot->buf, - slot->control.Offset, slot->control.OffsetHigh, slot->len); + ret_val = os_file_read(slot->file, slot->buf, + slot->offset, slot->len, slot->page_compression); break; default: ut_error; @@ -5052,6 +5411,27 @@ os_aio_windows_handle( ret_val = ret && len == slot->len; } + if (slot->type == OS_FILE_READ) { + if (fil_page_is_compressed(slot->buf)) { + os_slot_alloc_page_buf(slot); + +#ifdef HAVE_LZO + if (fil_page_is_lzo_compressed(slot->buf)) { + os_slot_alloc_lzo_mem(slot); + } +#endif + fil_decompress_page(slot->page_buf, slot->buf, slot->len, slot->write_size); + } + } else { + /* OS_FILE_WRITE */ + if (slot->page_compress_success && fil_page_is_compressed(slot->page_buf)) { + if (srv_use_trim && os_fallocate_failed == FALSE) { + // Deallocate unused blocks from file system + os_file_trim(slot); + } + } + } + os_aio_array_free_slot((os_aio_array_t *)slot->arr, slot); return(ret_val); @@ -5141,6 +5521,33 @@ retry: /* We have not overstepped to next segment. */ ut_a(slot->pos < end_pos); + if (slot->type == OS_FILE_READ) { + /* If the table is page compressed and this is read, + we decompress before we annouce the read is + complete. For writes, we free the compressed page. */ + if (fil_page_is_compressed(slot->buf)) { + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + os_slot_alloc_page_buf(slot); +#ifdef HAVE_LZO + if (fil_page_is_lzo_compressed(slot->buf)) { + os_slot_alloc_lzo_mem(slot); + } +#endif + fil_decompress_page(slot->page_buf, slot->buf, slot->len, slot->write_size); + } + } else { + /* OS_FILE_WRITE */ + if (slot->page_compress_success && + fil_page_is_compressed(slot->page_buf)) { + ut_ad(slot->page_compression_page); + if (srv_use_trim && os_fallocate_failed == FALSE) { + // Deallocate unused blocks from file system + os_file_trim(slot); + } + } + } + /* Mark this request as completed. The error handling will be done in the calling function. */ os_mutex_enter(array->mutex); @@ -5322,6 +5729,13 @@ found: } else { errno = -slot->ret; + if (slot->ret == 0) { + fprintf(stderr, + "InnoDB: Number of bytes after aio %d requested %lu\n" + "InnoDB: from file %s\n", + slot->n_bytes, slot->len, slot->name); + } + /* os_file_handle_error does tell us if we should retry this IO. As it stands now, we don't do this retry when reaping requests from a different context than @@ -5329,7 +5743,7 @@ found: windows and linux native AIO. We should probably look into this to transparently re-submit the IO. */ - os_file_handle_error(slot->name, "Linux aio"); + os_file_handle_error(slot->name, "Linux aio", __FILE__, __LINE__); ret = FALSE; } @@ -5612,13 +6026,14 @@ consecutive_loop: os_has_said_disk_full = FALSE; ret = 0; errno = 28;); if (!ret) { - os_file_handle_error_cond_exit(aio_slot->name, "os_file_write_func", TRUE, FALSE); + os_file_handle_error_cond_exit(aio_slot->name, "os_file_write_func", TRUE, FALSE, __FILE__, __LINE__); } } else { ret = os_file_read( aio_slot->file, combined_buf, - aio_slot->offset, total_len); + aio_slot->offset, total_len, + aio_slot->page_compression); } srv_set_io_thread_op_info(global_segment, "file i/o done"); @@ -6010,3 +6425,289 @@ os_aio_all_slots_free(void) #endif /* UNIV_DEBUG */ #endif /* !UNIV_HOTBACKUP */ + +#ifdef _WIN32 +#include <winioctl.h> +#ifndef FSCTL_FILE_LEVEL_TRIM +#define FSCTL_FILE_LEVEL_TRIM CTL_CODE(FILE_DEVICE_FILE_SYSTEM, 130, METHOD_BUFFERED, FILE_WRITE_DATA) +typedef struct _FILE_LEVEL_TRIM_RANGE { + DWORDLONG Offset; + DWORDLONG Length; +} FILE_LEVEL_TRIM_RANGE, *PFILE_LEVEL_TRIM_RANGE; + +typedef struct _FILE_LEVEL_TRIM { + DWORD Key; + DWORD NumRanges; + FILE_LEVEL_TRIM_RANGE Ranges[1]; +} FILE_LEVEL_TRIM, *PFILE_LEVEL_TRIM; +#endif +#endif + +/**********************************************************************//** +Directly manipulate the allocated disk space by deallocating for the file referred to +by fd for the byte range starting at offset and continuing for len bytes. +Within the specified range, partial file system blocks are zeroed, and whole +file system blocks are removed from the file. After a successful call, +subsequent reads from this range will return zeroes. +@return true if success, false if error */ +UNIV_INTERN +ibool +os_file_trim( +/*=========*/ + os_aio_slot_t* slot) /*!< in: slot structure */ +{ + size_t len = slot->len; + size_t trim_len = UNIV_PAGE_SIZE - len; + os_offset_t off = slot->offset + len; + size_t bsize = slot->file_block_size; + + // len here should be alligned to sector size + ut_ad((trim_len % bsize) == 0); + ut_ad((len % bsize) == 0); + ut_ad(bsize != 0); + ut_ad((off % bsize) == 0); + +#ifdef UNIV_DEBUG + fprintf(stderr, "Note: TRIM: write_size %lu trim_len %lu len %lu off %lu block_size %lu\n", + *slot->write_size, trim_len, len, off, bsize); +#endif + + // Nothing to do if trim length is zero or if actual write + // size is initialized and it is smaller than current write size. + // In first write if we trim we set write_size to actual bytes + // written and rest of the page is trimmed. In following writes + // there is no need to trim again if write_size only increases + // because rest of the page is already trimmed. If actual write + // size decreases we need to trim again. + if (trim_len == 0 || + (slot->write_size && + *slot->write_size > 0 && + len >= *slot->write_size)) { + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, "Note: TRIM: write_size %lu trim_len %lu len %lu\n", + *slot->write_size, trim_len, len); +#endif + + if (*slot->write_size > 0 && len >= *slot->write_size) { + srv_stats.page_compressed_trim_op_saved.inc(); + } + + *slot->write_size = len; + + return (TRUE); + } + +#ifdef __linux__ +#if defined(HAVE_FALLOCATE) + int ret = fallocate(slot->file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, trim_len); + + if (ret) { + /* After first failure do not try to trim again */ + os_fallocate_failed = TRUE; + srv_use_trim = FALSE; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate call failed with error code %d.\n" + " InnoDB: start: %lu len: %lu payload: %lu\n" + " InnoDB: Disabling fallocate for now.\n", ret, off, trim_len, len); + + os_file_handle_error_no_exit(slot->name, + " fallocate(FALLOC_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE) ", + FALSE, __FILE__, __LINE__); + + if (slot->write_size) { + *slot->write_size = 0; + } + + return (FALSE); + } else { + if (slot->write_size) { + *slot->write_size = len; + } + } +#else + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate not supported on this installation." + " InnoDB: Disabling fallocate for now."); + os_fallocate_failed = TRUE; + srv_use_trim = FALSE; + if (slot->write_size) { + *slot->write_size = 0; + } + +#endif /* HAVE_FALLOCATE ... */ + +#elif defined(_WIN32) + FILE_LEVEL_TRIM flt; + flt.Key = 0; + flt.NumRanges = 1; + flt.Ranges[0].Offset = off; + flt.Ranges[0].Length = trim_len; + + BOOL ret = DeviceIoControl(slot->file, FSCTL_FILE_LEVEL_TRIM, + &flt, sizeof(flt), NULL, NULL, NULL, NULL); + + if (!ret) { + /* After first failure do not try to trim again */ + os_fallocate_failed = TRUE; + srv_use_trim = FALSE; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate call failed with error.\n" + " InnoDB: start: %lx len: %du payload: %lu\n" + " InnoDB: Disabling fallocate for now.\n", (slot->offset+len), trim_len, len); + + os_file_handle_error_no_exit(slot->name, + " DeviceIOControl(FSCTL_FILE_LEVEL_TRIM) ", + FALSE, __FILE__, __LINE__); + + if (slot->write_size) { + *slot->write_size = 0; + } + return (FALSE); + } else { + if (slot->write_size) { + *slot->write_size = len; + } + } +#endif + + switch(bsize) { + case 512: + srv_stats.page_compression_trim_sect512.add((trim_len / bsize)); + break; + case 1024: + srv_stats.page_compression_trim_sect1024.add((trim_len / bsize)); + break; + case 2948: + srv_stats.page_compression_trim_sect2048.add((trim_len / bsize)); + break; + case 4096: + srv_stats.page_compression_trim_sect4096.add((trim_len / bsize)); + break; + case 8192: + srv_stats.page_compression_trim_sect8192.add((trim_len / bsize)); + break; + case 16384: + srv_stats.page_compression_trim_sect16384.add((trim_len / bsize)); + break; + case 32768: + srv_stats.page_compression_trim_sect32768.add((trim_len / bsize)); + break; + default: + break; + } + + srv_stats.page_compressed_trim_op.inc(); + + return (TRUE); + +} + +/**********************************************************************//** +Allocate memory for temporal buffer used for page compression. This +buffer is freed later. */ +UNIV_INTERN +void +os_slot_alloc_page_buf( +/*===================*/ + os_aio_slot_t* slot) /*!< in: slot structure */ +{ + byte* cbuf2; + byte* cbuf; + ulint asize = UNIV_PAGE_SIZE; + + ut_a(slot != NULL); + if (slot->page_compression_page == NULL) { + /* We allocate extra to avoid memory overwrite on compression */ +#ifdef HAVE_SNAPPY + asize += snappy_max_compressed_length(asize) - UNIV_PAGE_SIZE; +#endif + cbuf2 = static_cast<byte *>(ut_malloc(asize*2)); + cbuf = static_cast<byte *>(ut_align(cbuf2, UNIV_PAGE_SIZE)); + slot->page_compression_page = static_cast<byte *>(cbuf2); + slot->page_buf = static_cast<byte *>(cbuf); + memset(slot->page_compression_page, 0, asize*2); + ut_a(slot->page_buf != NULL); + } +} + +#ifdef HAVE_LZO +/**********************************************************************//** +Allocate memory for temporal memory used for page compression when +LZO compression method is used */ +UNIV_INTERN +void +os_slot_alloc_lzo_mem( +/*===================*/ + os_aio_slot_t* slot) /*!< in: slot structure */ +{ + ut_a(slot != NULL); + if(slot->lzo_mem == NULL) { + slot->lzo_mem = static_cast<byte *>(ut_malloc(LZO1X_1_15_MEM_COMPRESS)); + memset(slot->lzo_mem, 0, LZO1X_1_15_MEM_COMPRESS); + ut_a(slot->lzo_mem != NULL); + } +} +#endif + +/***********************************************************************//** +Try to get number of bytes per sector from file system. +@return file block size */ +UNIV_INTERN +ulint +os_file_get_block_size( +/*===================*/ + os_file_t file, /*!< in: handle to a file */ + const char* name) /*!< in: file name */ +{ + ulint fblock_size = 512; + +#if defined(UNIV_LINUX) && defined(HAVE_SYS_STATVFS_H) + struct statvfs fstat; + int err; + + err = fstatvfs(file, &fstat); + + if (err != 0) { + fprintf(stderr, "InnoDB: Warning: fstatvfs() failed on file %s\n", name); + os_file_handle_error_no_exit(name, "fstatvfs()", FALSE, __FILE__, __LINE__); + } else { + fblock_size = fstat.f_bsize; + } +#endif /* UNIV_LINUX */ +#ifdef __WIN__ + { + DWORD SectorsPerCluster = 0; + DWORD BytesPerSector = 0; + DWORD NumberOfFreeClusters = 0; + DWORD TotalNumberOfClusters = 0; + + if (GetFreeSpace((LPCTSTR)name, &SectorsPerCluster, &BytesPerSector, &NumberOfFreeClusters, &TotalNumberOfClusters)) { + fblock_size = BytesPerSector; + } else { + fprintf(stderr, "InnoDB: Warning: GetFreeSpace() failed on file %s\n", name); + os_file_handle_error_no_exit(name, "GetFreeSpace()", FALSE, __FILE__, __LINE__); + } + } +#endif /* __WIN__*/ + + if (fblock_size > UNIV_PAGE_SIZE/2 || fblock_size < 512) { + fprintf(stderr, "InnoDB: Note: File system for file %s has " + "file block size %lu not supported for page_size %lu\n", + name, fblock_size, UNIV_PAGE_SIZE); + + if (fblock_size < 512) { + fblock_size = 512; + } else { + fblock_size = UNIV_PAGE_SIZE/2; + } + + fprintf(stderr, "InnoDB: Note: Using file block size %ld for file %s\n", + fblock_size, name); + } + + return fblock_size; +} |