diff options
Diffstat (limited to 'innobase/os/os0file.c')
-rw-r--r-- | innobase/os/os0file.c | 383 |
1 files changed, 330 insertions, 53 deletions
diff --git a/innobase/os/os0file.c b/innobase/os/os0file.c index e401503c4e3..e31fd1d9efe 100644 --- a/innobase/os/os0file.c +++ b/innobase/os/os0file.c @@ -60,6 +60,7 @@ struct os_aio_slot_struct{ ulint pos; /* index of the slot in the aio array */ ibool reserved; /* TRUE if this slot is reserved */ + time_t reservation_time;/* time when reserved */ ulint len; /* length of the block to read or write */ byte* buf; /* buffer used in i/o */ @@ -147,6 +148,12 @@ time_t os_last_printout; ibool os_has_said_disk_full = FALSE; +/* The mutex protecting the following counts of pending pread and pwrite +operations */ +os_mutex_t os_file_count_mutex; +ulint os_file_n_pending_preads = 0; +ulint os_file_n_pending_pwrites = 0; + /*************************************************************************** Gets the operating system version. Currently works only on Windows. */ @@ -204,9 +211,9 @@ os_file_get_last_error(void) if (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS) { ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: Operating system error number %li in a file operation.\n" + " InnoDB: Operating system error number %lu in a file operation.\n" "InnoDB: See http://www.innodb.com/ibman.html for installation help.\n", - (long) err); + err); if (err == ERROR_PATH_NOT_FOUND) { fprintf(stderr, @@ -248,9 +255,9 @@ os_file_get_last_error(void) ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: Operating system error number %li in a file operation.\n" + " InnoDB: Operating system error number %lu in a file operation.\n" "InnoDB: See http://www.innodb.com/ibman.html for installation help.\n", - (long) err); + err); if (err == ENOENT) { fprintf(stderr, @@ -344,7 +351,8 @@ os_file_handle_error( fprintf(stderr, "InnoDB: File name %s\n", name); } - fprintf(stderr, "InnoDB: System call %s.\n", operation); + fprintf(stderr, "InnoDB: File operation call: '%s'.\n", + operation); fprintf(stderr, "InnoDB: Cannot continue operation.\n"); fflush(stderr); @@ -364,6 +372,8 @@ os_io_init_simple(void) { ulint i; + os_file_count_mutex = os_mutex_create(NULL); + for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) { os_file_seek_mutexes[i] = os_mutex_create(NULL); } @@ -415,9 +425,8 @@ try_again: file = CreateFile(name, access, - FILE_SHARE_READ | FILE_SHARE_WRITE, - /* file can be read and written - also by other processes */ + FILE_SHARE_READ,/* file can be read also by other + processes */ NULL, /* default security attributes */ create_flag, attributes, @@ -481,6 +490,101 @@ try_again: return(file); #endif } + +/******************************************************************** +A simple function to open or create a file. */ + +os_file_t +os_file_create_simple_no_error_handling( +/*====================================*/ + /* out, own: handle to the file, not defined if error, + error number can be retrieved with os_get_last_error */ + char* name, /* in: name of the file or path as a null-terminated + string */ + ulint create_mode,/* in: OS_FILE_OPEN if an existing file is opened + (if does not exist, error), or OS_FILE_CREATE if a new + file is created (if exists, error) */ + ulint access_type,/* in: OS_FILE_READ_ONLY or OS_FILE_READ_WRITE */ + ibool* success)/* out: TRUE if succeed, FALSE if error */ +{ +#ifdef __WIN__ + os_file_t file; + DWORD create_flag; + DWORD access; + DWORD attributes = 0; + + ut_a(name); + + if (create_mode == OS_FILE_OPEN) { + create_flag = OPEN_EXISTING; + } else if (create_mode == OS_FILE_CREATE) { + create_flag = CREATE_NEW; + } else { + create_flag = 0; + ut_error; + } + + if (access_type == OS_FILE_READ_ONLY) { + access = GENERIC_READ; + } else if (access_type == OS_FILE_READ_WRITE) { + access = GENERIC_READ | GENERIC_WRITE; + } else { + access = 0; + ut_error; + } + + file = CreateFile(name, + access, + FILE_SHARE_READ,/* file can be read also by other + processes */ + NULL, /* default security attributes */ + create_flag, + attributes, + NULL); /* no template file */ + + if (file == INVALID_HANDLE_VALUE) { + *success = FALSE; + } else { + *success = TRUE; + } + + return(file); +#else + os_file_t file; + int create_flag; + + ut_a(name); + + if (create_mode == OS_FILE_OPEN) { + if (access_type == OS_FILE_READ_ONLY) { + create_flag = O_RDONLY; + } else { + create_flag = O_RDWR; + } + } else if (create_mode == OS_FILE_CREATE) { + create_flag = O_RDWR | O_CREAT | O_EXCL; + } else { + create_flag = 0; + ut_error; + } + + if (create_mode == OS_FILE_CREATE) { + file = open(name, create_flag, S_IRUSR | S_IWUSR + | S_IRGRP | S_IWGRP); + } else { + file = open(name, create_flag); + } + + if (file == -1) { + *success = FALSE; + } else { + *success = TRUE; + } + + return(file); +#endif +} + /******************************************************************** Opens an existing file or creates a new. */ @@ -496,7 +600,11 @@ os_file_create( file is created (if exists, error), OS_FILE_OVERWRITE if a new is created or an old overwritten */ ulint purpose,/* in: OS_FILE_AIO, if asynchronous, non-buffered i/o - is desired, OS_FILE_NORMAL, if any normal file */ + is desired, OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. and srv_.. + variables whether we really use async i/o or + unbuffered i/o: look in the function source code for + the exact rules */ ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */ ibool* success)/* out: TRUE if succeed, FALSE if error */ { @@ -521,8 +629,8 @@ try_again: } if (purpose == OS_FILE_AIO) { - /* use asynchronous (overlapped) io and no buffering - of writes in the OS */ + /* If specified, use asynchronous (overlapped) io and no + buffering of writes in the OS */ attributes = 0; #ifdef WIN_ASYNC_IO if (os_aio_use_native_aio) { @@ -530,17 +638,13 @@ try_again: } #endif #ifdef UNIV_NON_BUFFERED_IO - if (type == OS_LOG_FILE) { + if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) { /* Do not use unbuffered i/o to log files because - to allow group commit to work when MySQL binlogging - is used we must separate log file write and log - file flush to disk. */ - } else { - if (srv_win_file_flush_method == - SRV_WIN_IO_UNBUFFERED) { - attributes = attributes - | FILE_FLAG_NO_BUFFERING; - } + value 2 denotes that we do not flush the log at every + commit, but only once per second */ + } else if (srv_win_file_flush_method == + SRV_WIN_IO_UNBUFFERED) { + attributes = attributes | FILE_FLAG_NO_BUFFERING; } #endif } else if (purpose == OS_FILE_NORMAL) { @@ -550,12 +654,9 @@ try_again: /* Do not use unbuffered i/o to log files because value 2 denotes that we do not flush the log at every commit, but only once per second */ - } else { - if (srv_win_file_flush_method == - SRV_WIN_IO_UNBUFFERED) { - attributes = attributes - | FILE_FLAG_NO_BUFFERING; - } + } else if (srv_win_file_flush_method == + SRV_WIN_IO_UNBUFFERED) { + attributes = attributes | FILE_FLAG_NO_BUFFERING; } #endif } else { @@ -566,9 +667,14 @@ try_again: file = CreateFile(name, GENERIC_READ | GENERIC_WRITE, /* read and write access */ - FILE_SHARE_READ | FILE_SHARE_WRITE, - /* file can be read and written - also by other processes */ + FILE_SHARE_READ,/* File can be read also by other + processes; we must give the read + permission because of ibbackup. We do + not give the write permission to + others because if one would succeed to + start 2 instances of mysqld on the + SAME files, that could cause severe + database corruption! */ NULL, /* default security attributes */ create_flag, attributes, @@ -592,30 +698,70 @@ try_again: os_file_t file; int create_flag; ibool retry; + const char* mode_str = NULL; + const char* type_str = NULL; + const char* purpose_str = NULL; try_again: ut_a(name); if (create_mode == OS_FILE_OPEN) { + mode_str = "OPEN"; + create_flag = O_RDWR; } else if (create_mode == OS_FILE_CREATE) { + mode_str = "CREATE"; + create_flag = O_RDWR | O_CREAT | O_EXCL; } else if (create_mode == OS_FILE_OVERWRITE) { + mode_str = "OVERWRITE"; + create_flag = O_RDWR | O_CREAT | O_TRUNC; } else { create_flag = 0; ut_error; } - UT_NOT_USED(purpose); + if (type == OS_LOG_FILE) { + type_str = "LOG"; + } else if (type == OS_DATA_FILE) { + type_str = "DATA"; + } else { + ut_a(0); + } + + if (purpose == OS_FILE_AIO) { + purpose_str = "AIO"; + } else if (purpose == OS_FILE_NORMAL) { + purpose_str = "NORMAL"; + } else { + ut_a(0); + } +/* printf("Opening file %s, mode %s, type %s, purpose %s\n", + name, mode_str, type_str, purpose_str); */ #ifdef O_SYNC - if ((!srv_use_doublewrite_buf || type != OS_DATA_FILE) + /* We let O_SYNC only affect log files; note that we map O_DSYNC to + O_SYNC because the datasync options seemed to corrupt files in 2001 + in both Linux and Solaris */ + if (type == OS_LOG_FILE && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) { +/* printf("Using O_SYNC for file %s\n", name); */ + create_flag = create_flag | O_SYNC; } #endif +#ifdef O_DIRECT + /* We let O_DIRECT only affect data files */ + if (type != OS_LOG_FILE + && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) { + +/* printf("Using O_DIRECT for file %s\n", name); */ + + create_flag = create_flag | O_DIRECT; + } +#endif if (create_mode == OS_FILE_CREATE) { file = open(name, create_flag, os_innodb_umask); } else { @@ -677,6 +823,41 @@ os_file_close( } /*************************************************************************** +Closes a file handle. */ + +ibool +os_file_close_no_error_handling( +/*============================*/ + /* out: TRUE if success */ + os_file_t file) /* in, own: handle to a file */ +{ +#ifdef __WIN__ + BOOL ret; + + ut_a(file); + + ret = CloseHandle(file); + + if (ret) { + return(TRUE); + } + + return(FALSE); +#else + int ret; + + ret = close(file); + + if (ret == -1) { + + return(FALSE); + } + + return(TRUE); +#endif +} + +/*************************************************************************** Gets a file size. */ ibool @@ -849,6 +1030,7 @@ os_file_flush( #ifdef HAVE_FDATASYNC ret = fdatasync(file); #else +/* printf("Flushing to file %lu\n", (ulint)file); */ ret = fsync(file); #endif os_n_fsyncs++; @@ -896,6 +1078,7 @@ os_file_pread( offset */ { off_t offs; + ssize_t n_bytes; ut_a((offset & 0xFFFFFFFF) == offset); @@ -917,7 +1100,17 @@ os_file_pread( os_n_file_reads++; #ifdef HAVE_PREAD - return(pread(file, buf, n, offs)); + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_preads++; + os_mutex_exit(os_file_count_mutex); + + n_bytes = pread(file, buf, n, offs); + + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_preads--; + os_mutex_exit(os_file_count_mutex); + + return(n_bytes); #else { ssize_t ret; @@ -982,8 +1175,16 @@ os_file_pwrite( os_n_file_writes++; #ifdef HAVE_PWRITE + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_pwrites++; + os_mutex_exit(os_file_count_mutex); + ret = pwrite(file, buf, n, offs); + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_pwrites--; + os_mutex_exit(os_file_count_mutex); + if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC && srv_unix_file_flush_method != SRV_UNIX_NOSYNC && !os_do_not_call_flush_at_each_write) { @@ -1157,6 +1358,7 @@ os_file_write( DWORD high; ulint i; ulint n_retries = 0; + ulint err; ut_a((offset & 0xFFFFFFFF) == offset); @@ -1224,18 +1426,27 @@ retry: if (!os_has_said_disk_full) { + err = (ulint)GetLastError(); + ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Error: Write to file %s failed at offset %lu %lu.\n" "InnoDB: %lu bytes should have been written, only %lu were written.\n" "InnoDB: Operating system error number %lu.\n" -"InnoDB: Look from section 13.2 at http://www.innodb.com/ibman.html\n" -"InnoDB: what the error number means.\n" "InnoDB: Check that your OS and file system support files of this size.\n" "InnoDB: Check also that the disk is not full or a disk quota exceeded.\n", name, offset_high, offset, n, (ulint)len, - (ulint)GetLastError()); + err); + + if (strerror((int)err) != NULL) { + fprintf(stderr, +"InnoDB: Error number %lu means '%s'.\n", err, strerror((int)err)); + } + + fprintf(stderr, +"InnoDB: See also section 13.2 at http://www.innodb.com/ibman.html\n" +"InnoDB: about operating system error numbers.\n"); os_has_said_disk_full = TRUE; } @@ -1259,12 +1470,19 @@ retry: " InnoDB: Error: Write to file %s failed at offset %lu %lu.\n" "InnoDB: %lu bytes should have been written, only %ld were written.\n" "InnoDB: Operating system error number %lu.\n" -"InnoDB: Look from section 13.2 at http://www.innodb.com/ibman.html\n" -"InnoDB: what the error number means or use the perror program of MySQL.\n" "InnoDB: Check that your OS and file system support files of this size.\n" "InnoDB: Check also that the disk is not full or a disk quota exceeded.\n", name, offset_high, offset, n, (long int)ret, (ulint)errno); + if (strerror(errno) != NULL) { + fprintf(stderr, +"InnoDB: Error number %lu means '%s'.\n", (ulint)errno, strerror(errno)); + } + + fprintf(stderr, +"InnoDB: See also section 13.2 at http://www.innodb.com/ibman.html\n" +"InnoDB: about operating system error numbers.\n"); + os_has_said_disk_full = TRUE; } @@ -1372,19 +1590,35 @@ os_aio_init( os_io_init_simple(); + for (i = 0; i < n_segments; i++) { + srv_io_thread_op_info[i] = (char*)"not started yet"; + } + n_per_seg = n / n_segments; n_write_segs = (n_segments - 2) / 2; n_read_segs = n_segments - 2 - n_write_segs; /* printf("Array n per seg %lu\n", n_per_seg); */ + os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1); + + srv_io_thread_function[0] = (char*)"insert buffer thread"; + + os_aio_log_array = os_aio_array_create(n_per_seg, 1); + + srv_io_thread_function[1] = (char*)"log thread"; + os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg, n_read_segs); + for (i = 2; i < 2 + n_read_segs; i++) { + srv_io_thread_function[i] = (char*)"read thread"; + } + os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg, n_write_segs); - os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1); - - os_aio_log_array = os_aio_array_create(n_per_seg, 1); + for (i = 2 + n_read_segs; i < n_segments; i++) { + srv_io_thread_function[i] = (char*)"write thread"; + } os_aio_sync_array = os_aio_array_create(n_slots_sync, 1); @@ -1677,6 +1911,7 @@ loop: } slot->reserved = TRUE; + slot->reservation_time = time(NULL); slot->message1 = message1; slot->message2 = message2; slot->file = file; @@ -2249,6 +2484,8 @@ os_aio_simulated_handle( ulint total_len; ulint offs; ulint lowest_offset; + ulint biggest_age; + ulint age; byte* combined_buf; byte* combined_buf2; ibool ret; @@ -2301,22 +2538,55 @@ restart: n_consecutive = 0; - /* Look for an i/o request at the lowest offset in the array - (we ignore the high 32 bits of the offset in these heuristics) */ + /* If there are at least 2 seconds old requests, then pick the oldest + one to prevent starvation. If several requests have the same age, + then pick the one at the lowest offset. */ + biggest_age = 0; lowest_offset = ULINT_MAX; - + for (i = 0; i < n; i++) { slot = os_aio_array_get_nth_slot(array, i + segment * n); - if (slot->reserved && slot->offset < lowest_offset) { + if (slot->reserved) { + age = (ulint)difftime(time(NULL), + slot->reservation_time); - /* Found an i/o request */ - consecutive_ios[0] = slot; + if ((age >= 2 && age > biggest_age) + || (age >= 2 && age == biggest_age + && slot->offset < lowest_offset)) { - n_consecutive = 1; + /* Found an i/o request */ + consecutive_ios[0] = slot; - lowest_offset = slot->offset; + n_consecutive = 1; + + biggest_age = age; + lowest_offset = slot->offset; + } + } + } + + if (n_consecutive == 0) { + /* There were no old requests. Look for an i/o request at the + lowest offset in the array (we ignore the high 32 bits of the + offset in these heuristics) */ + + lowest_offset = ULINT_MAX; + + for (i = 0; i < n; i++) { + slot = os_aio_array_get_nth_slot(array, + i + segment * n); + + if (slot->reserved && slot->offset < lowest_offset) { + + /* Found an i/o request */ + consecutive_ios[0] = slot; + + n_consecutive = 1; + + lowest_offset = slot->offset; + } } } @@ -2422,7 +2692,7 @@ consecutive_loop: + FIL_PAGE_LSN + 4) != mach_read_from_4(combined_buf + len2 + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN + 4)) { + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: ERROR: The page to be written seems corrupt!\n"); @@ -2583,14 +2853,15 @@ os_aio_print( double avg_bytes_read; ulint i; - if (buf_end - buf < 1000) { + if (buf_end - buf < 1200) { return; } for (i = 0; i < srv_n_file_io_threads; i++) { - buf += sprintf(buf, "I/O thread %lu state: %s\n", i, - srv_io_thread_op_info[i]); + buf += sprintf(buf, "I/O thread %lu state: %s (%s)\n", i, + srv_io_thread_op_info[i], + srv_io_thread_function[i]); } buf += sprintf(buf, "Pending normal aio reads:"); @@ -2665,6 +2936,12 @@ loop: "%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n", os_n_file_reads, os_n_file_writes, os_n_fsyncs); + if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) { + buf += sprintf(buf, + "%lu pending preads, %lu pending pwrites\n", + os_file_n_pending_preads, os_file_n_pending_pwrites); + } + if (os_n_file_reads == os_n_file_reads_old) { avg_bytes_read = 0.0; } else { |