diff options
Diffstat (limited to 'storage/innobase/os/os0file.c')
-rw-r--r-- | storage/innobase/os/os0file.c | 295 |
1 files changed, 188 insertions, 107 deletions
diff --git a/storage/innobase/os/os0file.c b/storage/innobase/os/os0file.c index 6c17ded0073..bd48d0e4b6e 100644 --- a/storage/innobase/os/os0file.c +++ b/storage/innobase/os/os0file.c @@ -302,6 +302,36 @@ UNIV_INTERN ulint os_n_pending_writes = 0; /** Number of pending read operations */ UNIV_INTERN ulint os_n_pending_reads = 0; +#ifdef UNIV_DEBUG +/**********************************************************************//** +Validates the consistency the aio system some of the time. +@return TRUE if ok or the check was skipped */ +UNIV_INTERN +ibool +os_aio_validate_skip(void) +/*======================*/ +{ +/** Try os_aio_validate() every this many times */ +# define OS_AIO_VALIDATE_SKIP 13 + + /** The os_aio_validate() call skip counter. + Use a signed type because of the race condition below. */ + static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP; + + /* There is a race condition below, but it does not matter, + because this call is only for heuristic purposes. We want to + reduce the call frequency of the costly os_aio_validate() + check in debug builds. */ + if (--os_aio_validate_count > 0) { + return(TRUE); + } + + os_aio_validate_count = OS_AIO_VALIDATE_SKIP; + return(os_aio_validate()); +} +#endif /* UNIV_DEBUG */ + +#ifdef __WIN__ /***********************************************************************//** Gets the operating system version. Currently works only on Windows. @return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA, @@ -311,7 +341,6 @@ ulint os_get_os_version(void) /*===================*/ { -#ifdef __WIN__ OSVERSIONINFO os_info; os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO); @@ -340,12 +369,8 @@ os_get_os_version(void) ut_error; return(0); } -#else - ut_error; - - return(0); -#endif } +#endif /* __WIN__ */ /***********************************************************************//** Retrieves the last error number if an error occurs in a file io function. @@ -1295,10 +1320,12 @@ UNIV_INTERN void os_file_set_nocache( /*================*/ - int fd, /*!< in: file descriptor to alter */ - const char* file_name, /*!< in: file name, used in the - diagnostic message */ - const char* operation_name) /*!< in: "open" or "create"; used in the + int fd /*!< in: file descriptor to alter */ + __attribute__((unused)), + const char* file_name /*!< in: used in the diagnostic message */ + __attribute__((unused)), + const char* operation_name __attribute__((unused))) + /*!< in: "open" or "create"; used in the diagnostic message */ { /* some versions of Solaris may not have DIRECTIO_ON */ @@ -2398,7 +2425,10 @@ os_file_read_func( ulint i; #endif /* !UNIV_HOTBACKUP */ + /* On 64-bit Windows, ulint is 64 bits. But offset and n should be + no more than 32 bits. */ ut_a((offset & 0xFFFFFFFFUL) == offset); + ut_a((n & 0xFFFFFFFFUL) == n); os_n_file_reads++; os_bytes_read_since_printout += n; @@ -2524,7 +2554,10 @@ os_file_read_no_error_handling_func( ulint i; #endif /* !UNIV_HOTBACKUP */ + /* On 64-bit Windows, ulint is 64 bits. But offset and n should be + no more than 32 bits. */ ut_a((offset & 0xFFFFFFFFUL) == offset); + ut_a((n & 0xFFFFFFFFUL) == n); os_n_file_reads++; os_bytes_read_since_printout += n; @@ -2656,7 +2689,10 @@ os_file_write_func( ulint i; #endif /* !UNIV_HOTBACKUP */ - ut_a((offset & 0xFFFFFFFF) == offset); + /* On 64-bit Windows, ulint is 64 bits. But offset and n should be + no more than 32 bits. */ + ut_a((offset & 0xFFFFFFFFUL) == offset); + ut_a((n & 0xFFFFFFFFUL) == n); os_n_file_writes++; @@ -3619,6 +3655,10 @@ os_aio_array_reserve_slot( ulint slots_per_seg; ulint local_seg; +#ifdef WIN_ASYNC_IO + ut_a((len & 0xFFFFFFFFUL) == len); +#endif + /* No need of a mutex. Only reading constant fields */ slots_per_seg = array->n_slots / array->n_segments; @@ -3993,7 +4033,10 @@ os_aio_func( ut_ad(n > 0); ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0); ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0); - ut_ad(os_aio_validate()); + ut_ad(os_aio_validate_skip()); +#ifdef WIN_ASYNC_IO + ut_ad((n & 0xFFFFFFFFUL) == n); +#endif wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER); @@ -4008,26 +4051,33 @@ os_aio_func( Windows async i/o, Windows does not allow us to use ordinary synchronous os_file_read etc. on the same file, therefore we have built a special mechanism for synchronous - wait in the Windows case. */ + wait in the Windows case. + Also note that the Performance Schema instrumentation has + been performed by current os_aio_func()'s wrapper function + pfs_os_aio_func(). So we would no longer need to call + Performance Schema instrumented os_file_read() and + os_file_write(). Instead, we should use os_file_read_func() + and os_file_write_func() */ if (type == OS_FILE_READ) { - return(os_file_read(file, buf, offset, + return(os_file_read_func(file, buf, offset, offset_high, n)); } ut_a(type == OS_FILE_WRITE); - return(os_file_write(name, file, buf, offset, offset_high, n)); + return(os_file_write_func(name, file, buf, offset, + offset_high, n)); } try_again: - if (mode == OS_AIO_NORMAL) { - if (type == OS_FILE_READ) { - array = os_aio_read_array; - } else { - array = os_aio_write_array; - } - } else if (mode == OS_AIO_IBUF) { + switch (mode) { + case OS_AIO_NORMAL: + array = (type == OS_FILE_READ) + ? os_aio_read_array + : os_aio_write_array; + break; + case OS_AIO_IBUF: ut_ad(type == OS_FILE_READ); /* Reduce probability of deadlock bugs in connection with ibuf: do not let the ibuf i/o handler sleep */ @@ -4035,19 +4085,21 @@ try_again: wake_later = FALSE; array = os_aio_ibuf_array; - } else if (mode == OS_AIO_LOG) { - + break; + case OS_AIO_LOG: array = os_aio_log_array; - } else if (mode == OS_AIO_SYNC) { + break; + case OS_AIO_SYNC: array = os_aio_sync_array; #if defined(LINUX_NATIVE_AIO) /* In Linux native AIO we don't use sync IO array. */ ut_a(!srv_use_native_aio); #endif /* LINUX_NATIVE_AIO */ - } else { - array = NULL; /* Eliminate compiler warning */ + break; + default: ut_error; + array = NULL; /* Eliminate compiler warning */ } slot = os_aio_array_reserve_slot(type, array, message1, message2, file, @@ -4192,7 +4244,7 @@ os_aio_windows_handle( /* NOTE! We only access constant fields in os_aio_array. Therefore we do not have to acquire the protecting mutex yet */ - ut_ad(os_aio_validate()); + ut_ad(os_aio_validate_skip()); ut_ad(segment < array->n_segments); n = array->n_slots / array->n_segments; @@ -4210,11 +4262,17 @@ os_aio_windows_handle( INFINITE); } - if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { - os_thread_exit(NULL); + os_mutex_enter(array->mutex); + + if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS + && array->n_reserved == 0) { + *message1 = NULL; + *message2 = NULL; + os_mutex_exit(array->mutex); + return(TRUE); } - os_mutex_enter(array->mutex); + ut_a(i >= WAIT_OBJECT_0 && i <= WAIT_OBJECT_0 + n); slot = os_aio_array_get_nth_slot(array, i + segment * n); @@ -4269,16 +4327,18 @@ os_aio_windows_handle( __FILE__, __LINE__); #endif + ut_a((slot->len & 0xFFFFFFFFUL) == slot->len); + switch (slot->type) { case OS_FILE_WRITE: ret = WriteFile(slot->file, slot->buf, - slot->len, &len, + (DWORD) slot->len, &len, &(slot->control)); break; case OS_FILE_READ: ret = ReadFile(slot->file, slot->buf, - slot->len, &len, + (DWORD) slot->len, &len, &(slot->control)); break; @@ -4358,14 +4418,6 @@ os_aio_linux_collect( retry: - /* Go down if we are in shutdown mode. - In case of srv_fast_shutdown == 2, there may be pending - IO requests but that should be OK as we essentially treat - that as a crash of InnoDB. */ - if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { - os_thread_exit(NULL); - } - /* Initialize the events. The timeout value is arbitrary. We probably need to experiment with it a little. */ memset(events, 0, sizeof(*events) * seg_size); @@ -4374,76 +4426,72 @@ retry: ret = io_getevents(io_ctx, 1, seg_size, events, &timeout); - /* This error handling is for any error in collecting the - IO requests. The errors, if any, for any particular IO - request are simply passed on to the calling routine. */ + if (ret > 0) { + for (i = 0; i < ret; i++) { + os_aio_slot_t* slot; + struct iocb* control; - /* Not enough resources! Try again. */ - if (ret == -EAGAIN) { - goto retry; - } + control = (struct iocb *)events[i].obj; + ut_a(control != NULL); - /* Interrupted! I have tested the behaviour in case of an - interrupt. If we have some completed IOs available then - the return code will be the number of IOs. We get EINTR only - if there are no completed IOs and we have been interrupted. */ - if (ret == -EINTR) { - goto retry; - } - - /* No pending request! Go back and check again. */ - if (ret == 0) { - goto retry; - } - - /* All other errors! should cause a trap for now. */ - if (UNIV_UNLIKELY(ret < 0)) { - ut_print_timestamp(stderr); - fprintf(stderr, - " InnoDB: unexpected ret_code[%d] from" - " io_getevents()!\n", ret); - ut_error; - } + slot = (os_aio_slot_t *) control->data; - ut_a(ret > 0); + /* Some sanity checks. */ + ut_a(slot != NULL); + ut_a(slot->reserved); - for (i = 0; i < ret; i++) { - os_aio_slot_t* slot; - struct iocb* control; - - control = (struct iocb *)events[i].obj; - ut_a(control != NULL); +#if defined(UNIV_AIO_DEBUG) + fprintf(stderr, + "io_getevents[%c]: slot[%p] ctx[%p]" + " seg[%lu]\n", + (slot->type == OS_FILE_WRITE) ? 'w' : 'r', + slot, io_ctx, segment); +#endif - slot = (os_aio_slot_t *) control->data; + /* We are not scribbling previous segment. */ + ut_a(slot->pos >= start_pos); - /* Some sanity checks. */ - ut_a(slot != NULL); - ut_a(slot->reserved); + /* We have not overstepped to next segment. */ + ut_a(slot->pos < end_pos); -#if defined(UNIV_AIO_DEBUG) - fprintf(stderr, - "io_getevents[%c]: slot[%p] ctx[%p]" - " seg[%lu]\n", - (slot->type == OS_FILE_WRITE) ? 'w' : 'r', - slot, io_ctx, segment); -#endif + /* Mark this request as completed. The error handling + will be done in the calling function. */ + os_mutex_enter(array->mutex); + slot->n_bytes = events[i].res; + slot->ret = events[i].res2; + slot->io_already_done = TRUE; + os_mutex_exit(array->mutex); + } + return; + } - /* We are not scribbling previous segment. */ - ut_a(slot->pos >= start_pos); + if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) { + return; + } - /* We have not overstepped to next segment. */ - ut_a(slot->pos < end_pos); + /* This error handling is for any error in collecting the + IO requests. The errors, if any, for any particular IO + request are simply passed on to the calling routine. */ - /* Mark this request as completed. The error handling - will be done in the calling function. */ - os_mutex_enter(array->mutex); - slot->n_bytes = events[i].res; - slot->ret = events[i].res2; - slot->io_already_done = TRUE; - os_mutex_exit(array->mutex); + switch (ret) { + case -EAGAIN: + /* Not enough resources! Try again. */ + case -EINTR: + /* Interrupted! I have tested the behaviour in case of an + interrupt. If we have some completed IOs available then + the return code will be the number of IOs. We get EINTR only + if there are no completed IOs and we have been interrupted. */ + case 0: + /* No pending request! Go back and check again. */ + goto retry; } - return; + /* All other errors should cause a trap for now. */ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: unexpected ret_code[%d] from io_getevents()!\n", + ret); + ut_error; } /**********************************************************************//** @@ -4487,20 +4535,35 @@ os_aio_linux_handle( /* Loop until we have found a completed request. */ for (;;) { + ibool any_reserved = FALSE; os_mutex_enter(array->mutex); for (i = 0; i < n; ++i) { slot = os_aio_array_get_nth_slot( - array, i + segment * n); - if (slot->reserved && slot->io_already_done) { + array, i + segment * n); + if (!slot->reserved) { + continue; + } else if (slot->io_already_done) { /* Something for us to work on. */ goto found; + } else { + any_reserved = TRUE; } } os_mutex_exit(array->mutex); - /* We don't have any completed request. - Wait for some request. Note that we return + /* There is no completed request. + If there is no pending request at all, + and the system is being shut down, exit. */ + if (UNIV_UNLIKELY + (!any_reserved + && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) { + *message1 = NULL; + *message2 = NULL; + return(TRUE); + } + + /* Wait for some request. Note that we return from wait iff we have found a request. */ srv_set_io_thread_op_info(global_seg, @@ -4596,6 +4659,7 @@ os_aio_simulated_handle( byte* combined_buf; byte* combined_buf2; ibool ret; + ibool any_reserved; ulint n; ulint i; @@ -4610,7 +4674,7 @@ restart: srv_set_io_thread_op_info(global_segment, "looking for i/o requests (a)"); - ut_ad(os_aio_validate()); + ut_ad(os_aio_validate_skip()); ut_ad(segment < array->n_segments); n = array->n_slots / array->n_segments; @@ -4626,18 +4690,21 @@ restart: goto recommended_sleep; } - os_mutex_enter(array->mutex); - srv_set_io_thread_op_info(global_segment, "looking for i/o requests (b)"); /* Check if there is a slot for which the i/o has already been done */ + any_reserved = FALSE; + + os_mutex_enter(array->mutex); for (i = 0; i < n; i++) { slot = os_aio_array_get_nth_slot(array, i + segment * n); - if (slot->reserved && slot->io_already_done) { + if (!slot->reserved) { + continue; + } else if (slot->io_already_done) { if (os_aio_print_debug) { fprintf(stderr, @@ -4649,9 +4716,23 @@ restart: ret = TRUE; goto slot_io_done; + } else { + any_reserved = TRUE; } } + /* There is no completed request. + If there is no pending request at all, + and the system is being shut down, exit. */ + if (UNIV_UNLIKELY + (!any_reserved + && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) { + os_mutex_exit(array->mutex); + *message1 = NULL; + *message2 = NULL; + return(TRUE); + } + n_consecutive = 0; /* If there are at least 2 seconds old requests, then pick the oldest |