summaryrefslogtreecommitdiff
path: root/storage/innobase/os/os0file.c
diff options
context:
space:
mode:
Diffstat (limited to 'storage/innobase/os/os0file.c')
-rw-r--r--storage/innobase/os/os0file.c295
1 files changed, 188 insertions, 107 deletions
diff --git a/storage/innobase/os/os0file.c b/storage/innobase/os/os0file.c
index 6c17ded0073..bd48d0e4b6e 100644
--- a/storage/innobase/os/os0file.c
+++ b/storage/innobase/os/os0file.c
@@ -302,6 +302,36 @@ UNIV_INTERN ulint os_n_pending_writes = 0;
/** Number of pending read operations */
UNIV_INTERN ulint os_n_pending_reads = 0;
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validates the consistency the aio system some of the time.
+@return TRUE if ok or the check was skipped */
+UNIV_INTERN
+ibool
+os_aio_validate_skip(void)
+/*======================*/
+{
+/** Try os_aio_validate() every this many times */
+# define OS_AIO_VALIDATE_SKIP 13
+
+ /** The os_aio_validate() call skip counter.
+ Use a signed type because of the race condition below. */
+ static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
+
+ /* There is a race condition below, but it does not matter,
+ because this call is only for heuristic purposes. We want to
+ reduce the call frequency of the costly os_aio_validate()
+ check in debug builds. */
+ if (--os_aio_validate_count > 0) {
+ return(TRUE);
+ }
+
+ os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
+ return(os_aio_validate());
+}
+#endif /* UNIV_DEBUG */
+
+#ifdef __WIN__
/***********************************************************************//**
Gets the operating system version. Currently works only on Windows.
@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA,
@@ -311,7 +341,6 @@ ulint
os_get_os_version(void)
/*===================*/
{
-#ifdef __WIN__
OSVERSIONINFO os_info;
os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
@@ -340,12 +369,8 @@ os_get_os_version(void)
ut_error;
return(0);
}
-#else
- ut_error;
-
- return(0);
-#endif
}
+#endif /* __WIN__ */
/***********************************************************************//**
Retrieves the last error number if an error occurs in a file io function.
@@ -1295,10 +1320,12 @@ UNIV_INTERN
void
os_file_set_nocache(
/*================*/
- int fd, /*!< in: file descriptor to alter */
- const char* file_name, /*!< in: file name, used in the
- diagnostic message */
- const char* operation_name) /*!< in: "open" or "create"; used in the
+ int fd /*!< in: file descriptor to alter */
+ __attribute__((unused)),
+ const char* file_name /*!< in: used in the diagnostic message */
+ __attribute__((unused)),
+ const char* operation_name __attribute__((unused)))
+ /*!< in: "open" or "create"; used in the
diagnostic message */
{
/* some versions of Solaris may not have DIRECTIO_ON */
@@ -2398,7 +2425,10 @@ os_file_read_func(
ulint i;
#endif /* !UNIV_HOTBACKUP */
+ /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
+ no more than 32 bits. */
ut_a((offset & 0xFFFFFFFFUL) == offset);
+ ut_a((n & 0xFFFFFFFFUL) == n);
os_n_file_reads++;
os_bytes_read_since_printout += n;
@@ -2524,7 +2554,10 @@ os_file_read_no_error_handling_func(
ulint i;
#endif /* !UNIV_HOTBACKUP */
+ /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
+ no more than 32 bits. */
ut_a((offset & 0xFFFFFFFFUL) == offset);
+ ut_a((n & 0xFFFFFFFFUL) == n);
os_n_file_reads++;
os_bytes_read_since_printout += n;
@@ -2656,7 +2689,10 @@ os_file_write_func(
ulint i;
#endif /* !UNIV_HOTBACKUP */
- ut_a((offset & 0xFFFFFFFF) == offset);
+ /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
+ no more than 32 bits. */
+ ut_a((offset & 0xFFFFFFFFUL) == offset);
+ ut_a((n & 0xFFFFFFFFUL) == n);
os_n_file_writes++;
@@ -3619,6 +3655,10 @@ os_aio_array_reserve_slot(
ulint slots_per_seg;
ulint local_seg;
+#ifdef WIN_ASYNC_IO
+ ut_a((len & 0xFFFFFFFFUL) == len);
+#endif
+
/* No need of a mutex. Only reading constant fields */
slots_per_seg = array->n_slots / array->n_segments;
@@ -3993,7 +4033,10 @@ os_aio_func(
ut_ad(n > 0);
ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
- ut_ad(os_aio_validate());
+ ut_ad(os_aio_validate_skip());
+#ifdef WIN_ASYNC_IO
+ ut_ad((n & 0xFFFFFFFFUL) == n);
+#endif
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
@@ -4008,26 +4051,33 @@ os_aio_func(
Windows async i/o, Windows does not allow us to use
ordinary synchronous os_file_read etc. on the same file,
therefore we have built a special mechanism for synchronous
- wait in the Windows case. */
+ wait in the Windows case.
+ Also note that the Performance Schema instrumentation has
+ been performed by current os_aio_func()'s wrapper function
+ pfs_os_aio_func(). So we would no longer need to call
+ Performance Schema instrumented os_file_read() and
+ os_file_write(). Instead, we should use os_file_read_func()
+ and os_file_write_func() */
if (type == OS_FILE_READ) {
- return(os_file_read(file, buf, offset,
+ return(os_file_read_func(file, buf, offset,
offset_high, n));
}
ut_a(type == OS_FILE_WRITE);
- return(os_file_write(name, file, buf, offset, offset_high, n));
+ return(os_file_write_func(name, file, buf, offset,
+ offset_high, n));
}
try_again:
- if (mode == OS_AIO_NORMAL) {
- if (type == OS_FILE_READ) {
- array = os_aio_read_array;
- } else {
- array = os_aio_write_array;
- }
- } else if (mode == OS_AIO_IBUF) {
+ switch (mode) {
+ case OS_AIO_NORMAL:
+ array = (type == OS_FILE_READ)
+ ? os_aio_read_array
+ : os_aio_write_array;
+ break;
+ case OS_AIO_IBUF:
ut_ad(type == OS_FILE_READ);
/* Reduce probability of deadlock bugs in connection with ibuf:
do not let the ibuf i/o handler sleep */
@@ -4035,19 +4085,21 @@ try_again:
wake_later = FALSE;
array = os_aio_ibuf_array;
- } else if (mode == OS_AIO_LOG) {
-
+ break;
+ case OS_AIO_LOG:
array = os_aio_log_array;
- } else if (mode == OS_AIO_SYNC) {
+ break;
+ case OS_AIO_SYNC:
array = os_aio_sync_array;
#if defined(LINUX_NATIVE_AIO)
/* In Linux native AIO we don't use sync IO array. */
ut_a(!srv_use_native_aio);
#endif /* LINUX_NATIVE_AIO */
- } else {
- array = NULL; /* Eliminate compiler warning */
+ break;
+ default:
ut_error;
+ array = NULL; /* Eliminate compiler warning */
}
slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
@@ -4192,7 +4244,7 @@ os_aio_windows_handle(
/* NOTE! We only access constant fields in os_aio_array. Therefore
we do not have to acquire the protecting mutex yet */
- ut_ad(os_aio_validate());
+ ut_ad(os_aio_validate_skip());
ut_ad(segment < array->n_segments);
n = array->n_slots / array->n_segments;
@@ -4210,11 +4262,17 @@ os_aio_windows_handle(
INFINITE);
}
- if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
- os_thread_exit(NULL);
+ os_mutex_enter(array->mutex);
+
+ if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
+ && array->n_reserved == 0) {
+ *message1 = NULL;
+ *message2 = NULL;
+ os_mutex_exit(array->mutex);
+ return(TRUE);
}
- os_mutex_enter(array->mutex);
+ ut_a(i >= WAIT_OBJECT_0 && i <= WAIT_OBJECT_0 + n);
slot = os_aio_array_get_nth_slot(array, i + segment * n);
@@ -4269,16 +4327,18 @@ os_aio_windows_handle(
__FILE__, __LINE__);
#endif
+ ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
+
switch (slot->type) {
case OS_FILE_WRITE:
ret = WriteFile(slot->file, slot->buf,
- slot->len, &len,
+ (DWORD) slot->len, &len,
&(slot->control));
break;
case OS_FILE_READ:
ret = ReadFile(slot->file, slot->buf,
- slot->len, &len,
+ (DWORD) slot->len, &len,
&(slot->control));
break;
@@ -4358,14 +4418,6 @@ os_aio_linux_collect(
retry:
- /* Go down if we are in shutdown mode.
- In case of srv_fast_shutdown == 2, there may be pending
- IO requests but that should be OK as we essentially treat
- that as a crash of InnoDB. */
- if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
- os_thread_exit(NULL);
- }
-
/* Initialize the events. The timeout value is arbitrary.
We probably need to experiment with it a little. */
memset(events, 0, sizeof(*events) * seg_size);
@@ -4374,76 +4426,72 @@ retry:
ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
- /* This error handling is for any error in collecting the
- IO requests. The errors, if any, for any particular IO
- request are simply passed on to the calling routine. */
+ if (ret > 0) {
+ for (i = 0; i < ret; i++) {
+ os_aio_slot_t* slot;
+ struct iocb* control;
- /* Not enough resources! Try again. */
- if (ret == -EAGAIN) {
- goto retry;
- }
+ control = (struct iocb *)events[i].obj;
+ ut_a(control != NULL);
- /* Interrupted! I have tested the behaviour in case of an
- interrupt. If we have some completed IOs available then
- the return code will be the number of IOs. We get EINTR only
- if there are no completed IOs and we have been interrupted. */
- if (ret == -EINTR) {
- goto retry;
- }
-
- /* No pending request! Go back and check again. */
- if (ret == 0) {
- goto retry;
- }
-
- /* All other errors! should cause a trap for now. */
- if (UNIV_UNLIKELY(ret < 0)) {
- ut_print_timestamp(stderr);
- fprintf(stderr,
- " InnoDB: unexpected ret_code[%d] from"
- " io_getevents()!\n", ret);
- ut_error;
- }
+ slot = (os_aio_slot_t *) control->data;
- ut_a(ret > 0);
+ /* Some sanity checks. */
+ ut_a(slot != NULL);
+ ut_a(slot->reserved);
- for (i = 0; i < ret; i++) {
- os_aio_slot_t* slot;
- struct iocb* control;
-
- control = (struct iocb *)events[i].obj;
- ut_a(control != NULL);
+#if defined(UNIV_AIO_DEBUG)
+ fprintf(stderr,
+ "io_getevents[%c]: slot[%p] ctx[%p]"
+ " seg[%lu]\n",
+ (slot->type == OS_FILE_WRITE) ? 'w' : 'r',
+ slot, io_ctx, segment);
+#endif
- slot = (os_aio_slot_t *) control->data;
+ /* We are not scribbling previous segment. */
+ ut_a(slot->pos >= start_pos);
- /* Some sanity checks. */
- ut_a(slot != NULL);
- ut_a(slot->reserved);
+ /* We have not overstepped to next segment. */
+ ut_a(slot->pos < end_pos);
-#if defined(UNIV_AIO_DEBUG)
- fprintf(stderr,
- "io_getevents[%c]: slot[%p] ctx[%p]"
- " seg[%lu]\n",
- (slot->type == OS_FILE_WRITE) ? 'w' : 'r',
- slot, io_ctx, segment);
-#endif
+ /* Mark this request as completed. The error handling
+ will be done in the calling function. */
+ os_mutex_enter(array->mutex);
+ slot->n_bytes = events[i].res;
+ slot->ret = events[i].res2;
+ slot->io_already_done = TRUE;
+ os_mutex_exit(array->mutex);
+ }
+ return;
+ }
- /* We are not scribbling previous segment. */
- ut_a(slot->pos >= start_pos);
+ if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
+ return;
+ }
- /* We have not overstepped to next segment. */
- ut_a(slot->pos < end_pos);
+ /* This error handling is for any error in collecting the
+ IO requests. The errors, if any, for any particular IO
+ request are simply passed on to the calling routine. */
- /* Mark this request as completed. The error handling
- will be done in the calling function. */
- os_mutex_enter(array->mutex);
- slot->n_bytes = events[i].res;
- slot->ret = events[i].res2;
- slot->io_already_done = TRUE;
- os_mutex_exit(array->mutex);
+ switch (ret) {
+ case -EAGAIN:
+ /* Not enough resources! Try again. */
+ case -EINTR:
+ /* Interrupted! I have tested the behaviour in case of an
+ interrupt. If we have some completed IOs available then
+ the return code will be the number of IOs. We get EINTR only
+ if there are no completed IOs and we have been interrupted. */
+ case 0:
+ /* No pending request! Go back and check again. */
+ goto retry;
}
- return;
+ /* All other errors should cause a trap for now. */
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: unexpected ret_code[%d] from io_getevents()!\n",
+ ret);
+ ut_error;
}
/**********************************************************************//**
@@ -4487,20 +4535,35 @@ os_aio_linux_handle(
/* Loop until we have found a completed request. */
for (;;) {
+ ibool any_reserved = FALSE;
os_mutex_enter(array->mutex);
for (i = 0; i < n; ++i) {
slot = os_aio_array_get_nth_slot(
- array, i + segment * n);
- if (slot->reserved && slot->io_already_done) {
+ array, i + segment * n);
+ if (!slot->reserved) {
+ continue;
+ } else if (slot->io_already_done) {
/* Something for us to work on. */
goto found;
+ } else {
+ any_reserved = TRUE;
}
}
os_mutex_exit(array->mutex);
- /* We don't have any completed request.
- Wait for some request. Note that we return
+ /* There is no completed request.
+ If there is no pending request at all,
+ and the system is being shut down, exit. */
+ if (UNIV_UNLIKELY
+ (!any_reserved
+ && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
+ *message1 = NULL;
+ *message2 = NULL;
+ return(TRUE);
+ }
+
+ /* Wait for some request. Note that we return
from wait iff we have found a request. */
srv_set_io_thread_op_info(global_seg,
@@ -4596,6 +4659,7 @@ os_aio_simulated_handle(
byte* combined_buf;
byte* combined_buf2;
ibool ret;
+ ibool any_reserved;
ulint n;
ulint i;
@@ -4610,7 +4674,7 @@ restart:
srv_set_io_thread_op_info(global_segment,
"looking for i/o requests (a)");
- ut_ad(os_aio_validate());
+ ut_ad(os_aio_validate_skip());
ut_ad(segment < array->n_segments);
n = array->n_slots / array->n_segments;
@@ -4626,18 +4690,21 @@ restart:
goto recommended_sleep;
}
- os_mutex_enter(array->mutex);
-
srv_set_io_thread_op_info(global_segment,
"looking for i/o requests (b)");
/* Check if there is a slot for which the i/o has already been
done */
+ any_reserved = FALSE;
+
+ os_mutex_enter(array->mutex);
for (i = 0; i < n; i++) {
slot = os_aio_array_get_nth_slot(array, i + segment * n);
- if (slot->reserved && slot->io_already_done) {
+ if (!slot->reserved) {
+ continue;
+ } else if (slot->io_already_done) {
if (os_aio_print_debug) {
fprintf(stderr,
@@ -4649,9 +4716,23 @@ restart:
ret = TRUE;
goto slot_io_done;
+ } else {
+ any_reserved = TRUE;
}
}
+ /* There is no completed request.
+ If there is no pending request at all,
+ and the system is being shut down, exit. */
+ if (UNIV_UNLIKELY
+ (!any_reserved
+ && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
+ os_mutex_exit(array->mutex);
+ *message1 = NULL;
+ *message2 = NULL;
+ return(TRUE);
+ }
+
n_consecutive = 0;
/* If there are at least 2 seconds old requests, then pick the oldest