summary refs log tree commit diff
path: root/storage
diff options
context:
space:
mode:
author Vladislav Vaintroub <wlad@montyprogram.com> 2011-07-05 21:46:53 +0200
committer Vladislav Vaintroub <wlad@montyprogram.com> 2011-07-05 21:46:53 +0200
commit f9cb1467b84aba6cdc3178617def27c2993f016d (patch)
tree b415e273aafc2a6f0612b1f6cc55dbb0b0a06827 /storage
parent 9e95a54793920ade348399a74a1e39ea3a27b635 (diff)
parent b48dc8306f2b729bef09f9cdf30d7897726b873e (diff)
download mariadb-git-f9cb1467b84aba6cdc3178617def27c2993f016d.tar.gz
merge Windows performance patches into 5.3
Diffstat (limited to 'storage')
-rw-r--r-- storage/archive/ha_archive.cc 6
-rw-r--r-- storage/innobase/handler/ha_innodb.cc 22
-rw-r--r-- storage/innodb_plugin/handler/ha_innodb.cc 25
-rw-r--r-- storage/maria/ma_extra.c 3
-rw-r--r-- storage/myisam/mi_locking.c 8
-rw-r--r-- storage/xtradb/CMakeLists.txt 15
-rw-r--r-- storage/xtradb/handler/ha_innodb.cc 25
-rw-r--r-- storage/xtradb/include/os0file.h 8
-rw-r--r-- storage/xtradb/include/os0sync.h 71
-rw-r--r-- storage/xtradb/include/os0sync.ic 9
-rw-r--r-- storage/xtradb/include/srv0srv.h 4
-rw-r--r-- storage/xtradb/include/sync0sync.h 2
-rw-r--r-- storage/xtradb/os/os0file.c 492
-rw-r--r-- storage/xtradb/os/os0sync.c 557
-rw-r--r-- storage/xtradb/srv/srv0srv.c 14
-rw-r--r-- storage/xtradb/srv/srv0start.c 31
16 files changed, 721 insertions, 571 deletions
diff --git a/storage/archive/ha_archive.cc b/storage/archive/ha_archive.cc
index 730d5b95abb..f7efaf4566f 100644
--- a/storage/archive/ha_archive.cc
+++ b/storage/archive/ha_archive.cc
@@ -684,11 +684,11 @@ int ha_archive::create(const char *name, TABLE *table_arg,
{
if (!my_fstat(frm_file, &file_stat, MYF(MY_WME)))
{
- frm_ptr= (uchar *)my_malloc(sizeof(uchar) * file_stat.st_size, MYF(0));
+ frm_ptr= (uchar *)my_malloc(sizeof(uchar) * (size_t)file_stat.st_size, MYF(0));
if (frm_ptr)
{
- my_read(frm_file, frm_ptr, file_stat.st_size, MYF(0));
- azwrite_frm(&create_stream, (char *)frm_ptr, file_stat.st_size);
+ my_read(frm_file, frm_ptr, (size_t)file_stat.st_size, MYF(0));
+ azwrite_frm(&create_stream, (char *)frm_ptr, (size_t)file_stat.st_size);
my_free((uchar*)frm_ptr, MYF(0));
}
}
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index ce264b1bbb4..eaf5ec4bed5 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -1061,7 +1061,29 @@ innobase_mysql_tmpfile(void)
will be passed to fdopen(), it will be closed by invoking
fclose(), which in turn will invoke close() instead of
my_close(). */
+
+#ifdef _WIN32
+ /* Note that on Windows, the integer returned by mysql_tmpfile
+ has no relation to a C runtime file descriptor. Here, we need
+ to call my_get_osfhandle to get the HANDLE and then convert it
+ to a C runtime file descriptor. */
+ {
+ HANDLE hFile = my_get_osfhandle(fd);
+ HANDLE hDup;
+ BOOL bOK =
+ DuplicateHandle(GetCurrentProcess(), hFile, GetCurrentProcess(),
+ &hDup, 0, FALSE, DUPLICATE_SAME_ACCESS);
+ if(bOK) {
+ fd2 = _open_osfhandle((intptr_t)hDup,0);
+ }
+ else {
+ my_osmaperr(GetLastError());
+ fd2 = -1;
+ }
+ }
+#else
fd2 = dup(fd);
+#endif
if (fd2 < 0) {
DBUG_PRINT("error",("Got error %d on dup",fd2));
my_errno=errno;
diff --git a/storage/innodb_plugin/handler/ha_innodb.cc b/storage/innodb_plugin/handler/ha_innodb.cc
index 38adab109fc..e470616d169 100644
--- a/storage/innodb_plugin/handler/ha_innodb.cc
+++ b/storage/innodb_plugin/handler/ha_innodb.cc
@@ -49,7 +49,9 @@ Place, Suite 330, Boston, MA 02111-1307 USA
#include <m_ctype.h>
#include <mysys_err.h>
#include <mysql/plugin.h>
-
+#ifdef _WIN32
+#include <io.h>
+#endif
/** @file ha_innodb.cc */
/* Include necessary InnoDB headers */
@@ -1172,7 +1174,28 @@ innobase_mysql_tmpfile(void)
will be passed to fdopen(), it will be closed by invoking
fclose(), which in turn will invoke close() instead of
my_close(). */
+#ifdef _WIN32
+ /* Note that on Windows, the integer returned by mysql_tmpfile
+ has no relation to a C runtime file descriptor. Here, we need
+ to call my_get_osfhandle to get the HANDLE and then convert it
+ to a C runtime file descriptor. */
+ {
+ HANDLE hFile = my_get_osfhandle(fd);
+ HANDLE hDup;
+ BOOL bOK =
+ DuplicateHandle(GetCurrentProcess(), hFile, GetCurrentProcess(),
+ &hDup, 0, FALSE, DUPLICATE_SAME_ACCESS);
+ if(bOK) {
+ fd2 = _open_osfhandle((intptr_t)hDup,0);
+ }
+ else {
+ my_osmaperr(GetLastError());
+ fd2 = -1;
+ }
+ }
+#else
fd2 = dup(fd);
+#endif
if (fd2 < 0) {
DBUG_PRINT("error",("Got error %d on dup",fd2));
my_errno=errno;
diff --git a/storage/maria/ma_extra.c b/storage/maria/ma_extra.c
index a5c4c2ab251..68102b389fb 100644
--- a/storage/maria/ma_extra.c
+++ b/storage/maria/ma_extra.c
@@ -415,9 +415,8 @@ int maria_extra(MARIA_HA *info, enum ha_extra_function function,
if (!share->temporary)
error= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
FLUSH_KEEP, FLUSH_KEEP);
-#ifdef HAVE_PREAD
+
_ma_decrement_open_count(info, 1);
-#endif
if (share->not_flushed)
{
share->not_flushed= 0;
diff --git a/storage/myisam/mi_locking.c b/storage/myisam/mi_locking.c
index 97011831af8..17c1fc83f13 100644
--- a/storage/myisam/mi_locking.c
+++ b/storage/myisam/mi_locking.c
@@ -239,7 +239,7 @@ int mi_lock_database(MI_INFO *info, int lock_type)
break; /* Impossible */
}
}
-#ifdef __WIN__
+#ifdef _WIN32
else
{
/*
@@ -521,11 +521,11 @@ int _mi_writeinfo(register MI_INFO *info, uint operation)
share->state.update_count= info->last_loop= ++info->this_loop;
if ((error=mi_state_info_write(share->kfile, &share->state, 1)))
olderror=my_errno;
-#ifdef __WIN__
+#ifdef _WIN32
if (myisam_flush)
{
- _commit(share->kfile);
- _commit(info->dfile);
+ my_sync(share->kfile,0);
+ my_sync(info->dfile,0);
}
#endif
}
diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt
index 509f7f0fe73..6e16c4ced32 100644
--- a/storage/xtradb/CMakeLists.txt
+++ b/storage/xtradb/CMakeLists.txt
@@ -15,20 +15,10 @@
# This is the CMakeLists for InnoDB Plugin
-
-
-# Starting at 5.1.38, MySQL CMake files are simplified. But the plugin
-# CMakeLists.txt still needs to work with previous versions of MySQL.
-IF (MYSQL_VERSION_ID GREATER "50137")
- INCLUDE("${PROJECT_SOURCE_DIR}/storage/mysql_storage_engine.cmake")
-ENDIF (MYSQL_VERSION_ID GREATER "50137")
-
IF (CMAKE_SIZEOF_VOID_P MATCHES 8)
SET(WIN64 TRUE)
ENDIF (CMAKE_SIZEOF_VOID_P MATCHES 8)
-ADD_DEFINITIONS(-D_WIN32 -D_LIB -DMYSQL_SERVER)
-
# Include directories under xtradb
INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/storage/xtradb/include
${CMAKE_SOURCE_DIR}/storage/xtradb/handler)
@@ -89,9 +79,6 @@ SET(XTRADB_SOURCES btr/btr0btr.c btr/btr0cur.c btr/btr0pcur.c btr/btr0sea.c
usr/usr0sess.c
ut/ut0byte.c ut/ut0dbg.c ut/ut0mem.c ut/ut0rbt.c ut/ut0rnd.c ut/ut0ut.c ut/ut0vec.c
ut/ut0list.c ut/ut0wqueue.c)
-# Windows atomics do not perform well. Disable Windows atomics by default.
-# See bug#52102 for details.
-#ADD_DEFINITIONS(-DHAVE_WINDOWS_ATOMICS -DINNODB_RW_LOCKS_USE_ATOMICS -DHAVE_IB_PAUSE_INSTRUCTION)
-ADD_DEFINITIONS(-DHAVE_IB_PAUSE_INSTRUCTION)
+
MYSQL_STORAGE_ENGINE(XTRADB)
diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc
index f409b88cc7b..a511d764b24 100644
--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -56,7 +56,9 @@ Place, Suite 330, Boston, MA 02111-1307 USA
#include <m_ctype.h>
#include <mysys_err.h>
#include <mysql/plugin.h>
-
+#ifdef _WIN32
+#include <io.h>
+#endif
/** @file ha_innodb.cc */
/* Include necessary InnoDB headers */
@@ -1239,7 +1241,28 @@ innobase_mysql_tmpfile(void)
will be passed to fdopen(), it will be closed by invoking
fclose(), which in turn will invoke close() instead of
my_close(). */
+#ifdef _WIN32
+ /* Note that on Windows, the integer returned by mysql_tmpfile
+ has no relation to a C runtime file descriptor. Here, we need
+ to call my_get_osfhandle to get the HANDLE and then convert it
+ to a C runtime file descriptor. */
+ {
+ HANDLE hFile = my_get_osfhandle(fd);
+ HANDLE hDup;
+ BOOL bOK =
+ DuplicateHandle(GetCurrentProcess(), hFile, GetCurrentProcess(),
+ &hDup, 0, FALSE, DUPLICATE_SAME_ACCESS);
+ if(bOK) {
+ fd2 = _open_osfhandle((intptr_t)hDup,0);
+ }
+ else {
+ my_osmaperr(GetLastError());
+ fd2 = -1;
+ }
+ }
+#else
fd2 = dup(fd);
+#endif
if (fd2 < 0) {
DBUG_PRINT("error",("Got error %d on dup",fd2));
my_errno=errno;
diff --git a/storage/xtradb/include/os0file.h b/storage/xtradb/include/os0file.h
index 732e930517b..46bda4c6b45 100644
--- a/storage/xtradb/include/os0file.h
+++ b/storage/xtradb/include/os0file.h
@@ -152,8 +152,8 @@ log. */
#define OS_FILE_LOG 256 /* This can be ORed to type */
/* @} */
-#define OS_AIO_N_PENDING_IOS_PER_THREAD 32 /*!< Win NT does not allow more
- than 64 */
+#define OS_AIO_N_PENDING_IOS_PER_THREAD 256 /*!< Windows might be able to handle
+more */
/** Modes for aio operations @{ */
#define OS_AIO_NORMAL 21 /*!< Normal asynchronous i/o not for ibuf
@@ -183,6 +183,10 @@ log. */
#define OS_WIN95 2 /*!< Microsoft Windows 95 */
#define OS_WINNT 3 /*!< Microsoft Windows NT 3.x */
#define OS_WIN2000 4 /*!< Microsoft Windows 2000 */
+#define OS_WINXP 5 /*!< Microsoft Windows XP */
+#define OS_WINVISTA 6 /*!< Microsoft Windows Vista */
+#define OS_WIN7 7 /*!< Microsoft Windows 7 */
+
extern ulint os_n_file_reads;
extern ulint os_n_file_writes;
diff --git a/storage/xtradb/include/os0sync.h b/storage/xtradb/include/os0sync.h
index 7366e2c3402..002abebcb0b 100644
--- a/storage/xtradb/include/os0sync.h
+++ b/storage/xtradb/include/os0sync.h
@@ -37,29 +37,19 @@ Created 9/6/1995 Heikki Tuuri
#include "univ.i"
#include "ut0lst.h"
-#ifdef __WIN__
-
+#ifdef _WIN32
+/** Native event (slow)*/
+typedef HANDLE os_native_event_t;
/** Native mutex */
-#define os_fast_mutex_t CRITICAL_SECTION
-
-/** Native event */
-typedef HANDLE os_native_event_t;
-
-/** Operating system event */
-typedef struct os_event_struct os_event_struct_t;
-/** Operating system event handle */
-typedef os_event_struct_t* os_event_t;
-
-/** An asynchronous signal sent between threads */
-struct os_event_struct {
- os_native_event_t handle;
- /*!< Windows event */
- UT_LIST_NODE_T(os_event_struct_t) os_event_list;
- /*!< list of all created events */
-};
+typedef CRITICAL_SECTION os_fast_mutex_t;
+/** Native condition variable */
+typedef CONDITION_VARIABLE os_cond_t;
#else
/** Native mutex */
typedef pthread_mutex_t os_fast_mutex_t;
+/** Native condition variable */
+typedef pthread_cond_t os_cond_t;
+#endif
/** Operating system event */
typedef struct os_event_struct os_event_struct_t;
@@ -68,6 +58,9 @@ typedef os_event_struct_t* os_event_t;
/** An asynchronous signal sent between threads */
struct os_event_struct {
+#ifdef _WIN32
+ HANDLE handle; /*!< kernel event object, slow, used on older Windows */
+#endif
os_fast_mutex_t os_mutex; /*!< this mutex protects the next
fields */
ibool is_set; /*!< this is TRUE when the event is
@@ -76,12 +69,14 @@ struct os_event_struct {
this event */
ib_int64_t signal_count; /*!< this is incremented each time
the event becomes signaled */
- pthread_cond_t cond_var; /*!< condition variable is used in
+ os_cond_t cond_var; /*!< condition variable is used in
waiting for the event */
UT_LIST_NODE_T(os_event_struct_t) os_event_list;
/*!< list of all created events */
};
-#endif
+
+
+
/** Operating system mutex */
typedef struct os_mutex_struct os_mutex_str_t;
@@ -186,33 +181,23 @@ os_event_wait_low(
os_event_reset(). */
#define os_event_wait(event) os_event_wait_low(event, 0)
-
+#define os_event_wait_time(event, t) os_event_wait_time_low(event, t, 0)
/**********************************************************//**
Waits for an event object until it is in the signaled state or
-a timeout is exceeded.
+a timeout is exceeded. In Unix the timeout is always infinite.
@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
UNIV_INTERN
ulint
-os_event_wait_time(
-/*===============*/
- os_event_t event, /*!< in: event to wait */
- ulint wtime); /*!< in: timeout in microseconds, or
- OS_SYNC_INFINITE_TIME */
-#ifdef __WIN__
-/**********************************************************//**
-Waits for any event in an OS native event array. Returns if even a single
-one is signaled or becomes signaled.
-@return index of the event which was signaled */
-UNIV_INTERN
-ulint
-os_event_wait_multiple(
+os_event_wait_time_low(
/*===================*/
- ulint n, /*!< in: number of events in the
- array */
- os_native_event_t* native_event_array);
- /*!< in: pointer to an array of event
- handles */
-#endif
+ os_event_t event, /*!< in: event to wait */
+ ulint time_in_usec, /*!< in: timeout in
+ microseconds, or
+ OS_SYNC_INFINITE_TIME */
+ ib_int64_t reset_sig_count); /*!< in: zero or the value
+ returned by previous call of
+ os_event_reset(). */
+
/*********************************************************//**
Creates an operating system mutex semaphore. Because these are slow, the
mutex semaphore of InnoDB itself (mutex_t) should be used where possible.
@@ -385,7 +370,7 @@ Returns the old value of *ptr, atomically sets *ptr to new_val */
# define os_atomic_test_and_set_byte(ptr, new_val) \
atomic_swap_uchar(ptr, new_val)
-#elif defined(HAVE_WINDOWS_ATOMICS)
+#elif defined(_WIN32)
#define HAVE_ATOMIC_BUILTINS
diff --git a/storage/xtradb/include/os0sync.ic b/storage/xtradb/include/os0sync.ic
index 1f3ce38fa65..2c6c1dbe629 100644
--- a/storage/xtradb/include/os0sync.ic
+++ b/storage/xtradb/include/os0sync.ic
@@ -28,8 +28,7 @@ Created 9/6/1995 Heikki Tuuri
#endif
/**********************************************************//**
-Acquires ownership of a fast mutex. Currently in Windows this is the same
-as os_fast_mutex_lock!
+Acquires ownership of a fast mutex.
@return 0 if success, != 0 if was reserved by another thread */
UNIV_INLINE
ulint
@@ -38,9 +37,9 @@ os_fast_mutex_trylock(
os_fast_mutex_t* fast_mutex) /*!< in: mutex to acquire */
{
#ifdef __WIN__
- EnterCriticalSection(fast_mutex);
-
- return(0);
+ if (TryEnterCriticalSection(fast_mutex))
+ return 0;
+ return(1);
#else
/* NOTE that the MySQL my_pthread.h redefines pthread_mutex_trylock
so that it returns 0 on success. In the operating system
diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h
index ac7ae8c5627..29d88331532 100644
--- a/storage/xtradb/include/srv0srv.h
+++ b/storage/xtradb/include/srv0srv.h
@@ -112,7 +112,9 @@ extern ulint srv_check_file_format_at_startup;
on duplicate key checking and foreign key checking */
extern ibool srv_locks_unsafe_for_binlog;
#endif /* !UNIV_HOTBACKUP */
-
+#ifdef __WIN__
+extern ibool srv_use_native_conditions;
+#endif
extern ulint srv_n_data_files;
extern char** srv_data_file_names;
extern ulint* srv_data_file_sizes;
diff --git a/storage/xtradb/include/sync0sync.h b/storage/xtradb/include/sync0sync.h
index f2ff83101ab..6aaab1cc7d7 100644
--- a/storage/xtradb/include/sync0sync.h
+++ b/storage/xtradb/include/sync0sync.h
@@ -45,7 +45,7 @@ Created 9/5/1995 Heikki Tuuri
extern my_bool timed_mutexes;
#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */
-#ifdef HAVE_WINDOWS_ATOMICS
+#ifdef _WIN32
typedef LONG lock_word_t; /*!< On Windows, InterlockedExchange operates
on LONG variable */
#else
diff --git a/storage/xtradb/os/os0file.c b/storage/xtradb/os/os0file.c
index 5b8e656d8b2..7a8e5802b19 100644
--- a/storage/xtradb/os/os0file.c
+++ b/storage/xtradb/os/os0file.c
@@ -121,6 +121,12 @@ typedef struct os_aio_slot_struct os_aio_slot_t;
/** The asynchronous i/o array slot structure */
struct os_aio_slot_struct{
+#ifdef WIN_ASYNC_IO
+ OVERLAPPED control; /*!< Windows control block for the
+ aio request, MUST be first element in the structure*/
+ void *arr; /*!< Array this slot belongs to*/
+#endif
+
ibool is_read; /*!< TRUE if a read operation */
ulint pos; /*!< index of the slot in the aio
array */
@@ -148,12 +154,6 @@ struct os_aio_slot_struct{
and which can be used to identify
which pending aio operation was
completed */
-#ifdef WIN_ASYNC_IO
- os_event_t event; /*!< event object we need in the
- OVERLAPPED struct */
- OVERLAPPED control; /*!< Windows control block for the
- aio request */
-#endif
};
/** The asynchronous i/o array structure */
@@ -182,15 +182,6 @@ struct os_aio_array_struct{
/*!< Number of reserved slots in the
aio array outside the ibuf segment */
os_aio_slot_t* slots; /*!< Pointer to the slots in the array */
-#ifdef __WIN__
- os_native_event_t* native_events;
- /*!< Pointer to an array of OS native
- event handles where we copied the
- handles from slots, in the same
- order. This can be used in
- WaitForMultipleObjects; used only in
- Windows */
-#endif
};
/** Array of events used in simulated aio */
@@ -250,6 +241,14 @@ UNIV_INTERN ulint os_n_pending_writes = 0;
/** Number of pending read operations */
UNIV_INTERN ulint os_n_pending_reads = 0;
+
+#ifdef _WIN32
+/** IO completion port used by background io threads */
+static HANDLE completion_port;
+/** Thread local storage index for the per-thread event used for synchronous IO */
+static DWORD tls_sync_io = TLS_OUT_OF_INDEXES;
+#endif
+
/***********************************************************************//**
Gets the operating system version. Currently works only on Windows.
@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */
@@ -270,10 +269,16 @@ os_get_os_version(void)
} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
return(OS_WIN95);
} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
- if (os_info.dwMajorVersion <= 4) {
- return(OS_WINNT);
- } else {
- return(OS_WIN2000);
+ switch(os_info.dwMajorVersion){
+ case 3:
+ case 4:
+ return OS_WINNT;
+ case 5:
+ return (os_info.dwMinorVersion == 0)?OS_WIN2000 : OS_WINXP;
+ case 6:
+ return (os_info.dwMinorVersion == 0)?OS_WINVISTA : OS_WIN7;
+ default:
+ return OS_WIN7;
}
} else {
ut_error;
@@ -286,6 +291,86 @@ os_get_os_version(void)
#endif
}
+
+#ifdef _WIN32
+/*
+Windows : Handling synchronous IO on files opened asynchronously.
+
+If file is opened for asynchronous IO (FILE_FLAG_OVERLAPPED) and also bound to
+a completion port, then every IO on this file would normally be enqueued to the
+completion port. Sometimes however we would like to do a synchronous IO. This is
+possible if we initialize overlapped.hEvent with a valid event and set its
+lowest-order bit to 1 (see MSDN ReadFile and WriteFile description for more info).
+
+We'll create this special event once for each thread and store in thread local
+storage.
+*/
+
+
+/***********************************************************************//**
+Initialize the TLS index for the event handle used for synchronous IO on files
+that might be opened with FILE_FLAG_OVERLAPPED.
+*/
+static void win_init_syncio_event()
+{
+ tls_sync_io = TlsAlloc();
+ ut_a(tls_sync_io != TLS_OUT_OF_INDEXES);
+}
+
+/***********************************************************************//**
+Retrieve the per-thread event for doing synchronous IO on asynchronously opened files
+*/
+static HANDLE win_get_syncio_event()
+{
+ HANDLE h;
+ if(tls_sync_io == TLS_OUT_OF_INDEXES){
+ win_init_syncio_event();
+ }
+
+ h = (HANDLE)TlsGetValue(tls_sync_io);
+ if (h)
+ return h;
+ h = CreateEventA(NULL, FALSE, FALSE, NULL);
+ ut_a(h);
+ h = (HANDLE)((uintptr_t)h | 1);
+ TlsSetValue(tls_sync_io, h);
+ return h;
+}
+
+/*
+ TLS destructor, inspired by Chromium code
+ http://src.chromium.org/svn/trunk/src/base/threading/thread_local_storage_win.cc
+*/
+
+static void win_free_syncio_event()
+{
+ HANDLE h = win_get_syncio_event();
+ if (h) {
+ CloseHandle(h);
+ }
+}
+
+static void NTAPI win_tls_thread_exit(PVOID module, DWORD reason, PVOID reserved) {
+ if (DLL_THREAD_DETACH == reason || DLL_PROCESS_DETACH == reason)
+ win_free_syncio_event();
+}
+
+#ifdef _WIN64
+#pragma comment(linker, "/INCLUDE:_tls_used")
+#pragma comment(linker, "/INCLUDE:p_thread_callback_base")
+#pragma const_seg(".CRT$XLB")
+extern const PIMAGE_TLS_CALLBACK p_thread_callback_base;
+const PIMAGE_TLS_CALLBACK p_thread_callback_base = win_tls_thread_exit;
+#pragma data_seg()
+#else
+#pragma comment(linker, "/INCLUDE:__tls_used")
+#pragma comment(linker, "/INCLUDE:_p_thread_callback_base")
+#pragma data_seg(".CRT$XLB")
+PIMAGE_TLS_CALLBACK p_thread_callback_base = win_tls_thread_exit;
+#pragma data_seg()
+#endif
+#endif /*_WIN32 */
+
/***********************************************************************//**
Retrieves the last error number if an error occurs in a file io function.
The number should be retrieved before any other OS calls (because they may
@@ -611,6 +696,9 @@ os_io_init_simple(void)
for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
os_file_seek_mutexes[i] = os_mutex_create(NULL);
}
+#ifdef _WIN32
+ win_init_syncio_event();
+#endif
}
/***********************************************************************//**
@@ -1358,6 +1446,16 @@ try_again:
ut_error;
}
+ if (type == OS_LOG_FILE) {
+ if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
+ /* Map O_DSYNC to WRITE_THROUGH */
+ attributes |= FILE_FLAG_WRITE_THROUGH;
+ } else if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
+ /* Open log file without buffering */
+ attributes |= FILE_FLAG_NO_BUFFERING;
+ }
+ }
+
file = CreateFile((LPCTSTR) name,
GENERIC_READ | GENERIC_WRITE, /* read and write
access */
@@ -1402,6 +1500,9 @@ try_again:
}
} else {
*success = TRUE;
+ if (os_aio_use_native_aio && ((attributes & FILE_FLAG_OVERLAPPED) != 0)) {
+ ut_a(CreateIoCompletionPort(file, completion_port, 0, 0));
+ }
}
return(file);
@@ -2350,13 +2451,9 @@ _os_file_read(
#ifdef __WIN__
BOOL ret;
DWORD len;
- DWORD ret2;
- DWORD low;
- DWORD high;
ibool retry;
-#ifndef UNIV_HOTBACKUP
- ulint i;
-#endif /* !UNIV_HOTBACKUP */
+ OVERLAPPED overlapped;
+
/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
no more than 32 bits. */
@@ -2371,41 +2468,21 @@ try_again:
ut_ad(buf);
ut_ad(n > 0);
- low = (DWORD) offset;
- high = (DWORD) offset_high;
-
os_mutex_enter(os_file_count_mutex);
os_n_pending_reads++;
os_mutex_exit(os_file_count_mutex);
-#ifndef UNIV_HOTBACKUP
- /* Protect the seek / read operation with a mutex */
- i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
-
- os_mutex_enter(os_file_seek_mutexes[i]);
-#endif /* !UNIV_HOTBACKUP */
-
- ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
-
- if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
-
-#ifndef UNIV_HOTBACKUP
- os_mutex_exit(os_file_seek_mutexes[i]);
-#endif /* !UNIV_HOTBACKUP */
-
- os_mutex_enter(os_file_count_mutex);
- os_n_pending_reads--;
- os_mutex_exit(os_file_count_mutex);
-
- goto error_handling;
+ memset (&overlapped, 0, sizeof (overlapped));
+ overlapped.Offset = (DWORD)offset;
+ overlapped.OffsetHigh = (DWORD)offset_high;
+ overlapped.hEvent = win_get_syncio_event();
+ ret = ReadFile(file, buf, n, NULL, &overlapped);
+ if (ret) {
+ ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, FALSE);
}
-
- ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
-
-#ifndef UNIV_HOTBACKUP
- os_mutex_exit(os_file_seek_mutexes[i]);
-#endif /* !UNIV_HOTBACKUP */
-
+ else if(GetLastError() == ERROR_IO_PENDING) {
+ ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE);
+ }
os_mutex_enter(os_file_count_mutex);
os_n_pending_reads--;
os_mutex_exit(os_file_count_mutex);
@@ -2433,9 +2510,6 @@ try_again:
(ulong)n, (ulong)offset_high,
(ulong)offset, (long)ret);
#endif /* __WIN__ */
-#ifdef __WIN__
-error_handling:
-#endif
retry = os_file_handle_error(NULL, "read");
if (retry) {
@@ -2477,13 +2551,11 @@ os_file_read_no_error_handling(
#ifdef __WIN__
BOOL ret;
DWORD len;
- DWORD ret2;
- DWORD low;
- DWORD high;
ibool retry;
-#ifndef UNIV_HOTBACKUP
- ulint i;
-#endif /* !UNIV_HOTBACKUP */
+ OVERLAPPED overlapped;
+ overlapped.Offset = (DWORD)offset;
+ overlapped.OffsetHigh = (DWORD)offset_high;
+
/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
no more than 32 bits. */
@@ -2498,41 +2570,21 @@ try_again:
ut_ad(buf);
ut_ad(n > 0);
- low = (DWORD) offset;
- high = (DWORD) offset_high;
-
os_mutex_enter(os_file_count_mutex);
os_n_pending_reads++;
os_mutex_exit(os_file_count_mutex);
-#ifndef UNIV_HOTBACKUP
- /* Protect the seek / read operation with a mutex */
- i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
-
- os_mutex_enter(os_file_seek_mutexes[i]);
-#endif /* !UNIV_HOTBACKUP */
-
- ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
-
- if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
-
-#ifndef UNIV_HOTBACKUP
- os_mutex_exit(os_file_seek_mutexes[i]);
-#endif /* !UNIV_HOTBACKUP */
-
- os_mutex_enter(os_file_count_mutex);
- os_n_pending_reads--;
- os_mutex_exit(os_file_count_mutex);
-
- goto error_handling;
+ memset (&overlapped, 0, sizeof (overlapped));
+ overlapped.Offset = (DWORD)offset;
+ overlapped.OffsetHigh = (DWORD)offset_high;
+ overlapped.hEvent = win_get_syncio_event();
+ ret = ReadFile(file, buf, n, NULL, &overlapped);
+ if (ret) {
+ ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, FALSE);
}
-
- ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
-
-#ifndef UNIV_HOTBACKUP
- os_mutex_exit(os_file_seek_mutexes[i]);
-#endif /* !UNIV_HOTBACKUP */
-
+ else if(GetLastError() == ERROR_IO_PENDING) {
+ ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE);
+ }
os_mutex_enter(os_file_count_mutex);
os_n_pending_reads--;
os_mutex_exit(os_file_count_mutex);
@@ -2554,9 +2606,6 @@ try_again:
return(TRUE);
}
#endif /* __WIN__ */
-#ifdef __WIN__
-error_handling:
-#endif
retry = os_file_handle_error_no_exit(NULL, "read");
if (retry) {
@@ -2609,14 +2658,9 @@ os_file_write(
#ifdef __WIN__
BOOL ret;
DWORD len;
- DWORD ret2;
- DWORD low;
- DWORD high;
ulint n_retries = 0;
ulint err;
-#ifndef UNIV_HOTBACKUP
- ulint i;
-#endif /* !UNIV_HOTBACKUP */
+ OVERLAPPED overlapped;
/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
no more than 32 bits. */
@@ -2629,64 +2673,23 @@ os_file_write(
ut_ad(buf);
ut_ad(n > 0);
retry:
- low = (DWORD) offset;
- high = (DWORD) offset_high;
os_mutex_enter(os_file_count_mutex);
os_n_pending_writes++;
os_mutex_exit(os_file_count_mutex);
-#ifndef UNIV_HOTBACKUP
- /* Protect the seek / write operation with a mutex */
- i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
-
- os_mutex_enter(os_file_seek_mutexes[i]);
-#endif /* !UNIV_HOTBACKUP */
-
- ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
-
- if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
-
-#ifndef UNIV_HOTBACKUP
- os_mutex_exit(os_file_seek_mutexes[i]);
-#endif /* !UNIV_HOTBACKUP */
-
- os_mutex_enter(os_file_count_mutex);
- os_n_pending_writes--;
- os_mutex_exit(os_file_count_mutex);
-
- ut_print_timestamp(stderr);
-
- fprintf(stderr,
- " InnoDB: Error: File pointer positioning to"
- " file %s failed at\n"
- "InnoDB: offset %lu %lu. Operating system"
- " error number %lu.\n"
- "InnoDB: Some operating system error numbers"
- " are described at\n"
- "InnoDB: "
- REFMAN "operating-system-error-codes.html\n",
- name, (ulong) offset_high, (ulong) offset,
- (ulong) GetLastError());
+ memset (&overlapped, 0, sizeof (overlapped));
+ overlapped.Offset = (DWORD)offset;
+ overlapped.OffsetHigh = (DWORD)offset_high;
- return(FALSE);
+ overlapped.hEvent = win_get_syncio_event();
+ ret = WriteFile(file, buf, n, NULL, &overlapped);
+ if (ret) {
+ ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, FALSE);
}
-
- ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
-
- /* Always do fsync to reduce the probability that when the OS crashes,
- a database page is only partially physically written to disk. */
-
-# ifdef UNIV_DO_FLUSH
- if (!os_do_not_call_flush_at_each_write) {
- ut_a(TRUE == os_file_flush(file));
+ else if(GetLastError() == ERROR_IO_PENDING) {
+ ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE);
}
-# endif /* UNIV_DO_FLUSH */
-
-#ifndef UNIV_HOTBACKUP
- os_mutex_exit(os_file_seek_mutexes[i]);
-#endif /* !UNIV_HOTBACKUP */
-
os_mutex_enter(os_file_count_mutex);
os_n_pending_writes--;
os_mutex_exit(os_file_count_mutex);
@@ -3071,9 +3074,6 @@ os_aio_array_create(
os_aio_array_t* array;
ulint i;
os_aio_slot_t* slot;
-#ifdef WIN_ASYNC_IO
- OVERLAPPED* over;
-#endif
ut_a(n > 0);
ut_a(n_segments > 0);
@@ -3089,23 +3089,12 @@ os_aio_array_create(
array->n_segments = n_segments;
array->n_reserved = 0;
array->slots = ut_malloc(n * sizeof(os_aio_slot_t));
-#ifdef __WIN__
- array->native_events = ut_malloc(n * sizeof(os_native_event_t));
-#endif
+
for (i = 0; i < n; i++) {
slot = os_aio_array_get_nth_slot(array, i);
-
slot->pos = i;
slot->reserved = FALSE;
-#ifdef WIN_ASYNC_IO
- slot->event = os_event_create(NULL);
- over = &(slot->control);
-
- over->hEvent = slot->event->handle;
-
- *((array->native_events) + i) = over->hEvent;
-#endif
}
return(array);
@@ -3119,18 +3108,7 @@ os_aio_array_free(
/*==============*/
os_aio_array_t* array) /*!< in, own: array to free */
{
-#ifdef WIN_ASYNC_IO
- ulint i;
- for (i = 0; i < array->n_slots; i++) {
- os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i);
- os_event_free(slot->event);
- }
-#endif /* WIN_ASYNC_IO */
-
-#ifdef __WIN__
- ut_free(array->native_events);
-#endif /* __WIN__ */
os_mutex_free(array->mutex);
os_event_free(array->not_full);
os_event_free(array->is_empty);
@@ -3209,7 +3187,11 @@ os_aio_init(
}
os_last_printout = time(NULL);
-
+#ifdef _WIN32
+ ut_a(completion_port == 0);
+ completion_port = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0);
+ ut_a(completion_port);
+#endif
}
/***********************************************************************
@@ -3251,11 +3233,11 @@ os_aio_array_wake_win_aio_at_shutdown(
/*==================================*/
os_aio_array_t* array) /*!< in: aio array */
{
- ulint i;
-
- for (i = 0; i < array->n_slots; i++) {
+ if(completion_port)
+ {
+ ut_a(CloseHandle(completion_port));
+ completion_port = 0;
- os_event_set((array->slots + i)->event);
}
}
#endif
@@ -3480,7 +3462,8 @@ found:
control = &(slot->control);
control->Offset = (DWORD)offset;
control->OffsetHigh = (DWORD)offset_high;
- os_event_reset(slot->event);
+ control->hEvent = 0;
+ slot->arr = array;
#endif
os_mutex_exit(array->mutex);
@@ -3517,9 +3500,6 @@ os_aio_array_free_slot(
os_event_set(array->is_empty);
}
-#ifdef WIN_ASYNC_IO
- os_event_reset(slot->event);
-#endif
os_mutex_exit(array->mutex);
}
@@ -3689,12 +3669,8 @@ os_aio(
os_aio_array_t* array;
os_aio_slot_t* slot;
#ifdef WIN_ASYNC_IO
- ibool retval;
- BOOL ret = TRUE;
DWORD len = (DWORD) n;
- struct fil_node_struct * dummy_mess1;
- void* dummy_mess2;
- ulint dummy_type;
+ BOOL ret;
#endif
ulint err = 0;
ibool retry;
@@ -3713,26 +3689,23 @@ os_aio(
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
- if (mode == OS_AIO_SYNC
-#ifdef WIN_ASYNC_IO
- && !os_aio_use_native_aio
-#endif
- ) {
+ if (mode == OS_AIO_SYNC)
+ {
+ ibool ret;
/* This is actually an ordinary synchronous read or write:
- no need to use an i/o-handler thread. NOTE that if we use
- Windows async i/o, Windows does not allow us to use
- ordinary synchronous os_file_read etc. on the same file,
- therefore we have built a special mechanism for synchronous
- wait in the Windows case. */
+ no need to use an i/o-handler thread */
if (type == OS_FILE_READ) {
- return(_os_file_read(file, buf, offset,
- offset_high, n, trx));
+ ret = _os_file_read(file, buf, offset,
+ offset_high, n, trx);
}
+ else {
+ ut_a(type == OS_FILE_WRITE);
- ut_a(type == OS_FILE_WRITE);
-
- return(os_file_write(name, file, buf, offset, offset_high, n));
+ ret = os_file_write(name, file, buf, offset, offset_high, n);
+ }
+ ut_a(ret);
+ return ret;
}
try_again:
@@ -3775,6 +3748,8 @@ try_again:
ret = ReadFile(file, buf, (DWORD)n, &len,
&(slot->control));
+ if(!ret && GetLastError() != ERROR_IO_PENDING)
+ err = 1;
#endif
} else {
if (!wake_later) {
@@ -3789,6 +3764,8 @@ try_again:
os_n_file_writes++;
ret = WriteFile(file, buf, (DWORD)n, &len,
&(slot->control));
+ if(!ret && GetLastError() != ERROR_IO_PENDING)
+ err = 1;
#endif
} else {
if (!wake_later) {
@@ -3801,34 +3778,7 @@ try_again:
ut_error;
}
-#ifdef WIN_ASYNC_IO
- if (os_aio_use_native_aio) {
- if ((ret && len == n)
- || (!ret && GetLastError() == ERROR_IO_PENDING)) {
- /* aio was queued successfully! */
-
- if (mode == OS_AIO_SYNC) {
- /* We want a synchronous i/o operation on a
- file where we also use async i/o: in Windows
- we must use the same wait mechanism as for
- async i/o */
-
- retval = os_aio_windows_handle(ULINT_UNDEFINED,
- slot->pos,
- &dummy_mess1,
- &dummy_mess2,
- &dummy_type,
- &space_id);
-
- return(retval);
- }
-
- return(TRUE);
- }
- err = 1; /* Fall through the next if */
- }
-#endif
if (err == 0) {
/* aio was queued successfully! */
@@ -3881,52 +3831,20 @@ os_aio_windows_handle(
ulint* space_id)
{
ulint orig_seg = segment;
- os_aio_array_t* array;
os_aio_slot_t* slot;
- ulint n;
- ulint i;
ibool ret_val;
BOOL ret;
DWORD len;
BOOL retry = FALSE;
+ ULONG_PTR dummy_key;
- if (segment == ULINT_UNDEFINED) {
- array = os_aio_sync_array;
- segment = 0;
- } else {
- segment = os_aio_get_array_and_local_segment(&array, segment);
- }
-
- /* NOTE! We only access constant fields in os_aio_array. Therefore
- we do not have to acquire the protecting mutex yet */
-
- ut_ad(os_aio_validate());
- ut_ad(segment < array->n_segments);
-
- n = array->n_slots;
-
- if (array == os_aio_sync_array) {
- os_event_wait(os_aio_array_get_nth_slot(array, pos)->event);
- i = pos;
- } else {
- srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
- i = os_event_wait_multiple(n,
- (array->native_events)
- );
- }
-
- os_mutex_enter(array->mutex);
-
- slot = os_aio_array_get_nth_slot(array, i);
+ ret = GetQueuedCompletionStatus(completion_port, &len, &dummy_key,
+ (OVERLAPPED **)&slot, INFINITE);
- ut_a(slot->reserved);
-
- if (orig_seg != ULINT_UNDEFINED) {
- srv_set_io_thread_op_info(orig_seg,
- "get windows aio return value");
+ if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+ os_thread_exit(NULL);
}
- ret = GetOverlappedResult(slot->file, &(slot->control), &len, TRUE);
*message1 = slot->message1;
*message2 = slot->message2;
@@ -3951,8 +3869,6 @@ os_aio_windows_handle(
ret_val = FALSE;
}
- os_mutex_exit(array->mutex);
-
if (retry) {
/* retry failed read/write operation synchronously.
No need to hold array->mutex. */
@@ -3961,37 +3877,19 @@ os_aio_windows_handle(
switch (slot->type) {
case OS_FILE_WRITE:
- ret = WriteFile(slot->file, slot->buf,
- (DWORD) slot->len, &len,
- &(slot->control));
-
+ ret_val = os_file_write(slot->name, slot->file, slot->buf,
+ slot->control.Offset, slot->control.OffsetHigh, slot->len);
break;
case OS_FILE_READ:
- ret = ReadFile(slot->file, slot->buf,
- (DWORD) slot->len, &len,
- &(slot->control));
-
+ ret_val = os_file_read(slot->file, slot->buf,
+ slot->control.Offset, slot->control.OffsetHigh, slot->len);
break;
default:
ut_error;
}
-
- if (!ret && GetLastError() == ERROR_IO_PENDING) {
- /* aio was queued successfully!
- We want a synchronous i/o operation on a
- file where we also use async i/o: in Windows
- we must use the same wait mechanism as for
- async i/o */
-
- ret = GetOverlappedResult(slot->file,
- &(slot->control),
- &len, TRUE);
- }
-
- ret_val = ret && len == slot->len;
}
- os_aio_array_free_slot(array, slot);
+ os_aio_array_free_slot((os_aio_array_t *)slot->arr, slot);
return(ret_val);
}
diff --git a/storage/xtradb/os/os0sync.c b/storage/xtradb/os/os0sync.c
index dba997927cb..75bd6d44c2e 100644
--- a/storage/xtradb/os/os0sync.c
+++ b/storage/xtradb/os/os0sync.c
@@ -38,6 +38,7 @@ Created 9/6/1995 Heikki Tuuri
#include "ut0mem.h"
#include "srv0start.h"
+#include "srv0srv.h"
/* Type definition for an operating system mutex struct */
struct os_mutex_struct{
@@ -74,11 +75,225 @@ UNIV_INTERN ulint os_event_count = 0;
UNIV_INTERN ulint os_mutex_count = 0;
UNIV_INTERN ulint os_fast_mutex_count = 0;
+/* The number of microseconds in a second. */
+static const ulint MICROSECS_IN_A_SECOND = 1000000;
+
/* Because a mutex is embedded inside an event and there is an
event embedded inside a mutex, on free, this generates a recursive call.
This version of the free event function doesn't acquire the global lock */
static void os_event_free_internal(os_event_t event);
+/* On Windows (Vista and later), load function pointers for condition
+variable handling. Those functions are not available in prior versions,
+so we have to use them via runtime loading, as long as we support XP. */
+static void os_cond_module_init(void);
+
+#ifdef __WIN__
+/* Prototypes and function pointers for condition variable functions */
+typedef VOID (WINAPI* InitializeConditionVariableProc)
+ (PCONDITION_VARIABLE ConditionVariable);
+static InitializeConditionVariableProc initialize_condition_variable;
+
+typedef BOOL (WINAPI* SleepConditionVariableCSProc)
+ (PCONDITION_VARIABLE ConditionVariable,
+ PCRITICAL_SECTION CriticalSection,
+ DWORD dwMilliseconds);
+static SleepConditionVariableCSProc sleep_condition_variable;
+
+typedef VOID (WINAPI* WakeAllConditionVariableProc)
+ (PCONDITION_VARIABLE ConditionVariable);
+static WakeAllConditionVariableProc wake_all_condition_variable;
+
+typedef VOID (WINAPI* WakeConditionVariableProc)
+ (PCONDITION_VARIABLE ConditionVariable);
+static WakeConditionVariableProc wake_condition_variable;
+#endif
+
+/*********************************************************//**
+Initialize condition variable */
+UNIV_INLINE
+void
+os_cond_init(
+/*=========*/
+ os_cond_t* cond) /*!< in: condition variable. */
+{
+ ut_a(cond);
+
+#ifdef __WIN__
+ ut_a(initialize_condition_variable != NULL);
+ initialize_condition_variable(cond);
+#else
+ ut_a(pthread_cond_init(cond, NULL) == 0);
+#endif
+}
+
+/*********************************************************//**
+Do a timed wait on condition variable.
+@return TRUE if timed out, FALSE otherwise */
+UNIV_INLINE
+ibool
+os_cond_wait_timed(
+/*===============*/
+ os_cond_t* cond, /*!< in: condition variable. */
+ os_fast_mutex_t* mutex, /*!< in: fast mutex */
+#ifndef __WIN__
+ const struct timespec* abstime /*!< in: timeout */
+#else
+ DWORD time_in_ms /*!< in: timeout in
+ milliseconds*/
+#endif /* !__WIN__ */
+)
+{
+#ifdef __WIN__
+ BOOL ret;
+ DWORD err;
+
+ ut_a(sleep_condition_variable != NULL);
+
+ ret = sleep_condition_variable(cond, mutex, time_in_ms);
+
+ if (!ret) {
+ err = GetLastError();
+ /* From http://msdn.microsoft.com/en-us/library/ms686301%28VS.85%29.aspx,
+ "Condition variables are subject to spurious wakeups
+ (those not associated with an explicit wake) and stolen wakeups
+ (another thread manages to run before the woken thread)."
+ Check for both types of timeouts.
+ Conditions are checked by the caller.*/
+ if ((err == WAIT_TIMEOUT) || (err == ERROR_TIMEOUT)) {
+ return(TRUE);
+ }
+ }
+
+ ut_a(ret);
+
+ return(FALSE);
+#else
+ int ret;
+
+ ret = pthread_cond_timedwait(cond, mutex, abstime);
+
+ switch (ret) {
+ case 0:
+ case ETIMEDOUT:
+ /* We play it safe by checking for EINTR even though
+ according to the POSIX documentation it can't return EINTR. */
+ case EINTR:
+ break;
+
+ default:
+ fprintf(stderr, " InnoDB: pthread_cond_timedwait() returned: "
+ "%d: abstime={%lu,%lu}\n",
+ ret, (ulong) abstime->tv_sec, (ulong) abstime->tv_nsec);
+ ut_error;
+ }
+
+ return(ret == ETIMEDOUT);
+#endif
+}
+/*********************************************************//**
+Wait on condition variable */
+UNIV_INLINE
+void
+os_cond_wait(
+/*=========*/
+ os_cond_t* cond, /*!< in: condition variable. */
+ os_fast_mutex_t* mutex) /*!< in: fast mutex */
+{
+ ut_a(cond);
+ ut_a(mutex);
+
+#ifdef __WIN__
+ ut_a(sleep_condition_variable != NULL);
+ ut_a(sleep_condition_variable(cond, mutex, INFINITE));
+#else
+ ut_a(pthread_cond_wait(cond, mutex) == 0);
+#endif
+}
+
+/*********************************************************//**
+Wakes all threads waiting for condition variable */
+UNIV_INLINE
+void
+os_cond_broadcast(
+/*==============*/
+ os_cond_t* cond) /*!< in: condition variable. */
+{
+ ut_a(cond);
+
+#ifdef __WIN__
+ ut_a(wake_all_condition_variable != NULL);
+ wake_all_condition_variable(cond);
+#else
+ ut_a(pthread_cond_broadcast(cond) == 0);
+#endif
+}
+
+/*********************************************************//**
+Wakes one thread waiting for condition variable */
+UNIV_INLINE
+void
+os_cond_signal(
+/*==========*/
+ os_cond_t* cond) /*!< in: condition variable. */
+{
+ ut_a(cond);
+
+#ifdef __WIN__
+ ut_a(wake_condition_variable != NULL);
+ wake_condition_variable(cond);
+#else
+ ut_a(pthread_cond_signal(cond) == 0);
+#endif
+}
+
+/*********************************************************//**
+Destroys condition variable */
+UNIV_INLINE
+void
+os_cond_destroy(
+/*============*/
+ os_cond_t* cond) /*!< in: condition variable. */
+{
+#ifdef __WIN__
+ /* Do nothing */
+#else
+ ut_a(pthread_cond_destroy(cond) == 0);
+#endif
+}
+
+/*********************************************************//**
+On Windows (Vista and later), load function pointers for condition variable
+handling. Those functions are not available in prior versions, so we have to
+use them via runtime loading, as long as we support XP. */
+static
+void
+os_cond_module_init(void)
+/*=====================*/
+{
+#ifdef __WIN__
+ HMODULE h_dll;
+
+
+ h_dll = GetModuleHandle("kernel32");
+
+ initialize_condition_variable = (InitializeConditionVariableProc)
+ GetProcAddress(h_dll, "InitializeConditionVariable");
+ sleep_condition_variable = (SleepConditionVariableCSProc)
+ GetProcAddress(h_dll, "SleepConditionVariableCS");
+ wake_all_condition_variable = (WakeAllConditionVariableProc)
+ GetProcAddress(h_dll, "WakeAllConditionVariable");
+ wake_condition_variable = (WakeConditionVariableProc)
+ GetProcAddress(h_dll, "WakeConditionVariable");
+
+ /* When using native condition variables, check function pointers */
+ ut_a(initialize_condition_variable);
+ ut_a(sleep_condition_variable);
+ ut_a(wake_all_condition_variable);
+ ut_a(wake_condition_variable);
+#endif
+}
+
/*********************************************************//**
Initializes global event and OS 'slow' mutex lists. */
UNIV_INTERN
@@ -92,6 +307,9 @@ os_sync_init(void)
os_sync_mutex = NULL;
os_sync_mutex_inited = FALSE;
+ /* Now for Windows only */
+ os_cond_module_init();
+
os_sync_mutex = os_mutex_create(NULL);
os_sync_mutex_inited = TRUE;
@@ -146,42 +364,45 @@ os_event_create(
const char* name) /*!< in: the name of the event, if NULL
the event is created without a name */
{
-#ifdef __WIN__
- os_event_t event;
-
- event = ut_malloc(sizeof(struct os_event_struct));
-
- event->handle = CreateEvent(NULL, /* No security attributes */
- TRUE, /* Manual reset */
- FALSE, /* Initial state nonsignaled */
- (LPCTSTR) name);
- if (!event->handle) {
- fprintf(stderr,
- "InnoDB: Could not create a Windows event semaphore;"
- " Windows error %lu\n",
- (ulong) GetLastError());
- }
-#else /* Unix */
os_event_t event;
- UT_NOT_USED(name);
+#ifdef __WIN__
+ if(!srv_use_native_conditions) {
+
+ event = ut_malloc(sizeof(struct os_event_struct));
+
+ event->handle = CreateEvent(NULL,
+ TRUE,
+ FALSE,
+ (LPCTSTR) name);
+ if (!event->handle) {
+ fprintf(stderr,
+ "InnoDB: Could not create a Windows event"
+ " semaphore; Windows error %lu\n",
+ (ulong) GetLastError());
+ }
+ } else /* Windows with condition variables */
+#endif
- event = ut_malloc(sizeof(struct os_event_struct));
+ {
+ UT_NOT_USED(name);
- os_fast_mutex_init(&(event->os_mutex));
+ event = ut_malloc(sizeof(struct os_event_struct));
- ut_a(0 == pthread_cond_init(&(event->cond_var), NULL));
+ os_fast_mutex_init(&(event->os_mutex));
- event->is_set = FALSE;
+ os_cond_init(&(event->cond_var));
- /* We return this value in os_event_reset(), which can then be
- be used to pass to the os_event_wait_low(). The value of zero
- is reserved in os_event_wait_low() for the case when the
- caller does not want to pass any signal_count value. To
- distinguish between the two cases we initialize signal_count
- to 1 here. */
- event->signal_count = 1;
-#endif /* __WIN__ */
+ event->is_set = FALSE;
+
+ /* We return this value in os_event_reset(), which can then be
+ passed to os_event_wait_low(). The value of zero
+ is reserved in os_event_wait_low() for the case when the
+ caller does not want to pass any signal_count value. To
+ distinguish between the two cases we initialize signal_count
+ to 1 here. */
+ event->signal_count = 1;
+ }
/* The os_sync_mutex can be NULL because during startup an event
can be created [ because it's embedded in the mutex/rwlock ] before
@@ -211,10 +432,15 @@ os_event_set(
/*=========*/
os_event_t event) /*!< in: event to set */
{
-#ifdef __WIN__
ut_a(event);
- ut_a(SetEvent(event->handle));
-#else
+
+#ifdef __WIN__
+ if (!srv_use_native_conditions) {
+ ut_a(SetEvent(event->handle));
+ return;
+ }
+#endif
+
ut_a(event);
os_fast_mutex_lock(&(event->os_mutex));
@@ -224,11 +450,10 @@ os_event_set(
} else {
event->is_set = TRUE;
event->signal_count += 1;
- ut_a(0 == pthread_cond_broadcast(&(event->cond_var)));
+ os_cond_broadcast(&(event->cond_var));
}
os_fast_mutex_unlock(&(event->os_mutex));
-#endif
}
/**********************************************************//**
@@ -247,12 +472,14 @@ os_event_reset(
{
ib_int64_t ret = 0;
-#ifdef __WIN__
ut_a(event);
- ut_a(ResetEvent(event->handle));
-#else
- ut_a(event);
+#ifdef __WIN__
+ if(!srv_use_native_conditions) {
+ ut_a(ResetEvent(event->handle));
+ return(0);
+ }
+#endif
os_fast_mutex_lock(&(event->os_mutex));
@@ -264,7 +491,6 @@ os_event_reset(
ret = event->signal_count;
os_fast_mutex_unlock(&(event->os_mutex));
-#endif
return(ret);
}
@@ -277,17 +503,20 @@ os_event_free_internal(
os_event_t event) /*!< in: event to free */
{
#ifdef __WIN__
- ut_a(event);
+ if(!srv_use_native_conditions) {
+ ut_a(event);
+ ut_a(CloseHandle(event->handle));
+ } else
+#endif
+ {
+ ut_a(event);
- ut_a(CloseHandle(event->handle));
-#else
- ut_a(event);
+ /* This is to avoid freeing the mutex twice */
+ os_fast_mutex_free(&(event->os_mutex));
- /* This is to avoid freeing the mutex twice */
- os_fast_mutex_free(&(event->os_mutex));
+ os_cond_destroy(&(event->cond_var));
+ }
- ut_a(0 == pthread_cond_destroy(&(event->cond_var)));
-#endif
/* Remove from the list of events */
UT_LIST_REMOVE(os_event_list, os_event_list, event);
@@ -306,16 +535,18 @@ os_event_free(
os_event_t event) /*!< in: event to free */
{
-#ifdef __WIN__
ut_a(event);
+#ifdef __WIN__
+ if(!srv_use_native_conditions){
+ ut_a(CloseHandle(event->handle));
+ } else /*Windows with condition variables */
+#endif
+ {
+ os_fast_mutex_free(&(event->os_mutex));
- ut_a(CloseHandle(event->handle));
-#else
- ut_a(event);
+ os_cond_destroy(&(event->cond_var));
+ }
- os_fast_mutex_free(&(event->os_mutex));
- ut_a(0 == pthread_cond_destroy(&(event->cond_var)));
-#endif
/* Remove from the list of events */
os_mutex_enter(os_sync_mutex);
@@ -358,23 +589,24 @@ os_event_wait_low(
returned by previous call of
os_event_reset(). */
{
-#ifdef __WIN__
- DWORD err;
+
+ ib_int64_t old_signal_count;
- ut_a(event);
+#ifdef __WIN__
+ if(!srv_use_native_conditions) {
+ DWORD err;
- UT_NOT_USED(reset_sig_count);
+ ut_a(event);
- /* Specify an infinite time limit for waiting */
- err = WaitForSingleObject(event->handle, INFINITE);
+ UT_NOT_USED(reset_sig_count);
- ut_a(err == WAIT_OBJECT_0);
+ /* Specify an infinite wait */
+ err = WaitForSingleObject(event->handle, INFINITE);
- if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
- os_thread_exit(NULL);
+ ut_a(err == WAIT_OBJECT_0);
+ return;
}
-#else
- ib_int64_t old_signal_count;
+#endif
os_fast_mutex_lock(&(event->os_mutex));
@@ -399,13 +631,12 @@ os_event_wait_low(
return;
}
- pthread_cond_wait(&(event->cond_var), &(event->os_mutex));
+ os_cond_wait(&(event->cond_var), &(event->os_mutex));
/* Solaris manual said that spurious wakeups may occur: we
have to check if the event really has been signaled after
we came here to wait */
}
-#endif
}
/**********************************************************//**
@@ -414,112 +645,112 @@ a timeout is exceeded.
@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
UNIV_INTERN
ulint
-os_event_wait_time(
-/*===============*/
- os_event_t event, /*!< in: event to wait */
- ulint wtime) /*!< in: timeout in microseconds, or
- OS_SYNC_INFINITE_TIME */
+os_event_wait_time_low(
+/*===================*/
+ os_event_t event, /*!< in: event to wait */
+ ulint time_in_usec, /*!< in: timeout in
+ microseconds, or
+ OS_SYNC_INFINITE_TIME */
+ ib_int64_t reset_sig_count) /*!< in: zero or the value
+ returned by previous call of
+ os_event_reset(). */
+
{
+ ibool timed_out = FALSE;
+
#ifdef __WIN__
- DWORD err;
+ DWORD time_in_ms;
- ut_a(event);
+ if (!srv_use_native_conditions) {
+ DWORD err;
- if (wtime != OS_SYNC_INFINITE_TIME) {
- err = WaitForSingleObject(event->handle, (DWORD) wtime / 1000);
- } else {
- err = WaitForSingleObject(event->handle, INFINITE);
- }
+ ut_a(event);
- if (err == WAIT_OBJECT_0) {
+ if (time_in_usec != OS_SYNC_INFINITE_TIME) {
+ time_in_ms = time_in_usec / 1000;
+ err = WaitForSingleObject(event->handle, time_in_ms);
+ } else {
+ err = WaitForSingleObject(event->handle, INFINITE);
+ }
- return(0);
- } else if (err == WAIT_TIMEOUT) {
+ if (err == WAIT_OBJECT_0) {
+ return(0);
+ } else if ((err == WAIT_TIMEOUT) || (err == ERROR_TIMEOUT)) {
+ return(OS_SYNC_TIME_EXCEEDED);
+ }
- return(OS_SYNC_TIME_EXCEEDED);
- } else {
ut_error;
- return(1000000); /* dummy value to eliminate compiler warn. */
+ /* Dummy value to eliminate compiler warning. */
+ return(42);
+ } else {
+ ut_a(sleep_condition_variable != NULL);
+
+ if (time_in_usec != OS_SYNC_INFINITE_TIME) {
+ time_in_ms = time_in_usec / 1000;
+ } else {
+ time_in_ms = INFINITE;
+ }
}
#else
- int err;
- int ret = 0;
- ulint tmp;
- ib_int64_t old_count;
- struct timeval tv_start;
- struct timespec timeout;
-
- if (wtime == OS_SYNC_INFINITE_TIME) {
- os_event_wait(event);
- return 0;
- }
+ struct timespec abstime;
- /* Compute the absolute point in time at which to time out. */
- gettimeofday(&tv_start, NULL);
- tmp = tv_start.tv_usec + wtime;
- timeout.tv_sec = tv_start.tv_sec + (tmp / 1000000);
- timeout.tv_nsec = (tmp % 1000000) * 1000;
+ if (time_in_usec != OS_SYNC_INFINITE_TIME) {
+ struct timeval tv;
+ int ret;
+ ulint sec;
+ ulint usec;
- os_fast_mutex_lock(&(event->os_mutex));
- old_count = event->signal_count;
+ ret = ut_usectime(&sec, &usec);
+ ut_a(ret == 0);
- for (;;) {
- if (event->is_set == TRUE || event->signal_count != old_count)
- break;
+ tv.tv_sec = sec;
+ tv.tv_usec = usec;
- err = pthread_cond_timedwait(&(event->cond_var),
- &(event->os_mutex), &timeout);
- if (err == ETIMEDOUT) {
- ret = OS_SYNC_TIME_EXCEEDED;
- break;
+ tv.tv_usec += time_in_usec;
+
+ if ((ulint) tv.tv_usec >= MICROSECS_IN_A_SECOND) {
+ tv.tv_sec += tv.tv_usec / MICROSECS_IN_A_SECOND; /* carry whole seconds out of the summed usec */
+ tv.tv_usec %= MICROSECS_IN_A_SECOND;
+ }
+
+ abstime.tv_sec = tv.tv_sec;
+ abstime.tv_nsec = tv.tv_usec * 1000;
+ } else {
+ abstime.tv_nsec = 999999999;
+ abstime.tv_sec = (time_t) ULINT_MAX;
}
- os_fast_mutex_unlock(&(event->os_mutex));
+ ut_a(abstime.tv_nsec <= 999999999);
+
+#endif /* __WIN__ */
- if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+ os_fast_mutex_lock(&event->os_mutex);
- os_thread_exit(NULL);
+ if (!reset_sig_count) {
+ reset_sig_count = event->signal_count;
}
- return ret;
-#endif
-}
+ do {
+ if (event->is_set || event->signal_count != reset_sig_count) {
-#ifdef __WIN__
-/**********************************************************//**
-Waits for any event in an OS native event array. Returns if even a single
-one is signaled or becomes signaled.
-@return index of the event which was signaled */
-UNIV_INTERN
-ulint
-os_event_wait_multiple(
-/*===================*/
- ulint n, /*!< in: number of events in the
- array */
- os_native_event_t* native_event_array)
- /*!< in: pointer to an array of event
- handles */
-{
- DWORD index;
+ break;
+ }
- ut_a(native_event_array);
- ut_a(n > 0);
+ timed_out = os_cond_wait_timed(
+ &event->cond_var, &event->os_mutex,
+#ifndef __WIN__
+ &abstime
+#else
+ time_in_ms
+#endif /* !__WIN__ */
+ );
- index = WaitForMultipleObjects((DWORD) n, native_event_array,
- FALSE, /* Wait for any 1 event */
- INFINITE); /* Infinite wait time
- limit */
- ut_a(index >= WAIT_OBJECT_0); /* NOTE: Pointless comparison */
- ut_a(index < WAIT_OBJECT_0 + n);
+ } while (!timed_out);
- if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
- os_thread_exit(NULL);
- }
+ os_fast_mutex_unlock(&event->os_mutex);
- return(index - WAIT_OBJECT_0);
+ return(timed_out ? OS_SYNC_TIME_EXCEEDED : 0);
}
-#endif
/*********************************************************//**
Creates an operating system mutex semaphore. Because these are slow, the
@@ -532,15 +763,6 @@ os_mutex_create(
const char* name) /*!< in: the name of the mutex, if NULL
the mutex is created without a name */
{
-#ifdef __WIN__
- HANDLE mutex;
- os_mutex_t mutex_str;
-
- mutex = CreateMutex(NULL, /* No security attributes */
- FALSE, /* Initial state: no owner */
- (LPCTSTR) name);
- ut_a(mutex);
-#else
os_fast_mutex_t* mutex;
os_mutex_t mutex_str;
@@ -549,7 +771,6 @@ os_mutex_create(
mutex = ut_malloc(sizeof(os_fast_mutex_t));
os_fast_mutex_init(mutex);
-#endif
mutex_str = ut_malloc(sizeof(os_mutex_str_t));
mutex_str->handle = mutex;
@@ -580,25 +801,11 @@ os_mutex_enter(
/*===========*/
os_mutex_t mutex) /*!< in: mutex to acquire */
{
-#ifdef __WIN__
- DWORD err;
-
- ut_a(mutex);
-
- /* Specify infinite time limit for waiting */
- err = WaitForSingleObject(mutex->handle, INFINITE);
-
- ut_a(err == WAIT_OBJECT_0);
-
- (mutex->count)++;
- ut_a(mutex->count == 1);
-#else
os_fast_mutex_lock(mutex->handle);
(mutex->count)++;
ut_a(mutex->count == 1);
-#endif
}
/**********************************************************//**
@@ -614,11 +821,7 @@ os_mutex_exit(
ut_a(mutex->count == 1);
(mutex->count)--;
-#ifdef __WIN__
- ut_a(ReleaseMutex(mutex->handle));
-#else
os_fast_mutex_unlock(mutex->handle);
-#endif
}
/**********************************************************//**
@@ -647,15 +850,9 @@ os_mutex_free(
os_mutex_exit(os_sync_mutex);
}
-#ifdef __WIN__
- ut_a(CloseHandle(mutex->handle));
-
- ut_free(mutex);
-#else
os_fast_mutex_free(mutex->handle);
ut_free(mutex->handle);
ut_free(mutex);
-#endif
}
/*********************************************************//**
diff --git a/storage/xtradb/srv/srv0srv.c b/storage/xtradb/srv/srv0srv.c
index 1e078e1a6e9..9a142e1ca86 100644
--- a/storage/xtradb/srv/srv0srv.c
+++ b/storage/xtradb/srv/srv0srv.c
@@ -139,6 +139,20 @@ UNIV_INTERN ulint srv_check_file_format_at_startup = DICT_TF_FORMAT_MAX;
/** Place locks to records only i.e. do not use next-key locking except
on duplicate key checking and foreign key checking */
UNIV_INTERN ibool srv_locks_unsafe_for_binlog = FALSE;
+#ifdef __WIN__
+/* Windows native condition variables. We use runtime loading / function
+pointers, because they are not available on Windows Server 2003 and
+Windows XP/2000.
+
+We use condition variables for events on Windows if possible, even though
+os_event resembles the Windows kernel event object well API-wise. The reason
+is performance: kernel objects are heavyweight and WaitForSingleObject() is a
+performance killer that forces the calling thread to context switch. Besides,
+InnoDB preallocates a large number (often millions) of os_events. With kernel
+event objects this takes a big chunk out of the non-paged pool, which is better
+suited for tasks like IO than for storing idle event objects. */
+UNIV_INTERN ibool srv_use_native_conditions = FALSE;
+#endif /* __WIN__ */
UNIV_INTERN ulint srv_n_data_files = 0;
UNIV_INTERN char** srv_data_file_names = NULL;
diff --git a/storage/xtradb/srv/srv0start.c b/storage/xtradb/srv/srv0start.c
index cef045d72e1..d002a1bb682 100644
--- a/storage/xtradb/srv/srv0start.c
+++ b/storage/xtradb/srv/srv0start.c
@@ -1265,6 +1265,7 @@ innobase_start_or_create_for_mysql(void)
case OS_WIN95:
case OS_WIN31:
case OS_WINNT:
+ srv_use_native_conditions = FALSE;
/* On Win 95, 98, ME, Win32 subsystem for Windows 3.1,
and NT use simulated aio. In NT Windows provides async i/o,
but when run in conjunction with InnoDB Hot Backup, it seemed
@@ -1272,24 +1273,26 @@ innobase_start_or_create_for_mysql(void)
os_aio_use_native_aio = FALSE;
break;
- default:
- /* On Win 2000 and XP use async i/o */
- //os_aio_use_native_aio = TRUE;
- os_aio_use_native_aio = FALSE;
- fprintf(stderr,
- "InnoDB: Windows native async i/o is disabled as default.\n"
- "InnoDB: It is not applicable for the current"
- " multi io threads implementation.\n");
- break;
+
+ case OS_WIN2000:
+ case OS_WINXP:
+ /* On 2000 and XP, async IO is available, but no condition variables. */
+ os_aio_use_native_aio = TRUE;
+ srv_use_native_conditions = FALSE;
+ break;
+
+ default:
+ os_aio_use_native_aio = TRUE;
+ srv_use_native_conditions = TRUE;
}
#endif
+
if (srv_file_flush_method_str == NULL) {
/* These are the default options */
srv_unix_file_flush_method = SRV_UNIX_FSYNC;
srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
-#ifndef __WIN__
} else if (0 == ut_strcmp(srv_file_flush_method_str, "fsync")) {
srv_unix_file_flush_method = SRV_UNIX_FSYNC;
@@ -1307,7 +1310,7 @@ innobase_start_or_create_for_mysql(void)
} else if (0 == ut_strcmp(srv_file_flush_method_str, "nosync")) {
srv_unix_file_flush_method = SRV_UNIX_NOSYNC;
-#else
+#ifdef _WIN32
} else if (0 == ut_strcmp(srv_file_flush_method_str, "normal")) {
srv_win_file_flush_method = SRV_WIN_IO_NORMAL;
os_aio_use_native_aio = FALSE;
@@ -1315,16 +1318,10 @@ innobase_start_or_create_for_mysql(void)
} else if (0 == ut_strcmp(srv_file_flush_method_str, "unbuffered")) {
srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
os_aio_use_native_aio = FALSE;
-
} else if (0 == ut_strcmp(srv_file_flush_method_str,
"async_unbuffered")) {
srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
os_aio_use_native_aio = TRUE;
- srv_n_read_io_threads = srv_n_write_io_threads = 1;
- fprintf(stderr,
- "InnoDB: 'async_unbuffered' was detected as innodb_flush_method.\n"
- "InnoDB: Windows native async i/o is enabled.\n"
- "InnoDB: And io threads are restricted.\n");
#endif
} else {
fprintf(stderr,