diff options
author | Vladislav Vaintroub <wlad@montyprogram.com> | 2011-07-05 21:46:53 +0200 |
---|---|---|
committer | Vladislav Vaintroub <wlad@montyprogram.com> | 2011-07-05 21:46:53 +0200 |
commit | f9cb1467b84aba6cdc3178617def27c2993f016d (patch) | |
tree | b415e273aafc2a6f0612b1f6cc55dbb0b0a06827 /storage | |
parent | 9e95a54793920ade348399a74a1e39ea3a27b635 (diff) | |
parent | b48dc8306f2b729bef09f9cdf30d7897726b873e (diff) | |
download | mariadb-git-f9cb1467b84aba6cdc3178617def27c2993f016d.tar.gz |
merge Windows performance patches into 5.3
Diffstat (limited to 'storage')
-rw-r--r-- | storage/archive/ha_archive.cc | 6 | ||||
-rw-r--r-- | storage/innobase/handler/ha_innodb.cc | 22 | ||||
-rw-r--r-- | storage/innodb_plugin/handler/ha_innodb.cc | 25 | ||||
-rw-r--r-- | storage/maria/ma_extra.c | 3 | ||||
-rw-r--r-- | storage/myisam/mi_locking.c | 8 | ||||
-rw-r--r-- | storage/xtradb/CMakeLists.txt | 15 | ||||
-rw-r--r-- | storage/xtradb/handler/ha_innodb.cc | 25 | ||||
-rw-r--r-- | storage/xtradb/include/os0file.h | 8 | ||||
-rw-r--r-- | storage/xtradb/include/os0sync.h | 71 | ||||
-rw-r--r-- | storage/xtradb/include/os0sync.ic | 9 | ||||
-rw-r--r-- | storage/xtradb/include/srv0srv.h | 4 | ||||
-rw-r--r-- | storage/xtradb/include/sync0sync.h | 2 | ||||
-rw-r--r-- | storage/xtradb/os/os0file.c | 492 | ||||
-rw-r--r-- | storage/xtradb/os/os0sync.c | 557 | ||||
-rw-r--r-- | storage/xtradb/srv/srv0srv.c | 14 | ||||
-rw-r--r-- | storage/xtradb/srv/srv0start.c | 31 |
16 files changed, 721 insertions, 571 deletions
diff --git a/storage/archive/ha_archive.cc b/storage/archive/ha_archive.cc index 730d5b95abb..f7efaf4566f 100644 --- a/storage/archive/ha_archive.cc +++ b/storage/archive/ha_archive.cc @@ -684,11 +684,11 @@ int ha_archive::create(const char *name, TABLE *table_arg, { if (!my_fstat(frm_file, &file_stat, MYF(MY_WME))) { - frm_ptr= (uchar *)my_malloc(sizeof(uchar) * file_stat.st_size, MYF(0)); + frm_ptr= (uchar *)my_malloc(sizeof(uchar) * (size_t)file_stat.st_size, MYF(0)); if (frm_ptr) { - my_read(frm_file, frm_ptr, file_stat.st_size, MYF(0)); - azwrite_frm(&create_stream, (char *)frm_ptr, file_stat.st_size); + my_read(frm_file, frm_ptr, (size_t)file_stat.st_size, MYF(0)); + azwrite_frm(&create_stream, (char *)frm_ptr, (size_t)file_stat.st_size); my_free((uchar*)frm_ptr, MYF(0)); } } diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index ce264b1bbb4..eaf5ec4bed5 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -1061,7 +1061,29 @@ innobase_mysql_tmpfile(void) will be passed to fdopen(), it will be closed by invoking fclose(), which in turn will invoke close() instead of my_close(). */ + +#ifdef _WIN32 + /* Note that on Windows, the integer returned by mysql_tmpfile + has no relation to C runtime file descriptor. Here, we need + to call my_get_osfhandle to get the HANDLE and then convert it + to C runtime filedescriptor. */ + { + HANDLE hFile = my_get_osfhandle(fd); + HANDLE hDup; + BOOL bOK = + DuplicateHandle(GetCurrentProcess(), hFile, GetCurrentProcess(), + &hDup, 0, FALSE, DUPLICATE_SAME_ACCESS); + if(bOK) { + fd2 = _open_osfhandle((intptr_t)hDup,0); + } + else { + my_osmaperr(GetLastError()); + fd2 = -1; + } + } +#else fd2 = dup(fd); +#endif if (fd2 < 0) { DBUG_PRINT("error",("Got error %d on dup",fd2)); my_errno=errno; diff --git a/storage/innodb_plugin/handler/ha_innodb.cc b/storage/innodb_plugin/handler/ha_innodb.cc index 38adab109fc..e470616d169 100644 --- a/storage/innodb_plugin/handler/ha_innodb.cc +++ b/storage/innodb_plugin/handler/ha_innodb.cc @@ -49,7 +49,9 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include <m_ctype.h> #include <mysys_err.h> #include <mysql/plugin.h> - +#ifdef _WIN32 +#include <io.h> +#endif /** @file ha_innodb.cc */ /* Include necessary InnoDB headers */ @@ -1172,7 +1174,28 @@ innobase_mysql_tmpfile(void) will be passed to fdopen(), it will be closed by invoking fclose(), which in turn will invoke close() instead of my_close(). */ +#ifdef _WIN32 + /* Note that on Windows, the integer returned by mysql_tmpfile + has no relation to C runtime file descriptor. Here, we need + to call my_get_osfhandle to get the HANDLE and then convert it + to C runtime filedescriptor. */ + { + HANDLE hFile = my_get_osfhandle(fd); + HANDLE hDup; + BOOL bOK = + DuplicateHandle(GetCurrentProcess(), hFile, GetCurrentProcess(), + &hDup, 0, FALSE, DUPLICATE_SAME_ACCESS); + if(bOK) { + fd2 = _open_osfhandle((intptr_t)hDup,0); + } + else { + my_osmaperr(GetLastError()); + fd2 = -1; + } + } +#else fd2 = dup(fd); +#endif if (fd2 < 0) { DBUG_PRINT("error",("Got error %d on dup",fd2)); my_errno=errno; diff --git a/storage/maria/ma_extra.c b/storage/maria/ma_extra.c index a5c4c2ab251..68102b389fb 100644 --- a/storage/maria/ma_extra.c +++ b/storage/maria/ma_extra.c @@ -415,9 +415,8 @@ int maria_extra(MARIA_HA *info, enum ha_extra_function function, if (!share->temporary) error= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, FLUSH_KEEP, FLUSH_KEEP); -#ifdef HAVE_PREAD + _ma_decrement_open_count(info, 1); -#endif if (share->not_flushed) { share->not_flushed= 0; diff --git a/storage/myisam/mi_locking.c b/storage/myisam/mi_locking.c index 97011831af8..17c1fc83f13 100644 --- a/storage/myisam/mi_locking.c +++ b/storage/myisam/mi_locking.c @@ -239,7 +239,7 @@ int mi_lock_database(MI_INFO *info, int lock_type) break; /* Impossible */ } } -#ifdef __WIN__ +#ifdef _WIN32 else { /* @@ -521,11 +521,11 @@ int _mi_writeinfo(register MI_INFO *info, uint operation) share->state.update_count= info->last_loop= ++info->this_loop; if ((error=mi_state_info_write(share->kfile, &share->state, 1))) olderror=my_errno; -#ifdef __WIN__ +#ifdef _WIN32 if (myisam_flush) { - _commit(share->kfile); - _commit(info->dfile); + my_sync(share->kfile,0); + my_sync(info->dfile,0); } #endif } diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt index 509f7f0fe73..6e16c4ced32 100644 --- a/storage/xtradb/CMakeLists.txt +++ b/storage/xtradb/CMakeLists.txt @@ -15,20 +15,10 @@ # This is the CMakeLists for InnoDB Plugin - - -# Starting at 5.1.38, MySQL CMake files are simplified. But the plugin -# CMakeLists.txt still needs to work with previous versions of MySQL. -IF (MYSQL_VERSION_ID GREATER "50137") - INCLUDE("${PROJECT_SOURCE_DIR}/storage/mysql_storage_engine.cmake") -ENDIF (MYSQL_VERSION_ID GREATER "50137") - IF (CMAKE_SIZEOF_VOID_P MATCHES 8) SET(WIN64 TRUE) ENDIF (CMAKE_SIZEOF_VOID_P MATCHES 8) -ADD_DEFINITIONS(-D_WIN32 -D_LIB -DMYSQL_SERVER) - # Include directories under xtradb INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/storage/xtradb/include ${CMAKE_SOURCE_DIR}/storage/xtradb/handler) @@ -89,9 +79,6 @@ SET(XTRADB_SOURCES btr/btr0btr.c btr/btr0cur.c btr/btr0pcur.c btr/btr0sea.c usr/usr0sess.c ut/ut0byte.c ut/ut0dbg.c ut/ut0mem.c ut/ut0rbt.c ut/ut0rnd.c ut/ut0ut.c ut/ut0vec.c ut/ut0list.c ut/ut0wqueue.c) -# Windows atomics do not perform well. Disable Windows atomics by default. -# See bug#52102 for details. -#ADD_DEFINITIONS(-DHAVE_WINDOWS_ATOMICS -DINNODB_RW_LOCKS_USE_ATOMICS -DHAVE_IB_PAUSE_INSTRUCTION) -ADD_DEFINITIONS(-DHAVE_IB_PAUSE_INSTRUCTION) + MYSQL_STORAGE_ENGINE(XTRADB) diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index f409b88cc7b..a511d764b24 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -56,7 +56,9 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include <m_ctype.h> #include <mysys_err.h> #include <mysql/plugin.h> - +#ifdef _WIN32 +#include <io.h> +#endif /** @file ha_innodb.cc */ /* Include necessary InnoDB headers */ @@ -1239,7 +1241,28 @@ innobase_mysql_tmpfile(void) will be passed to fdopen(), it will be closed by invoking fclose(), which in turn will invoke close() instead of my_close(). */ +#ifdef _WIN32 + /* Note that on Windows, the integer returned by mysql_tmpfile + has no relation to C runtime file descriptor. Here, we need + to call my_get_osfhandle to get the HANDLE and then convert it + to C runtime filedescriptor. */ + { + HANDLE hFile = my_get_osfhandle(fd); + HANDLE hDup; + BOOL bOK = + DuplicateHandle(GetCurrentProcess(), hFile, GetCurrentProcess(), + &hDup, 0, FALSE, DUPLICATE_SAME_ACCESS); + if(bOK) { + fd2 = _open_osfhandle((intptr_t)hDup,0); + } + else { + my_osmaperr(GetLastError()); + fd2 = -1; + } + } +#else fd2 = dup(fd); +#endif if (fd2 < 0) { DBUG_PRINT("error",("Got error %d on dup",fd2)); my_errno=errno; diff --git a/storage/xtradb/include/os0file.h b/storage/xtradb/include/os0file.h index 732e930517b..46bda4c6b45 100644 --- a/storage/xtradb/include/os0file.h +++ b/storage/xtradb/include/os0file.h @@ -152,8 +152,8 @@ log. */ #define OS_FILE_LOG 256 /* This can be ORed to type */ /* @} */ -#define OS_AIO_N_PENDING_IOS_PER_THREAD 32 /*!< Win NT does not allow more - than 64 */ +#define OS_AIO_N_PENDING_IOS_PER_THREAD 256 /*!< Windows might be able to handle +more */ /** Modes for aio operations @{ */ #define OS_AIO_NORMAL 21 /*!< Normal asynchronous i/o not for ibuf @@ -183,6 +183,10 @@ log. */ #define OS_WIN95 2 /*!< Microsoft Windows 95 */ #define OS_WINNT 3 /*!< Microsoft Windows NT 3.x */ #define OS_WIN2000 4 /*!< Microsoft Windows 2000 */ +#define OS_WINXP 5 /*!< Microsoft Windows XP */ +#define OS_WINVISTA 6 /*!< Microsoft Windows Vista */ +#define OS_WIN7 7 /*!< Microsoft Windows 7 */ + extern ulint os_n_file_reads; extern ulint os_n_file_writes; diff --git a/storage/xtradb/include/os0sync.h b/storage/xtradb/include/os0sync.h index 7366e2c3402..002abebcb0b 100644 --- a/storage/xtradb/include/os0sync.h +++ b/storage/xtradb/include/os0sync.h @@ -37,29 +37,19 @@ Created 9/6/1995 Heikki Tuuri #include "univ.i" #include "ut0lst.h" -#ifdef __WIN__ - +#ifdef _WIN32 +/** Native event (slow)*/ +typedef HANDLE os_native_event_t; /** Native mutex */ -#define os_fast_mutex_t CRITICAL_SECTION - -/** Native event */ -typedef HANDLE os_native_event_t; - -/** Operating system event */ -typedef struct os_event_struct os_event_struct_t; -/** Operating system event handle */ -typedef os_event_struct_t* os_event_t; - -/** An asynchronous signal sent between threads */ -struct os_event_struct { - os_native_event_t handle; - /*!< Windows event */ - UT_LIST_NODE_T(os_event_struct_t) os_event_list; - /*!< list of all created events */ -}; +typedef CRITICAL_SECTION os_fast_mutex_t; +/** Native condition variable */ +typedef CONDITION_VARIABLE os_cond_t; #else /** Native mutex */ typedef pthread_mutex_t os_fast_mutex_t; +/** Native condition variable */ +typedef pthread_cond_t os_cond_t; +#endif /** Operating system event */ typedef struct os_event_struct os_event_struct_t; @@ -68,6 +58,9 @@ typedef os_event_struct_t* os_event_t; /** An asynchronous signal sent between threads */ struct os_event_struct { +#ifdef _WIN32 + HANDLE handle; /*!< kernel event object, slow, used on older Windows */ +#endif os_fast_mutex_t os_mutex; /*!< this mutex protects the next fields */ ibool is_set; /*!< this is TRUE when the event is @@ -76,12 +69,14 @@ struct os_event_struct { this event */ ib_int64_t signal_count; /*!< this is incremented each time the event becomes signaled */ - pthread_cond_t cond_var; /*!< condition variable is used in + os_cond_t cond_var; /*!< condition variable is used in waiting for the event */ UT_LIST_NODE_T(os_event_struct_t) os_event_list; /*!< list of all created events */ }; -#endif + + + /** Operating system mutex */ typedef struct os_mutex_struct os_mutex_str_t; @@ -186,33 +181,23 @@ os_event_wait_low( os_event_reset(). */ #define os_event_wait(event) os_event_wait_low(event, 0) - +#define os_event_wait_time(event, t) os_event_wait_time_low(event, t, 0) /**********************************************************//** Waits for an event object until it is in the signaled state or -a timeout is exceeded. +a timeout is exceeded. In Unix the timeout is always infinite. @return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */ UNIV_INTERN ulint -os_event_wait_time( -/*===============*/ - os_event_t event, /*!< in: event to wait */ - ulint wtime); /*!< in: timeout in microseconds, or - OS_SYNC_INFINITE_TIME */ -#ifdef __WIN__ -/**********************************************************//** -Waits for any event in an OS native event array. Returns if even a single -one is signaled or becomes signaled. -@return index of the event which was signaled */ -UNIV_INTERN -ulint -os_event_wait_multiple( +os_event_wait_time_low( /*===================*/ - ulint n, /*!< in: number of events in the - array */ - os_native_event_t* native_event_array); - /*!< in: pointer to an array of event - handles */ -#endif + os_event_t event, /*!< in: event to wait */ + ulint time_in_usec, /*!< in: timeout in + microseconds, or + OS_SYNC_INFINITE_TIME */ + ib_int64_t reset_sig_count); /*!< in: zero or the value + returned by previous call of + os_event_reset(). */ + /*********************************************************//** Creates an operating system mutex semaphore. Because these are slow, the mutex semaphore of InnoDB itself (mutex_t) should be used where possible. @@ -385,7 +370,7 @@ Returns the old value of *ptr, atomically sets *ptr to new_val */ # define os_atomic_test_and_set_byte(ptr, new_val) \ atomic_swap_uchar(ptr, new_val) -#elif defined(HAVE_WINDOWS_ATOMICS) +#elif defined(_WIN32) #define HAVE_ATOMIC_BUILTINS diff --git a/storage/xtradb/include/os0sync.ic b/storage/xtradb/include/os0sync.ic index 1f3ce38fa65..2c6c1dbe629 100644 --- a/storage/xtradb/include/os0sync.ic +++ b/storage/xtradb/include/os0sync.ic @@ -28,8 +28,7 @@ Created 9/6/1995 Heikki Tuuri #endif /**********************************************************//** -Acquires ownership of a fast mutex. Currently in Windows this is the same -as os_fast_mutex_lock! +Acquires ownership of a fast mutex. @return 0 if success, != 0 if was reserved by another thread */ UNIV_INLINE ulint @@ -38,9 +37,9 @@ os_fast_mutex_trylock( os_fast_mutex_t* fast_mutex) /*!< in: mutex to acquire */ { #ifdef __WIN__ - EnterCriticalSection(fast_mutex); - - return(0); + if (TryEnterCriticalSection(fast_mutex)) + return 0; + return(1); #else /* NOTE that the MySQL my_pthread.h redefines pthread_mutex_trylock so that it returns 0 on success. In the operating system diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h index ac7ae8c5627..29d88331532 100644 --- a/storage/xtradb/include/srv0srv.h +++ b/storage/xtradb/include/srv0srv.h @@ -112,7 +112,9 @@ extern ulint srv_check_file_format_at_startup; on duplicate key checking and foreign key checking */ extern ibool srv_locks_unsafe_for_binlog; #endif /* !UNIV_HOTBACKUP */ - +#ifdef __WIN__ +extern ibool srv_use_native_conditions; +#endif extern ulint srv_n_data_files; extern char** srv_data_file_names; extern ulint* srv_data_file_sizes; diff --git a/storage/xtradb/include/sync0sync.h b/storage/xtradb/include/sync0sync.h index f2ff83101ab..6aaab1cc7d7 100644 --- a/storage/xtradb/include/sync0sync.h +++ b/storage/xtradb/include/sync0sync.h @@ -45,7 +45,7 @@ Created 9/5/1995 Heikki Tuuri extern my_bool timed_mutexes; #endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ -#ifdef HAVE_WINDOWS_ATOMICS +#ifdef _WIN32 typedef LONG lock_word_t; /*!< On Windows, InterlockedExchange operates on LONG variable */ #else diff --git a/storage/xtradb/os/os0file.c b/storage/xtradb/os/os0file.c index 5b8e656d8b2..7a8e5802b19 100644 --- a/storage/xtradb/os/os0file.c +++ b/storage/xtradb/os/os0file.c @@ -121,6 +121,12 @@ typedef struct os_aio_slot_struct os_aio_slot_t; /** The asynchronous i/o array slot structure */ struct os_aio_slot_struct{ +#ifdef WIN_ASYNC_IO + OVERLAPPED control; /*!< Windows control block for the + aio request, MUST be first element in the structure*/ + void *arr; /*!< Array this slot belongs to*/ +#endif + ibool is_read; /*!< TRUE if a read operation */ ulint pos; /*!< index of the slot in the aio array */ @@ -148,12 +154,6 @@ struct os_aio_slot_struct{ and which can be used to identify which pending aio operation was completed */ -#ifdef WIN_ASYNC_IO - os_event_t event; /*!< event object we need in the - OVERLAPPED struct */ - OVERLAPPED control; /*!< Windows control block for the - aio request */ -#endif }; /** The asynchronous i/o array structure */ @@ -182,15 +182,6 @@ struct os_aio_array_struct{ /*!< Number of reserved slots in the aio array outside the ibuf segment */ os_aio_slot_t* slots; /*!< Pointer to the slots in the array */ -#ifdef __WIN__ - os_native_event_t* native_events; - /*!< Pointer to an array of OS native - event handles where we copied the - handles from slots, in the same - order. This can be used in - WaitForMultipleObjects; used only in - Windows */ -#endif }; /** Array of events used in simulated aio */ @@ -250,6 +241,14 @@ UNIV_INTERN ulint os_n_pending_writes = 0; /** Number of pending read operations */ UNIV_INTERN ulint os_n_pending_reads = 0; + +#ifdef _WIN32 +/** IO completion port used by background io threads */ +static HANDLE completion_port; +/** Thread local storage index for the per-thread event used for synchronous IO */ +static DWORD tls_sync_io = TLS_OUT_OF_INDEXES; +#endif + /***********************************************************************//** Gets the operating system version. Currently works only on Windows. @return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */ @@ -270,10 +269,16 @@ os_get_os_version(void) } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) { return(OS_WIN95); } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) { - if (os_info.dwMajorVersion <= 4) { - return(OS_WINNT); - } else { - return(OS_WIN2000); + switch(os_info.dwMajorVersion){ + case 3: + case 4: + return OS_WINNT; + case 5: + return (os_info.dwMinorVersion == 0)?OS_WIN2000 : OS_WINXP; + case 6: + return (os_info.dwMinorVersion == 0)?OS_WINVISTA : OS_WIN7; + default: + return OS_WIN7; } } else { ut_error; @@ -286,6 +291,86 @@ os_get_os_version(void) #endif } + +#ifdef _WIN32 +/* +Windows : Handling synchronous IO on files opened asynchronously. + +If file is opened for asynchronous IO (FILE_FLAG_OVERLAPPED) and also bound to +a completion port, then every IO on this file would normally be enqueued to the +completion port. Sometimes however we would like to do a synchronous IO. This is +possible if we initialitze have overlapped.hEvent with a valid event and set its +lowest order bit to 1 (see MSDN ReadFile and WriteFile description for more info) + +We'll create this special event once for each thread and store in thread local +storage. +*/ + + +/***********************************************************************//** +Initialize tls index.for event handle used for synchronized IO on files that +might be opened with FILE_FLAG_OVERLAPPED. +*/ +static void win_init_syncio_event() +{ + tls_sync_io = TlsAlloc(); + ut_a(tls_sync_io != TLS_OUT_OF_INDEXES); +} + +/***********************************************************************//** +Retrieve per-thread event for doing synchronous io on asyncronously opened files +*/ +static HANDLE win_get_syncio_event() +{ + HANDLE h; + if(tls_sync_io == TLS_OUT_OF_INDEXES){ + win_init_syncio_event(); + } + + h = (HANDLE)TlsGetValue(tls_sync_io); + if (h) + return h; + h = CreateEventA(NULL, FALSE, FALSE, NULL); + ut_a(h); + h = (HANDLE)((uintptr_t)h | 1); + TlsSetValue(tls_sync_io, h); + return h; +} + +/* + TLS destructor, inspired by Chromium code + http://src.chromium.org/svn/trunk/src/base/threading/thread_local_storage_win.cc +*/ + +static void win_free_syncio_event() +{ + HANDLE h = win_get_syncio_event(); + if (h) { + CloseHandle(h); + } +} + +static void NTAPI win_tls_thread_exit(PVOID module, DWORD reason, PVOID reserved) { + if (DLL_THREAD_DETACH == reason || DLL_PROCESS_DETACH == reason) + win_free_syncio_event(); +} + +#ifdef _WIN64 +#pragma comment(linker, "/INCLUDE:_tls_used") +#pragma comment(linker, "/INCLUDE:p_thread_callback_base") +#pragma const_seg(".CRT$XLB") +extern const PIMAGE_TLS_CALLBACK p_thread_callback_base; +const PIMAGE_TLS_CALLBACK p_thread_callback_base = win_tls_thread_exit; +#pragma data_seg() +#else +#pragma comment(linker, "/INCLUDE:__tls_used") +#pragma comment(linker, "/INCLUDE:_p_thread_callback_base") +#pragma data_seg(".CRT$XLB") +PIMAGE_TLS_CALLBACK p_thread_callback_base = win_tls_thread_exit; +#pragma data_seg() +#endif +#endif /*_WIN32 */ + /***********************************************************************//** Retrieves the last error number if an error occurs in a file io function. The number should be retrieved before any other OS calls (because they may @@ -611,6 +696,9 @@ os_io_init_simple(void) for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) { os_file_seek_mutexes[i] = os_mutex_create(NULL); } +#ifdef _WIN32 + win_init_syncio_event(); +#endif } /***********************************************************************//** @@ -1358,6 +1446,16 @@ try_again: ut_error; } + if (type == OS_LOG_FILE) { + if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) { + /* Map O_DSYNC to WRITE_THROUGH */ + attributes |= FILE_FLAG_WRITE_THROUGH; + } else if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) { + /* Open log file without buffering */ + attributes |= FILE_FLAG_NO_BUFFERING; + } + } + file = CreateFile((LPCTSTR) name, GENERIC_READ | GENERIC_WRITE, /* read and write access */ @@ -1402,6 +1500,9 @@ try_again: } } else { *success = TRUE; + if (os_aio_use_native_aio && ((attributes & FILE_FLAG_OVERLAPPED) != 0)) { + ut_a(CreateIoCompletionPort(file, completion_port, 0, 0)); + } } return(file); @@ -2350,13 +2451,9 @@ _os_file_read( #ifdef __WIN__ BOOL ret; DWORD len; - DWORD ret2; - DWORD low; - DWORD high; ibool retry; -#ifndef UNIV_HOTBACKUP - ulint i; -#endif /* !UNIV_HOTBACKUP */ + OVERLAPPED overlapped; + /* On 64-bit Windows, ulint is 64 bits. But offset and n should be no more than 32 bits. */ @@ -2371,41 +2468,21 @@ try_again: ut_ad(buf); ut_ad(n > 0); - low = (DWORD) offset; - high = (DWORD) offset_high; - os_mutex_enter(os_file_count_mutex); os_n_pending_reads++; os_mutex_exit(os_file_count_mutex); -#ifndef UNIV_HOTBACKUP - /* Protect the seek / read operation with a mutex */ - i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; - - os_mutex_enter(os_file_seek_mutexes[i]); -#endif /* !UNIV_HOTBACKUP */ - - ret2 = SetFilePointer(file, low, &high, FILE_BEGIN); - - if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) { - -#ifndef UNIV_HOTBACKUP - os_mutex_exit(os_file_seek_mutexes[i]); -#endif /* !UNIV_HOTBACKUP */ - - os_mutex_enter(os_file_count_mutex); - os_n_pending_reads--; - os_mutex_exit(os_file_count_mutex); - - goto error_handling; + memset (&overlapped, 0, sizeof (overlapped)); + overlapped.Offset = (DWORD)offset; + overlapped.OffsetHigh = (DWORD)offset_high; + overlapped.hEvent = win_get_syncio_event(); + ret = ReadFile(file, buf, n, NULL, &overlapped); + if (ret) { + ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, FALSE); } - - ret = ReadFile(file, buf, (DWORD) n, &len, NULL); - -#ifndef UNIV_HOTBACKUP - os_mutex_exit(os_file_seek_mutexes[i]); -#endif /* !UNIV_HOTBACKUP */ - + else if(GetLastError() == ERROR_IO_PENDING) { + ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE); + } os_mutex_enter(os_file_count_mutex); os_n_pending_reads--; os_mutex_exit(os_file_count_mutex); @@ -2433,9 +2510,6 @@ try_again: (ulong)n, (ulong)offset_high, (ulong)offset, (long)ret); #endif /* __WIN__ */ -#ifdef __WIN__ -error_handling: -#endif retry = os_file_handle_error(NULL, "read"); if (retry) { @@ -2477,13 +2551,11 @@ os_file_read_no_error_handling( #ifdef __WIN__ BOOL ret; DWORD len; - DWORD ret2; - DWORD low; - DWORD high; ibool retry; -#ifndef UNIV_HOTBACKUP - ulint i; -#endif /* !UNIV_HOTBACKUP */ + OVERLAPPED overlapped; + overlapped.Offset = (DWORD)offset; + overlapped.OffsetHigh = (DWORD)offset_high; + /* On 64-bit Windows, ulint is 64 bits. But offset and n should be no more than 32 bits. */ @@ -2498,41 +2570,21 @@ try_again: ut_ad(buf); ut_ad(n > 0); - low = (DWORD) offset; - high = (DWORD) offset_high; - os_mutex_enter(os_file_count_mutex); os_n_pending_reads++; os_mutex_exit(os_file_count_mutex); -#ifndef UNIV_HOTBACKUP - /* Protect the seek / read operation with a mutex */ - i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; - - os_mutex_enter(os_file_seek_mutexes[i]); -#endif /* !UNIV_HOTBACKUP */ - - ret2 = SetFilePointer(file, low, &high, FILE_BEGIN); - - if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) { - -#ifndef UNIV_HOTBACKUP - os_mutex_exit(os_file_seek_mutexes[i]); -#endif /* !UNIV_HOTBACKUP */ - - os_mutex_enter(os_file_count_mutex); - os_n_pending_reads--; - os_mutex_exit(os_file_count_mutex); - - goto error_handling; + memset (&overlapped, 0, sizeof (overlapped)); + overlapped.Offset = (DWORD)offset; + overlapped.OffsetHigh = (DWORD)offset_high; + overlapped.hEvent = win_get_syncio_event(); + ret = ReadFile(file, buf, n, NULL, &overlapped); + if (ret) { + ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, FALSE); } - - ret = ReadFile(file, buf, (DWORD) n, &len, NULL); - -#ifndef UNIV_HOTBACKUP - os_mutex_exit(os_file_seek_mutexes[i]); -#endif /* !UNIV_HOTBACKUP */ - + else if(GetLastError() == ERROR_IO_PENDING) { + ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE); + } os_mutex_enter(os_file_count_mutex); os_n_pending_reads--; os_mutex_exit(os_file_count_mutex); @@ -2554,9 +2606,6 @@ try_again: return(TRUE); } #endif /* __WIN__ */ -#ifdef __WIN__ -error_handling: -#endif retry = os_file_handle_error_no_exit(NULL, "read"); if (retry) { @@ -2609,14 +2658,9 @@ os_file_write( #ifdef __WIN__ BOOL ret; DWORD len; - DWORD ret2; - DWORD low; - DWORD high; ulint n_retries = 0; ulint err; -#ifndef UNIV_HOTBACKUP - ulint i; -#endif /* !UNIV_HOTBACKUP */ + OVERLAPPED overlapped; /* On 64-bit Windows, ulint is 64 bits. But offset and n should be no more than 32 bits. */ @@ -2629,64 +2673,23 @@ os_file_write( ut_ad(buf); ut_ad(n > 0); retry: - low = (DWORD) offset; - high = (DWORD) offset_high; os_mutex_enter(os_file_count_mutex); os_n_pending_writes++; os_mutex_exit(os_file_count_mutex); -#ifndef UNIV_HOTBACKUP - /* Protect the seek / write operation with a mutex */ - i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; - - os_mutex_enter(os_file_seek_mutexes[i]); -#endif /* !UNIV_HOTBACKUP */ - - ret2 = SetFilePointer(file, low, &high, FILE_BEGIN); - - if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) { - -#ifndef UNIV_HOTBACKUP - os_mutex_exit(os_file_seek_mutexes[i]); -#endif /* !UNIV_HOTBACKUP */ - - os_mutex_enter(os_file_count_mutex); - os_n_pending_writes--; - os_mutex_exit(os_file_count_mutex); - - ut_print_timestamp(stderr); - - fprintf(stderr, - " InnoDB: Error: File pointer positioning to" - " file %s failed at\n" - "InnoDB: offset %lu %lu. Operating system" - " error number %lu.\n" - "InnoDB: Some operating system error numbers" - " are described at\n" - "InnoDB: " - REFMAN "operating-system-error-codes.html\n", - name, (ulong) offset_high, (ulong) offset, - (ulong) GetLastError()); + memset (&overlapped, 0, sizeof (overlapped)); + overlapped.Offset = (DWORD)offset; + overlapped.OffsetHigh = (DWORD)offset_high; - return(FALSE); + overlapped.hEvent = win_get_syncio_event(); + ret = WriteFile(file, buf, n, NULL, &overlapped); + if (ret) { + ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, FALSE); } - - ret = WriteFile(file, buf, (DWORD) n, &len, NULL); - - /* Always do fsync to reduce the probability that when the OS crashes, - a database page is only partially physically written to disk. */ - -# ifdef UNIV_DO_FLUSH - if (!os_do_not_call_flush_at_each_write) { - ut_a(TRUE == os_file_flush(file)); + else if(GetLastError() == ERROR_IO_PENDING) { + ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE); } -# endif /* UNIV_DO_FLUSH */ - -#ifndef UNIV_HOTBACKUP - os_mutex_exit(os_file_seek_mutexes[i]); -#endif /* !UNIV_HOTBACKUP */ - os_mutex_enter(os_file_count_mutex); os_n_pending_writes--; os_mutex_exit(os_file_count_mutex); @@ -3071,9 +3074,6 @@ os_aio_array_create( os_aio_array_t* array; ulint i; os_aio_slot_t* slot; -#ifdef WIN_ASYNC_IO - OVERLAPPED* over; -#endif ut_a(n > 0); ut_a(n_segments > 0); @@ -3089,23 +3089,12 @@ os_aio_array_create( array->n_segments = n_segments; array->n_reserved = 0; array->slots = ut_malloc(n * sizeof(os_aio_slot_t)); -#ifdef __WIN__ - array->native_events = ut_malloc(n * sizeof(os_native_event_t)); -#endif + for (i = 0; i < n; i++) { slot = os_aio_array_get_nth_slot(array, i); - slot->pos = i; slot->reserved = FALSE; -#ifdef WIN_ASYNC_IO - slot->event = os_event_create(NULL); - over = &(slot->control); - - over->hEvent = slot->event->handle; - - *((array->native_events) + i) = over->hEvent; -#endif } return(array); @@ -3119,18 +3108,7 @@ os_aio_array_free( /*==============*/ os_aio_array_t* array) /*!< in, own: array to free */ { -#ifdef WIN_ASYNC_IO - ulint i; - for (i = 0; i < array->n_slots; i++) { - os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i); - os_event_free(slot->event); - } -#endif /* WIN_ASYNC_IO */ - -#ifdef __WIN__ - ut_free(array->native_events); -#endif /* __WIN__ */ os_mutex_free(array->mutex); os_event_free(array->not_full); os_event_free(array->is_empty); @@ -3209,7 +3187,11 @@ os_aio_init( } os_last_printout = time(NULL); - +#ifdef _WIN32 + ut_a(completion_port == 0); + completion_port = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0); + ut_a(completion_port); +#endif } /*********************************************************************** @@ -3251,11 +3233,11 @@ os_aio_array_wake_win_aio_at_shutdown( /*==================================*/ os_aio_array_t* array) /*!< in: aio array */ { - ulint i; - - for (i = 0; i < array->n_slots; i++) { + if(completion_port) + { + ut_a(CloseHandle(completion_port)); + completion_port = 0; - os_event_set((array->slots + i)->event); } } #endif @@ -3480,7 +3462,8 @@ found: control = &(slot->control); control->Offset = (DWORD)offset; control->OffsetHigh = (DWORD)offset_high; - os_event_reset(slot->event); + control->hEvent = 0; + slot->arr = array; #endif os_mutex_exit(array->mutex); @@ -3517,9 +3500,6 @@ os_aio_array_free_slot( os_event_set(array->is_empty); } -#ifdef WIN_ASYNC_IO - os_event_reset(slot->event); -#endif os_mutex_exit(array->mutex); } @@ -3689,12 +3669,8 @@ os_aio( os_aio_array_t* array; os_aio_slot_t* slot; #ifdef WIN_ASYNC_IO - ibool retval; - BOOL ret = TRUE; DWORD len = (DWORD) n; - struct fil_node_struct * dummy_mess1; - void* dummy_mess2; - ulint dummy_type; + BOOL ret; #endif ulint err = 0; ibool retry; @@ -3713,26 +3689,23 @@ os_aio( wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER); - if (mode == OS_AIO_SYNC -#ifdef WIN_ASYNC_IO - && !os_aio_use_native_aio -#endif - ) { + if (mode == OS_AIO_SYNC) + { + ibool ret; /* This is actually an ordinary synchronous read or write: - no need to use an i/o-handler thread. NOTE that if we use - Windows async i/o, Windows does not allow us to use - ordinary synchronous os_file_read etc. on the same file, - therefore we have built a special mechanism for synchronous - wait in the Windows case. */ + no need to use an i/o-handler thread */ if (type == OS_FILE_READ) { - return(_os_file_read(file, buf, offset, - offset_high, n, trx)); + ret = _os_file_read(file, buf, offset, + offset_high, n, trx); } + else { + ut_a(type == OS_FILE_WRITE); - ut_a(type == OS_FILE_WRITE); - - return(os_file_write(name, file, buf, offset, offset_high, n)); + ret = os_file_write(name, file, buf, offset, offset_high, n); + } + ut_a(ret); + return ret; } try_again: @@ -3775,6 +3748,8 @@ try_again: ret = ReadFile(file, buf, (DWORD)n, &len, &(slot->control)); + if(!ret && GetLastError() != ERROR_IO_PENDING) + err = 1; #endif } else { if (!wake_later) { @@ -3789,6 +3764,8 @@ try_again: os_n_file_writes++; ret = WriteFile(file, buf, (DWORD)n, &len, &(slot->control)); + if(!ret && GetLastError() != ERROR_IO_PENDING) + err = 1; #endif } else { if (!wake_later) { @@ -3801,34 +3778,7 @@ try_again: ut_error; } -#ifdef WIN_ASYNC_IO - if (os_aio_use_native_aio) { - if ((ret && len == n) - || (!ret && GetLastError() == ERROR_IO_PENDING)) { - /* aio was queued successfully! */ - - if (mode == OS_AIO_SYNC) { - /* We want a synchronous i/o operation on a - file where we also use async i/o: in Windows - we must use the same wait mechanism as for - async i/o */ - - retval = os_aio_windows_handle(ULINT_UNDEFINED, - slot->pos, - &dummy_mess1, - &dummy_mess2, - &dummy_type, - &space_id); - - return(retval); - } - - return(TRUE); - } - err = 1; /* Fall through the next if */ - } -#endif if (err == 0) { /* aio was queued successfully! */ @@ -3881,52 +3831,20 @@ os_aio_windows_handle( ulint* space_id) { ulint orig_seg = segment; - os_aio_array_t* array; os_aio_slot_t* slot; - ulint n; - ulint i; ibool ret_val; BOOL ret; DWORD len; BOOL retry = FALSE; + ULONG_PTR dummy_key; - if (segment == ULINT_UNDEFINED) { - array = os_aio_sync_array; - segment = 0; - } else { - segment = os_aio_get_array_and_local_segment(&array, segment); - } - - /* NOTE! We only access constant fields in os_aio_array. Therefore - we do not have to acquire the protecting mutex yet */ - - ut_ad(os_aio_validate()); - ut_ad(segment < array->n_segments); - - n = array->n_slots; - - if (array == os_aio_sync_array) { - os_event_wait(os_aio_array_get_nth_slot(array, pos)->event); - i = pos; - } else { - srv_set_io_thread_op_info(orig_seg, "wait Windows aio"); - i = os_event_wait_multiple(n, - (array->native_events) - ); - } - - os_mutex_enter(array->mutex); - - slot = os_aio_array_get_nth_slot(array, i); + ret = GetQueuedCompletionStatus(completion_port, &len, &dummy_key, + (OVERLAPPED **)&slot, INFINITE); - ut_a(slot->reserved); - - if (orig_seg != ULINT_UNDEFINED) { - srv_set_io_thread_op_info(orig_seg, - "get windows aio return value"); + if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { + os_thread_exit(NULL); } - ret = GetOverlappedResult(slot->file, &(slot->control), &len, TRUE); *message1 = slot->message1; *message2 = slot->message2; @@ -3951,8 +3869,6 @@ os_aio_windows_handle( ret_val = FALSE; } - os_mutex_exit(array->mutex); - if (retry) { /* retry failed read/write operation synchronously. No need to hold array->mutex. */ @@ -3961,37 +3877,19 @@ os_aio_windows_handle( switch (slot->type) { case OS_FILE_WRITE: - ret = WriteFile(slot->file, slot->buf, - (DWORD) slot->len, &len, - &(slot->control)); - + ret_val = os_file_write(slot->name, slot->file, slot->buf, + slot->control.Offset, slot->control.OffsetHigh, slot->len); break; case OS_FILE_READ: - ret = ReadFile(slot->file, slot->buf, - (DWORD) slot->len, &len, - &(slot->control)); - + ret_val = os_file_read(slot->file, slot->buf, + slot->control.Offset, slot->control.OffsetHigh, slot->len); break; default: ut_error; } - - if (!ret && GetLastError() == ERROR_IO_PENDING) { - /* aio was queued successfully! - We want a synchronous i/o operation on a - file where we also use async i/o: in Windows - we must use the same wait mechanism as for - async i/o */ - - ret = GetOverlappedResult(slot->file, - &(slot->control), - &len, TRUE); - } - - ret_val = ret && len == slot->len; } - os_aio_array_free_slot(array, slot); + os_aio_array_free_slot((os_aio_array_t *)slot->arr, slot); return(ret_val); } diff --git a/storage/xtradb/os/os0sync.c b/storage/xtradb/os/os0sync.c index dba997927cb..75bd6d44c2e 100644 --- a/storage/xtradb/os/os0sync.c +++ b/storage/xtradb/os/os0sync.c @@ -38,6 +38,7 @@ Created 9/6/1995 Heikki Tuuri #include "ut0mem.h" #include "srv0start.h" +#include "srv0srv.h" /* Type definition for an operating system mutex struct */ struct os_mutex_struct{ @@ -74,11 +75,225 @@ UNIV_INTERN ulint os_event_count = 0; UNIV_INTERN ulint os_mutex_count = 0; UNIV_INTERN ulint os_fast_mutex_count = 0; +/* The number of microsecnds in a second. */ +static const ulint MICROSECS_IN_A_SECOND = 1000000; + /* Because a mutex is embedded inside an event and there is an event embedded inside a mutex, on free, this generates a recursive call. This version of the free event function doesn't acquire the global lock */ static void os_event_free_internal(os_event_t event); +/* On Windows (Vista and later), load function pointers for condition +variable handling. Those functions are not available in prior versions, +so we have to use them via runtime loading, as long as we support XP. */ +static void os_cond_module_init(void); + +#ifdef __WIN__ +/* Prototypes and function pointers for condition variable functions */ +typedef VOID (WINAPI* InitializeConditionVariableProc) + (PCONDITION_VARIABLE ConditionVariable); +static InitializeConditionVariableProc initialize_condition_variable; + +typedef BOOL (WINAPI* SleepConditionVariableCSProc) + (PCONDITION_VARIABLE ConditionVariable, + PCRITICAL_SECTION CriticalSection, + DWORD dwMilliseconds); +static SleepConditionVariableCSProc sleep_condition_variable; + +typedef VOID (WINAPI* WakeAllConditionVariableProc) + (PCONDITION_VARIABLE ConditionVariable); +static WakeAllConditionVariableProc wake_all_condition_variable; + +typedef VOID (WINAPI* WakeConditionVariableProc) + (PCONDITION_VARIABLE ConditionVariable); +static WakeConditionVariableProc wake_condition_variable; +#endif + +/*********************************************************//** +Initialitze condition variable */ +UNIV_INLINE +void +os_cond_init( +/*=========*/ + os_cond_t* cond) /*!< in: condition variable. */ +{ + ut_a(cond); + +#ifdef __WIN__ + ut_a(initialize_condition_variable != NULL); + initialize_condition_variable(cond); +#else + ut_a(pthread_cond_init(cond, NULL) == 0); +#endif +} + +/*********************************************************//** +Do a timed wait on condition variable. +@return TRUE if timed out, FALSE otherwise */ +UNIV_INLINE +ibool +os_cond_wait_timed( +/*===============*/ + os_cond_t* cond, /*!< in: condition variable. */ + os_fast_mutex_t* mutex, /*!< in: fast mutex */ +#ifndef __WIN__ + const struct timespec* abstime /*!< in: timeout */ +#else + DWORD time_in_ms /*!< in: timeout in + milliseconds*/ +#endif /* !__WIN__ */ +) +{ +#ifdef __WIN__ + BOOL ret; + DWORD err; + + ut_a(sleep_condition_variable != NULL); + + ret = sleep_condition_variable(cond, mutex, time_in_ms); + + if (!ret) { + err = GetLastError(); + /* From http://msdn.microsoft.com/en-us/library/ms686301%28VS.85%29.aspx, + "Condition variables are subject to spurious wakeups + (those not associated with an explicit wake) and stolen wakeups + (another thread manages to run before the woken thread)." + Check for both types of timeouts. + Conditions are checked by the caller.*/ + if ((err == WAIT_TIMEOUT) || (err == ERROR_TIMEOUT)) { + return(TRUE); + } + } + + ut_a(ret); + + return(FALSE); +#else + int ret; + + ret = pthread_cond_timedwait(cond, mutex, abstime); + + switch (ret) { + case 0: + case ETIMEDOUT: + /* We play it safe by checking for EINTR even though + according to the POSIX documentation it can't return EINTR. */ + case EINTR: + break; + + default: + fprintf(stderr, " InnoDB: pthread_cond_timedwait() returned: " + "%d: abstime={%lu,%lu}\n", + ret, (ulong) abstime->tv_sec, (ulong) abstime->tv_nsec); + ut_error; + } + + return(ret == ETIMEDOUT); +#endif +} +/*********************************************************//** +Wait on condition variable */ +UNIV_INLINE +void +os_cond_wait( +/*=========*/ + os_cond_t* cond, /*!< in: condition variable. */ + os_fast_mutex_t* mutex) /*!< in: fast mutex */ +{ + ut_a(cond); + ut_a(mutex); + +#ifdef __WIN__ + ut_a(sleep_condition_variable != NULL); + ut_a(sleep_condition_variable(cond, mutex, INFINITE)); +#else + ut_a(pthread_cond_wait(cond, mutex) == 0); +#endif +} + +/*********************************************************//** +Wakes all threads waiting for condition variable */ +UNIV_INLINE +void +os_cond_broadcast( +/*==============*/ + os_cond_t* cond) /*!< in: condition variable. */ +{ + ut_a(cond); + +#ifdef __WIN__ + ut_a(wake_all_condition_variable != NULL); + wake_all_condition_variable(cond); +#else + ut_a(pthread_cond_broadcast(cond) == 0); +#endif +} + +/*********************************************************//** +Wakes one thread waiting for condition variable */ +UNIV_INLINE +void +os_cond_signal( +/*==========*/ + os_cond_t* cond) /*!< in: condition variable. */ +{ + ut_a(cond); + +#ifdef __WIN__ + ut_a(wake_condition_variable != NULL); + wake_condition_variable(cond); +#else + ut_a(pthread_cond_signal(cond) == 0); +#endif +} + +/*********************************************************//** +Destroys condition variable */ +UNIV_INLINE +void +os_cond_destroy( +/*============*/ + os_cond_t* cond) /*!< in: condition variable. */ +{ +#ifdef __WIN__ + /* Do nothing */ +#else + ut_a(pthread_cond_destroy(cond) == 0); +#endif +} + +/*********************************************************//** +On Windows (Vista and later), load function pointers for condition variable +handling. Those functions are not available in prior versions, so we have to +use them via runtime loading, as long as we support XP. */ +static +void +os_cond_module_init(void) +/*=====================*/ +{ +#ifdef __WIN__ + HMODULE h_dll; + + + h_dll = GetModuleHandle("kernel32"); + + initialize_condition_variable = (InitializeConditionVariableProc) + GetProcAddress(h_dll, "InitializeConditionVariable"); + sleep_condition_variable = (SleepConditionVariableCSProc) + GetProcAddress(h_dll, "SleepConditionVariableCS"); + wake_all_condition_variable = (WakeAllConditionVariableProc) + GetProcAddress(h_dll, "WakeAllConditionVariable"); + wake_condition_variable = (WakeConditionVariableProc) + GetProcAddress(h_dll, "WakeConditionVariable"); + + /* When using native condition variables, check function pointers */ + ut_a(initialize_condition_variable); + ut_a(sleep_condition_variable); + ut_a(wake_all_condition_variable); + ut_a(wake_condition_variable); +#endif +} + /*********************************************************//** Initializes global event and OS 'slow' mutex lists. */ UNIV_INTERN @@ -92,6 +307,9 @@ os_sync_init(void) os_sync_mutex = NULL; os_sync_mutex_inited = FALSE; + /* Now for Windows only */ + os_cond_module_init(); + os_sync_mutex = os_mutex_create(NULL); os_sync_mutex_inited = TRUE; @@ -146,42 +364,45 @@ os_event_create( const char* name) /*!< in: the name of the event, if NULL the event is created without a name */ { -#ifdef __WIN__ - os_event_t event; - - event = ut_malloc(sizeof(struct os_event_struct)); - - event->handle = CreateEvent(NULL, /* No security attributes */ - TRUE, /* Manual reset */ - FALSE, /* Initial state nonsignaled */ - (LPCTSTR) name); - if (!event->handle) { - fprintf(stderr, - "InnoDB: Could not create a Windows event semaphore;" - " Windows error %lu\n", - (ulong) GetLastError()); - } -#else /* Unix */ os_event_t event; - UT_NOT_USED(name); +#ifdef __WIN__ + if(!srv_use_native_conditions) { + + event = ut_malloc(sizeof(struct os_event_struct)); + + event->handle = CreateEvent(NULL, + TRUE, + FALSE, + (LPCTSTR) name); + if (!event->handle) { + fprintf(stderr, + "InnoDB: Could not create a Windows event" + " semaphore; Windows error %lu\n", + (ulong) GetLastError()); + } + } else /* Windows with condition variables */ +#endif - event = ut_malloc(sizeof(struct os_event_struct)); + { + UT_NOT_USED(name); - os_fast_mutex_init(&(event->os_mutex)); + event = ut_malloc(sizeof(struct os_event_struct)); - ut_a(0 == pthread_cond_init(&(event->cond_var), NULL)); + os_fast_mutex_init(&(event->os_mutex)); - event->is_set = FALSE; + os_cond_init(&(event->cond_var)); - /* We return this value in os_event_reset(), which can then be - be used to pass to the os_event_wait_low(). The value of zero - is reserved in os_event_wait_low() for the case when the - caller does not want to pass any signal_count value. To - distinguish between the two cases we initialize signal_count - to 1 here. */ - event->signal_count = 1; -#endif /* __WIN__ */ + event->is_set = FALSE; + + /* We return this value in os_event_reset(), which can then be + be used to pass to the os_event_wait_low(). The value of zero + is reserved in os_event_wait_low() for the case when the + caller does not want to pass any signal_count value. To + distinguish between the two cases we initialize signal_count + to 1 here. */ + event->signal_count = 1; + } /* The os_sync_mutex can be NULL because during startup an event can be created [ because it's embedded in the mutex/rwlock ] before @@ -211,10 +432,15 @@ os_event_set( /*=========*/ os_event_t event) /*!< in: event to set */ { -#ifdef __WIN__ ut_a(event); - ut_a(SetEvent(event->handle)); -#else + +#ifdef __WIN__ + if (!srv_use_native_conditions) { + ut_a(SetEvent(event->handle)); + return; + } +#endif + ut_a(event); os_fast_mutex_lock(&(event->os_mutex)); @@ -224,11 +450,10 @@ os_event_set( } else { event->is_set = TRUE; event->signal_count += 1; - ut_a(0 == pthread_cond_broadcast(&(event->cond_var))); + os_cond_broadcast(&(event->cond_var)); } os_fast_mutex_unlock(&(event->os_mutex)); -#endif } /**********************************************************//** @@ -247,12 +472,14 @@ os_event_reset( { ib_int64_t ret = 0; -#ifdef __WIN__ ut_a(event); - ut_a(ResetEvent(event->handle)); -#else - ut_a(event); +#ifdef __WIN__ + if(!srv_use_native_conditions) { + ut_a(ResetEvent(event->handle)); + return(0); + } +#endif os_fast_mutex_lock(&(event->os_mutex)); @@ -264,7 +491,6 @@ os_event_reset( ret = event->signal_count; os_fast_mutex_unlock(&(event->os_mutex)); -#endif return(ret); } @@ -277,17 +503,20 @@ os_event_free_internal( os_event_t event) /*!< in: event to free */ { #ifdef __WIN__ - ut_a(event); + if(!srv_use_native_conditions) { + ut_a(event); + ut_a(CloseHandle(event->handle)); + } else +#endif + { + ut_a(event); - ut_a(CloseHandle(event->handle)); -#else - ut_a(event); + /* This is to avoid freeing the mutex twice */ + os_fast_mutex_free(&(event->os_mutex)); - /* This is to avoid freeing the mutex twice */ - os_fast_mutex_free(&(event->os_mutex)); + os_cond_destroy(&(event->cond_var)); + } - ut_a(0 == pthread_cond_destroy(&(event->cond_var))); -#endif /* Remove from the list of events */ UT_LIST_REMOVE(os_event_list, os_event_list, event); @@ -306,16 +535,18 @@ os_event_free( os_event_t event) /*!< in: event to free */ { -#ifdef __WIN__ ut_a(event); +#ifdef __WIN__ + if(!srv_use_native_conditions){ + ut_a(CloseHandle(event->handle)); + } else /*Windows with condition variables */ +#endif + { + os_fast_mutex_free(&(event->os_mutex)); - ut_a(CloseHandle(event->handle)); -#else - ut_a(event); + os_cond_destroy(&(event->cond_var)); + } - os_fast_mutex_free(&(event->os_mutex)); - ut_a(0 == pthread_cond_destroy(&(event->cond_var))); -#endif /* Remove from the list of events */ os_mutex_enter(os_sync_mutex); @@ -358,23 +589,24 @@ os_event_wait_low( returned by previous call of os_event_reset(). */ { -#ifdef __WIN__ - DWORD err; + + ib_int64_t old_signal_count; - ut_a(event); +#ifdef __WIN__ + if(!srv_use_native_conditions) { + DWORD err; - UT_NOT_USED(reset_sig_count); + ut_a(event); - /* Specify an infinite time limit for waiting */ - err = WaitForSingleObject(event->handle, INFINITE); + UT_NOT_USED(reset_sig_count); - ut_a(err == WAIT_OBJECT_0); + /* Specify an infinite wait */ + err = WaitForSingleObject(event->handle, INFINITE); - if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { - os_thread_exit(NULL); + ut_a(err == WAIT_OBJECT_0); + return; } -#else - ib_int64_t old_signal_count; +#endif os_fast_mutex_lock(&(event->os_mutex)); @@ -399,13 +631,12 @@ os_event_wait_low( return; } - pthread_cond_wait(&(event->cond_var), &(event->os_mutex)); + os_cond_wait(&(event->cond_var), &(event->os_mutex)); /* Solaris manual said that spurious wakeups may occur: we have to check if the event really has been signaled after we came here to wait */ } -#endif } /**********************************************************//** @@ -414,112 +645,112 @@ a timeout is exceeded. @return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */ UNIV_INTERN ulint -os_event_wait_time( -/*===============*/ - os_event_t event, /*!< in: event to wait */ - ulint wtime) /*!< in: timeout in microseconds, or - OS_SYNC_INFINITE_TIME */ +os_event_wait_time_low( +/*===================*/ + os_event_t event, /*!< in: event to wait */ + ulint time_in_usec, /*!< in: timeout in + microseconds, or + OS_SYNC_INFINITE_TIME */ + ib_int64_t reset_sig_count) /*!< in: zero or the value + returned by previous call of + os_event_reset(). */ + { + ibool timed_out = FALSE; + #ifdef __WIN__ - DWORD err; + DWORD time_in_ms; - ut_a(event); + if (!srv_use_native_conditions) { + DWORD err; - if (wtime != OS_SYNC_INFINITE_TIME) { - err = WaitForSingleObject(event->handle, (DWORD) wtime / 1000); - } else { - err = WaitForSingleObject(event->handle, INFINITE); - } + ut_a(event); - if (err == WAIT_OBJECT_0) { + if (time_in_usec != OS_SYNC_INFINITE_TIME) { + time_in_ms = time_in_usec / 1000; + err = WaitForSingleObject(event->handle, time_in_ms); + } else { + err = WaitForSingleObject(event->handle, INFINITE); + } - return(0); - } else if (err == WAIT_TIMEOUT) { + if (err == WAIT_OBJECT_0) { + return(0); + } else if ((err == WAIT_TIMEOUT) || (err == ERROR_TIMEOUT)) { + return(OS_SYNC_TIME_EXCEEDED); + } - return(OS_SYNC_TIME_EXCEEDED); - } else { ut_error; - return(1000000); /* dummy value to eliminate compiler warn. */ + /* Dummy value to eliminate compiler warning. */ + return(42); + } else { + ut_a(sleep_condition_variable != NULL); + + if (time_in_usec != OS_SYNC_INFINITE_TIME) { + time_in_ms = time_in_usec / 1000; + } else { + time_in_ms = INFINITE; + } } #else - int err; - int ret = 0; - ulint tmp; - ib_int64_t old_count; - struct timeval tv_start; - struct timespec timeout; - - if (wtime == OS_SYNC_INFINITE_TIME) { - os_event_wait(event); - return 0; - } + struct timespec abstime; - /* Compute the absolute point in time at which to time out. */ - gettimeofday(&tv_start, NULL); - tmp = tv_start.tv_usec + wtime; - timeout.tv_sec = tv_start.tv_sec + (tmp / 1000000); - timeout.tv_nsec = (tmp % 1000000) * 1000; + if (time_in_usec != OS_SYNC_INFINITE_TIME) { + struct timeval tv; + int ret; + ulint sec; + ulint usec; - os_fast_mutex_lock(&(event->os_mutex)); - old_count = event->signal_count; + ret = ut_usectime(&sec, &usec); + ut_a(ret == 0); - for (;;) { - if (event->is_set == TRUE || event->signal_count != old_count) - break; + tv.tv_sec = sec; + tv.tv_usec = usec; - err = pthread_cond_timedwait(&(event->cond_var), - &(event->os_mutex), &timeout); - if (err == ETIMEDOUT) { - ret = OS_SYNC_TIME_EXCEEDED; - break; + tv.tv_usec += time_in_usec; + + if ((ulint) tv.tv_usec >= MICROSECS_IN_A_SECOND) { + tv.tv_sec += time_in_usec / MICROSECS_IN_A_SECOND; + tv.tv_usec %= MICROSECS_IN_A_SECOND; } + + abstime.tv_sec = tv.tv_sec; + abstime.tv_nsec = tv.tv_usec * 1000; + } else { + abstime.tv_nsec = 999999999; + abstime.tv_sec = (time_t) ULINT_MAX; } - os_fast_mutex_unlock(&(event->os_mutex)); + ut_a(abstime.tv_nsec <= 999999999); + +#endif /* __WIN__ */ - if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { + os_fast_mutex_lock(&event->os_mutex); - os_thread_exit(NULL); + if (!reset_sig_count) { + reset_sig_count = event->signal_count; } - return ret; -#endif -} + do { + if (event->is_set || event->signal_count != reset_sig_count) { -#ifdef __WIN__ -/**********************************************************//** -Waits for any event in an OS native event array. Returns if even a single -one is signaled or becomes signaled. -@return index of the event which was signaled */ -UNIV_INTERN -ulint -os_event_wait_multiple( -/*===================*/ - ulint n, /*!< in: number of events in the - array */ - os_native_event_t* native_event_array) - /*!< in: pointer to an array of event - handles */ -{ - DWORD index; + break; + } - ut_a(native_event_array); - ut_a(n > 0); + timed_out = os_cond_wait_timed( + &event->cond_var, &event->os_mutex, +#ifndef __WIN__ + &abstime +#else + time_in_ms +#endif /* !__WIN__ */ + ); - index = WaitForMultipleObjects((DWORD) n, native_event_array, - FALSE, /* Wait for any 1 event */ - INFINITE); /* Infinite wait time - limit */ - ut_a(index >= WAIT_OBJECT_0); /* NOTE: Pointless comparison */ - ut_a(index < WAIT_OBJECT_0 + n); + } while (!timed_out); - if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { - os_thread_exit(NULL); - } + os_fast_mutex_unlock(&event->os_mutex); - return(index - WAIT_OBJECT_0); + return(timed_out ? OS_SYNC_TIME_EXCEEDED : 0); } -#endif /*********************************************************//** Creates an operating system mutex semaphore. Because these are slow, the @@ -532,15 +763,6 @@ os_mutex_create( const char* name) /*!< in: the name of the mutex, if NULL the mutex is created without a name */ { -#ifdef __WIN__ - HANDLE mutex; - os_mutex_t mutex_str; - - mutex = CreateMutex(NULL, /* No security attributes */ - FALSE, /* Initial state: no owner */ - (LPCTSTR) name); - ut_a(mutex); -#else os_fast_mutex_t* mutex; os_mutex_t mutex_str; @@ -549,7 +771,6 @@ os_mutex_create( mutex = ut_malloc(sizeof(os_fast_mutex_t)); os_fast_mutex_init(mutex); -#endif mutex_str = ut_malloc(sizeof(os_mutex_str_t)); mutex_str->handle = mutex; @@ -580,25 +801,11 @@ os_mutex_enter( /*===========*/ os_mutex_t mutex) /*!< in: mutex to acquire */ { -#ifdef __WIN__ - DWORD err; - - ut_a(mutex); - - /* Specify infinite time limit for waiting */ - err = WaitForSingleObject(mutex->handle, INFINITE); - - ut_a(err == WAIT_OBJECT_0); - - (mutex->count)++; - ut_a(mutex->count == 1); -#else os_fast_mutex_lock(mutex->handle); (mutex->count)++; ut_a(mutex->count == 1); -#endif } /**********************************************************//** @@ -614,11 +821,7 @@ os_mutex_exit( ut_a(mutex->count == 1); (mutex->count)--; -#ifdef __WIN__ - ut_a(ReleaseMutex(mutex->handle)); -#else os_fast_mutex_unlock(mutex->handle); -#endif } /**********************************************************//** @@ -647,15 +850,9 @@ os_mutex_free( os_mutex_exit(os_sync_mutex); } -#ifdef __WIN__ - ut_a(CloseHandle(mutex->handle)); - - ut_free(mutex); -#else os_fast_mutex_free(mutex->handle); ut_free(mutex->handle); ut_free(mutex); -#endif } /*********************************************************//** diff --git a/storage/xtradb/srv/srv0srv.c b/storage/xtradb/srv/srv0srv.c index 1e078e1a6e9..9a142e1ca86 100644 --- a/storage/xtradb/srv/srv0srv.c +++ b/storage/xtradb/srv/srv0srv.c @@ -139,6 +139,20 @@ UNIV_INTERN ulint srv_check_file_format_at_startup = DICT_TF_FORMAT_MAX; /** Place locks to records only i.e. do not use next-key locking except on duplicate key checking and foreign key checking */ UNIV_INTERN ibool srv_locks_unsafe_for_binlog = FALSE; +#ifdef __WIN__ +/* Windows native condition variables. We use runtime loading / function +pointers, because they are not available on Windows Server 2003 and +Windows XP/2000. + +We use condition for events on Windows if possible, even if os_event +resembles Windows kernel event object well API-wise. The reason is +performance, kernel objects are heavyweights and WaitForSingleObject() is a +performance killer causing calling thread to context switch. Besides, Innodb +is preallocating large number (often millions) of os_events. With kernel event +objects it takes a big chunk out of non-paged pool, which is better suited +for tasks like IO than for storing idle event objects. */ +UNIV_INTERN ibool srv_use_native_conditions = FALSE; +#endif /* __WIN__ */ UNIV_INTERN ulint srv_n_data_files = 0; UNIV_INTERN char** srv_data_file_names = NULL; diff --git a/storage/xtradb/srv/srv0start.c b/storage/xtradb/srv/srv0start.c index cef045d72e1..d002a1bb682 100644 --- a/storage/xtradb/srv/srv0start.c +++ b/storage/xtradb/srv/srv0start.c @@ -1265,6 +1265,7 @@ innobase_start_or_create_for_mysql(void) case OS_WIN95: case OS_WIN31: case OS_WINNT: + srv_use_native_conditions = FALSE; /* On Win 95, 98, ME, Win32 subsystem for Windows 3.1, and NT use simulated aio. In NT Windows provides async i/o, but when run in conjunction with InnoDB Hot Backup, it seemed @@ -1272,24 +1273,26 @@ innobase_start_or_create_for_mysql(void) os_aio_use_native_aio = FALSE; break; - default: - /* On Win 2000 and XP use async i/o */ - //os_aio_use_native_aio = TRUE; - os_aio_use_native_aio = FALSE; - fprintf(stderr, - "InnoDB: Windows native async i/o is disabled as default.\n" - "InnoDB: It is not applicable for the current" - " multi io threads implementation.\n"); - break; + + case OS_WIN2000: + case OS_WINXP: + /* On 2000 and XP, async IO is available, but no condition variables. */ + os_aio_use_native_aio = TRUE; + srv_use_native_conditions = FALSE; + break; + + default: + os_aio_use_native_aio = TRUE; + srv_use_native_conditions = TRUE; } #endif + if (srv_file_flush_method_str == NULL) { /* These are the default options */ srv_unix_file_flush_method = SRV_UNIX_FSYNC; srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; -#ifndef __WIN__ } else if (0 == ut_strcmp(srv_file_flush_method_str, "fsync")) { srv_unix_file_flush_method = SRV_UNIX_FSYNC; @@ -1307,7 +1310,7 @@ innobase_start_or_create_for_mysql(void) } else if (0 == ut_strcmp(srv_file_flush_method_str, "nosync")) { srv_unix_file_flush_method = SRV_UNIX_NOSYNC; -#else +#ifdef _WIN32 } else if (0 == ut_strcmp(srv_file_flush_method_str, "normal")) { srv_win_file_flush_method = SRV_WIN_IO_NORMAL; os_aio_use_native_aio = FALSE; @@ -1315,16 +1318,10 @@ innobase_start_or_create_for_mysql(void) } else if (0 == ut_strcmp(srv_file_flush_method_str, "unbuffered")) { srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; os_aio_use_native_aio = FALSE; - } else if (0 == ut_strcmp(srv_file_flush_method_str, "async_unbuffered")) { srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; os_aio_use_native_aio = TRUE; - srv_n_read_io_threads = srv_n_write_io_threads = 1; - fprintf(stderr, - "InnoDB: 'async_unbuffered' was detected as innodb_flush_method.\n" - "InnoDB: Windows native async i/o is enabled.\n" - "InnoDB: And io threads are restricted.\n"); #endif } else { fprintf(stderr, |