diff options
author | Marko Mäkelä <marko.makela@mariadb.com> | 2019-06-27 10:53:18 +0300 |
---|---|---|
committer | Marko Mäkelä <marko.makela@mariadb.com> | 2019-06-27 10:53:18 +0300 |
commit | 042fc2959705eba79b2eae6031d5f9ca5c454c01 (patch) | |
tree | 939ae0ae22856c8b4804f84f2f5438640c8cc35b | |
parent | 620f4f8af98666e2efb7e14fb2663ab85b52bc12 (diff) | |
download | mariadb-git-042fc2959705eba79b2eae6031d5f9ca5c454c01.tar.gz |
MDEV-19845: Adaptive spin loops
Starting with the Intel Skylake microarchitecture, the PAUSE
instruction latency is about 140 clock cycles instead of the earlier 10.
On AMD processors, the latency could be 10 or 50 clock cycles,
depending on microarchitecture.
Because of this big range of latency, let us scale the loops around
the PAUSE instruction based on timing results at server startup.
my_cpu_relax_multiplier: New variable: How many times to invoke PAUSE
in a loop. Only defined for IA-32 and AMD64.
my_cpu_init(): Determine with RDTSC the time to run 16 PAUSE instructions
in each of two unrolled loops, and based on the quicker of the two
runs, initialize my_cpu_relax_multiplier. This form of calibration was
suggested by Mikhail Sinyavin from Intel.
LF_BACKOFF(), ut_delay(): Use my_cpu_relax_multiplier when available.
ut_delay(): Define inline in my_cpu.h.
UT_COMPILER_BARRIER(): Remove. This does not seem to have any effect,
because in our ut_delay() implementation, no computations are being
performed inside the loop. The purpose of UT_COMPILER_BARRIER() was to
prohibit the compiler from reordering computations. It was not
emitting any code.
-rw-r--r-- | config.h.cmake | 2 | ||||
-rw-r--r-- | configure.cmake | 26 | ||||
-rw-r--r-- | include/my_cpu.h | 54 | ||||
-rw-r--r-- | mysys/CMakeLists.txt | 2 | ||||
-rw-r--r-- | mysys/my_cpu.c | 81 | ||||
-rw-r--r-- | sql/mysqld.cc | 1 | ||||
-rw-r--r-- | storage/innobase/include/ib0mutex.h | 5 | ||||
-rw-r--r-- | storage/innobase/include/ut0ut.h | 17 | ||||
-rw-r--r-- | storage/innobase/ut/ut0ut.cc | 23 |
9 files changed, 128 insertions, 83 deletions
diff --git a/config.h.cmake b/config.h.cmake index b8a77899c4d..765d75dfb23 100644 --- a/config.h.cmake +++ b/config.h.cmake @@ -187,8 +187,6 @@ #cmakedefine HAVE_LINUX_FALLOC_H 1 #cmakedefine HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE 1 #cmakedefine HAVE_PREAD 1 -#cmakedefine HAVE_PAUSE_INSTRUCTION 1 -#cmakedefine HAVE_FAKE_PAUSE_INSTRUCTION 1 #cmakedefine HAVE_RDTSCLL 1 #cmakedefine HAVE_READ_REAL_TIME 1 #cmakedefine HAVE_PTHREAD_ATTR_CREATE 1 diff --git a/configure.cmake b/configure.cmake index 67ed6503e3e..e75810f8150 100644 --- a/configure.cmake +++ b/configure.cmake @@ -758,32 +758,6 @@ IF(NOT C_HAS_inline) ENDIF() ENDIF() -IF(NOT CMAKE_CROSSCOMPILING AND NOT MSVC) - STRING(TOLOWER ${CMAKE_SYSTEM_PROCESSOR} processor) - IF(processor MATCHES "86" OR processor MATCHES "amd64" OR processor MATCHES "x64") - #Check for x86 PAUSE instruction - # We have to actually try running the test program, because of a bug - # in Solaris on x86_64, where it wrongly reports that PAUSE is not - # supported when trying to run an application. 
See - # http://bugs.opensolaris.org/bugdatabase/printableBug.do?bug_id=6478684 - CHECK_C_SOURCE_RUNS(" - int main() - { - __asm__ __volatile__ (\"pause\"); - return 0; - }" HAVE_PAUSE_INSTRUCTION) - ENDIF() - IF (NOT HAVE_PAUSE_INSTRUCTION) - CHECK_C_SOURCE_COMPILES(" - int main() - { - __asm__ __volatile__ (\"rep; nop\"); - return 0; - } - " HAVE_FAKE_PAUSE_INSTRUCTION) - ENDIF() -ENDIF() - CHECK_SYMBOL_EXISTS(tcgetattr "termios.h" HAVE_TCGETATTR 1) # diff --git a/include/my_cpu.h b/include/my_cpu.h index b5665fc108c..0e37eafe60e 100644 --- a/include/my_cpu.h +++ b/include/my_cpu.h @@ -46,10 +46,20 @@ #define HMT_high() #endif +#if defined __i386__ || defined __x86_64__ || defined _WIN32 +# define HAVE_PAUSE_INSTRUCTION /* added in Intel Pentium 4 */ +#endif static inline void MY_RELAX_CPU(void) { -#ifdef HAVE_PAUSE_INSTRUCTION +#ifdef _WIN32 + /* + In the Win32 API, the x86 PAUSE instruction is executed by calling + the YieldProcessor macro defined in WinNT.h. It is a CPU architecture- + independent way by using YieldProcessor. + */ + YieldProcessor(); +#elif defined HAVE_PAUSE_INSTRUCTION /* According to the gcc info page, asm volatile means that the instruction has important side-effects and must not be removed. @@ -61,16 +71,6 @@ static inline void MY_RELAX_CPU(void) #else __asm__ __volatile__ ("pause"); #endif - -#elif defined(HAVE_FAKE_PAUSE_INSTRUCTION) - __asm__ __volatile__ ("rep; nop"); -#elif defined _WIN32 - /* - In the Win32 API, the x86 PAUSE instruction is executed by calling - the YieldProcessor macro defined in WinNT.h. It is a CPU architecture- - independent way by using YieldProcessor. 
- */ - YieldProcessor(); #elif defined(_ARCH_PWR8) __ppc_get_timebase(); #else @@ -81,6 +81,20 @@ static inline void MY_RELAX_CPU(void) } +#ifdef HAVE_PAUSE_INSTRUCTION +# ifdef __cplusplus +extern "C" { +# endif +extern unsigned my_cpu_relax_multiplier; +void my_cpu_init(void); +# ifdef __cplusplus +} +# endif +#else +# define my_cpu_relax_multiplier 200 +# define my_cpu_init() /* nothing */ +#endif + /* LF_BACKOFF should be used to improve performance on hyperthreaded CPUs. Intel recommends to use it in spin loops also on non-HT machines to reduce power @@ -94,9 +108,23 @@ static inline void MY_RELAX_CPU(void) static inline int LF_BACKOFF(void) { - int i; - for (i= 0; i < 200; i++) + unsigned i= my_cpu_relax_multiplier; + while (i--) MY_RELAX_CPU(); return 1; } + +/** + Run a delay loop while waiting for a shared resource to be released. + @param delay originally, roughly microseconds on 100 MHz Intel Pentium +*/ +static inline void ut_delay(unsigned delay) +{ + unsigned i= my_cpu_relax_multiplier / 4 * delay; + HMT_low(); + while (i--) + MY_RELAX_CPU(); + HMT_medium(); +} + #endif diff --git a/mysys/CMakeLists.txt b/mysys/CMakeLists.txt index 6990d1350e3..438d6b428e0 100644 --- a/mysys/CMakeLists.txt +++ b/mysys/CMakeLists.txt @@ -44,7 +44,7 @@ SET(MYSYS_SOURCES array.c charset-def.c charset.c checksum.c my_default.c my_getncpus.c my_safehash.c my_chmod.c my_rnd.c my_uuid.c wqueue.c waiting_threads.c ma_dyncol.c ../sql-common/my_time.c my_rdtsc.c my_context.c psi_noop.c - my_atomic_writes.c my_likely.c + my_atomic_writes.c my_cpu.c my_likely.c file_logger.c my_dlerror.c) IF (WIN32) diff --git a/mysys/my_cpu.c b/mysys/my_cpu.c new file mode 100644 index 00000000000..cd13624df0f --- /dev/null +++ b/mysys/my_cpu.c @@ -0,0 +1,81 @@ +/* Copyright (c) 2019, MariaDB Corporation. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +#include <my_global.h> +#include <my_cpu.h> + +#ifdef HAVE_PAUSE_INSTRUCTION +/** How many times to invoke PAUSE in a loop */ +unsigned my_cpu_relax_multiplier = 200; + +# include <stdint.h> + +# ifdef _MSC_VER +# include <intrin.h> +# else +# include <x86intrin.h> +# endif + +#define PAUSE4 MY_RELAX_CPU(); MY_RELAX_CPU(); MY_RELAX_CPU(); MY_RELAX_CPU() +#define PAUSE16 PAUSE4; PAUSE4; PAUSE4; PAUSE4 + +/** + Initialize my_cpu_relax_multiplier. + + Determine the duration of a PAUSE instruction by running an + unrolled loop of 16 PAUSE instructions twice, and taking the + faster of the two runs. In this way, even if the execution is + interrupted by the operating system, it should be extremely + unlikely that both loops get interrupted. + + On the Intel Skylake microarchitecture, the PAUSE instruction takes + around 140 clock cycles, while on earlier microarchitectures it could + be 10 clock cycles or less. Scale the PAUSE loop counter accordingly. + + On a pre-Skylake Intel Xeon CPU E5-2630 v4 @ 2.20GHz running an AMD64 + executable, the numbers would be between 172 and 220 when all the code + is inlined as follows: + + rdtsc,mov,shl,or, 16*pause, + rdtsc,mov,shl,or, 16*pause, + rdtsc. + + That would yield 11 to 14 cycles per PAUSE instruction even if we + (wrongly) ignore the overhead of the other instructions. 
+ + On a Skylake mobile processor Intel Core i7-6500U CPU @ 2.50GHz, the + numbers would range from 1896 to 2410 (or 1976 if taking the minimum + of two runs), yielding 118 to 151 (or 123) cycles per PAUSE instruction. + + Let us define a threshold at roughly 30 cycles per PAUSE instruction, + and use a shorter delay if the PAUSE instruction takes longer than + that. In some AMD processors, the PAUSE instruction could take 40 or + 50 cycles. Let us use a shorter delay multiplier for them as well. + + The 1/10 scaling factor (200/20) was derived experimentally by + Mikhail Sinyavin from Intel. +*/ +void my_cpu_init(void) +{ + uint64_t t0, t1, t2; + t0= __rdtsc(); + PAUSE16; + t1= __rdtsc(); + PAUSE16; + t2= __rdtsc(); + if (t2 - t1 > 30 * 16 && t1 - t0 > 30 * 16) + my_cpu_relax_multiplier= 20; +} +#endif diff --git a/sql/mysqld.cc b/sql/mysqld.cc index 4b31b78fc21..f98f51c73d0 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -5113,6 +5113,7 @@ static int init_server_components() We need to call each of these following functions to ensure that all things are initialized so that unireg_abort() doesn't fail */ + my_cpu_init(); mdl_init(); if (tdc_init() || hostname_cache_init()) unireg_abort(1); diff --git a/storage/innobase/include/ib0mutex.h b/storage/innobase/include/ib0mutex.h index 1b8ec8d0fe4..3e99eb79416 100644 --- a/storage/innobase/include/ib0mutex.h +++ b/storage/innobase/include/ib0mutex.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. All Rights Reserved. +Copyright (c) 2017, 2019, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -29,8 +29,7 @@ Created 2013-03-26 Sunny Bains. 
#ifndef ib0mutex_h #define ib0mutex_h -#include "ut0ut.h" -#include "ut0rnd.h" +#include "my_cpu.h" #include "os0event.h" #include "sync0arr.h" diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h index b54e41ea614..51e00c6f0fe 100644 --- a/storage/innobase/include/ut0ut.h +++ b/storage/innobase/include/ut0ut.h @@ -54,14 +54,6 @@ Created 1/20/1994 Heikki Tuuri /** Time stamp */ typedef time_t ib_time_t; -#if defined (__GNUC__) -# define UT_COMPILER_BARRIER() __asm__ __volatile__ ("":::"memory") -#elif defined (_MSC_VER) -# define UT_COMPILER_BARRIER() _ReadWriteBarrier() -#else -# define UT_COMPILER_BARRIER() -#endif - /*********************************************************************//** Delays execution for at most max_wait_us microseconds or returns earlier if cond becomes true. @@ -270,14 +262,7 @@ void ut_sprintf_timestamp( /*=================*/ char* buf); /*!< in: buffer where to sprintf */ -/*************************************************************//** -Runs an idle loop on CPU. The argument gives the desired delay -in microseconds on 100 MHz Pentium + Visual C++. -@return dummy value */ -void -ut_delay( -/*=====*/ - ulint delay); /*!< in: delay in microseconds on 100 MHz Pentium */ + /*************************************************************//** Prints the contents of a memory buffer in hex and ascii. */ void diff --git a/storage/innobase/ut/ut0ut.cc b/storage/innobase/ut/ut0ut.cc index 7ee015f8f38..42054f309c7 100644 --- a/storage/innobase/ut/ut0ut.cc +++ b/storage/innobase/ut/ut0ut.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2019, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -284,27 +284,6 @@ ut_sprintf_timestamp( } /*************************************************************//** -Runs an idle loop on CPU. The argument gives the desired delay -in microseconds on 100 MHz Pentium + Visual C++. -@return dummy value */ -void -ut_delay( -/*=====*/ - ulint delay) /*!< in: delay in microseconds on 100 MHz Pentium */ -{ - ulint i; - - HMT_low(); - - for (i = 0; i < delay * 50; i++) { - MY_RELAX_CPU(); - UT_COMPILER_BARRIER(); - } - - HMT_medium(); -} - -/*************************************************************//** Prints the contents of a memory buffer in hex and ascii. */ void ut_print_buf( |