summary | refs | log | tree | commit | diff
path: root/include
diff options
context:
space:
mode:
author Marko Mäkelä <marko.makela@mariadb.com> 2019-06-27 10:53:18 +0300
committer Marko Mäkelä <marko.makela@mariadb.com> 2019-06-27 10:53:18 +0300
commit 042fc2959705eba79b2eae6031d5f9ca5c454c01 (patch)
tree 939ae0ae22856c8b4804f84f2f5438640c8cc35b /include
parent 620f4f8af98666e2efb7e14fb2663ab85b52bc12 (diff)
download mariadb-git-042fc2959705eba79b2eae6031d5f9ca5c454c01.tar.gz
MDEV-19845: Adaptive spin loops
Starting with the Intel Skylake microarchitecture, the PAUSE instruction latency is about 140 clock cycles instead of the earlier 10. On AMD processors, the latency could be 10 or 50 clock cycles, depending on the microarchitecture. Because of this wide range of latencies, let us scale the loops around the PAUSE instruction based on timing results obtained at server startup. my_cpu_relax_multiplier: New variable: how many times to invoke PAUSE in a loop. Only defined for IA-32 and AMD64. my_cpu_init(): Determine with RDTSC the time to run 16 PAUSE instructions in each of two unrolled loops, and based on the quicker of the two runs, initialize my_cpu_relax_multiplier. This form of calibration was suggested by Mikhail Sinyavin from Intel. LF_BACKOFF(), ut_delay(): Use my_cpu_relax_multiplier when available. ut_delay(): Define inline in my_cpu.h. UT_COMPILER_BARRIER(): Remove. This does not seem to have any effect, because in our ut_delay() implementation, no computations are being performed inside the loop. The purpose of UT_COMPILER_BARRIER() was to prohibit the compiler from reordering computations. It was not emitting any code.
Diffstat (limited to 'include')
-rw-r--r-- include/my_cpu.h 54
1 files changed, 41 insertions, 13 deletions
diff --git a/include/my_cpu.h b/include/my_cpu.h
index b5665fc108c..0e37eafe60e 100644
--- a/include/my_cpu.h
+++ b/include/my_cpu.h
@@ -46,10 +46,20 @@
#define HMT_high()
#endif
+#if defined __i386__ || defined __x86_64__ || defined _WIN32
+# define HAVE_PAUSE_INSTRUCTION /* added in Intel Pentium 4 */
+#endif
static inline void MY_RELAX_CPU(void)
{
-#ifdef HAVE_PAUSE_INSTRUCTION
+#ifdef _WIN32
+ /*
+ In the Win32 API, the x86 PAUSE instruction is executed by calling
+ the YieldProcessor macro defined in WinNT.h. It is a CPU architecture-
+ independent way by using YieldProcessor.
+ */
+ YieldProcessor();
+#elif defined HAVE_PAUSE_INSTRUCTION
/*
According to the gcc info page, asm volatile means that the
instruction has important side-effects and must not be removed.
@@ -61,16 +71,6 @@ static inline void MY_RELAX_CPU(void)
#else
__asm__ __volatile__ ("pause");
#endif
-
-#elif defined(HAVE_FAKE_PAUSE_INSTRUCTION)
- __asm__ __volatile__ ("rep; nop");
-#elif defined _WIN32
- /*
- In the Win32 API, the x86 PAUSE instruction is executed by calling
- the YieldProcessor macro defined in WinNT.h. It is a CPU architecture-
- independent way by using YieldProcessor.
- */
- YieldProcessor();
#elif defined(_ARCH_PWR8)
__ppc_get_timebase();
#else
@@ -81,6 +81,20 @@ static inline void MY_RELAX_CPU(void)
}
+#ifdef HAVE_PAUSE_INSTRUCTION
+# ifdef __cplusplus
+extern "C" {
+# endif
+extern unsigned my_cpu_relax_multiplier;
+void my_cpu_init(void);
+# ifdef __cplusplus
+}
+# endif
+#else
+# define my_cpu_relax_multiplier 200
+# define my_cpu_init() /* nothing */
+#endif
+
/*
LF_BACKOFF should be used to improve performance on hyperthreaded CPUs. Intel
recommends to use it in spin loops also on non-HT machines to reduce power
@@ -94,9 +108,23 @@ static inline void MY_RELAX_CPU(void)
static inline int LF_BACKOFF(void)
{
- int i;
- for (i= 0; i < 200; i++)
+ unsigned i= my_cpu_relax_multiplier;
+ while (i--)
MY_RELAX_CPU();
return 1;
}
+
+/**
+ Run a delay loop while waiting for a shared resource to be released.
+ @param delay originally, roughly microseconds on 100 MHz Intel Pentium
+*/
+static inline void ut_delay(unsigned delay)
+{
+ unsigned i= my_cpu_relax_multiplier / 4 * delay;
+ HMT_low();
+ while (i--)
+ MY_RELAX_CPU();
+ HMT_medium();
+}
+
#endif