summaryrefslogtreecommitdiff
path: root/mysys/my_rdtsc.c
diff options
context:
space:
mode:
authorMarc Alff <marc.alff@sun.com>2009-11-24 16:36:31 -0700
committerMarc Alff <marc.alff@sun.com>2009-11-24 16:36:31 -0700
commit6e01be7552aa5c5fb854d30d5d8cb61c2729b292 (patch)
tree06a753b068f7fb21faaa840cf0808981acaaabb2 /mysys/my_rdtsc.c
parentbd6467805a4d232718589f8ba0f2b84493843aba (diff)
downloadmariadb-git-6e01be7552aa5c5fb854d30d5d8cb61c2729b292.tar.gz
WL#2373 Use cycle counter for timing
Diffstat (limited to 'mysys/my_rdtsc.c')
-rw-r--r--mysys/my_rdtsc.c1004
1 files changed, 1004 insertions, 0 deletions
diff --git a/mysys/my_rdtsc.c b/mysys/my_rdtsc.c
new file mode 100644
index 00000000000..4227498b92c
--- /dev/null
+++ b/mysys/my_rdtsc.c
@@ -0,0 +1,1004 @@
+/* Copyright (C) 2008, 2009 Sun Microsystems, Inc
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ rdtsc3 -- multi-platform timer code
+ pgulutzan@mysql.com, 2005-08-29
+ modified 2008-11-02
+
+ Functions:
+
+ my_timer_cycles ulonglong cycles
+ my_timer_nanoseconds ulonglong nanoseconds
+ my_timer_microseconds ulonglong "microseconds"
+ my_timer_milliseconds ulonglong milliseconds
+ my_timer_ticks ulonglong ticks
+ my_timer_init initialization / test
+
+ We'll call the first 5 functions (the ones that return
+ a ulonglong) "my_timer_xxx" functions.
+ Each my_timer_xxx function returns a 64-bit timing value
+ since an arbitrary 'epoch' start. Since the only purpose
+ is to determine elapsed times, wall-clock time-of-day
+ is not known and not relevant.
+
+ The my_timer_init function is necessary for initializing.
+ It returns information (underlying routine name,
+ frequency, resolution, overhead) about all my_timer_xxx
+ functions. A program should call my_timer_init once,
+ use the information to decide what my_timer_xxx function
+ to use, and subsequently call that function by function
+ pointer.
+
+ A typical use would be:
+ my_timer_init() ... once, at program start
+ ...
+ time1= my_timer_xxx() ... time before start
+ [code that's timed]
+ time2= my_timer_xxx() ... time after end
+ elapsed_time= (time2 - time1) - overhead
+*/
+
+#include "my_global.h"
+#include "my_rdtsc.h"
+
+#if defined(_WIN32)
+#include <stdio.h>
+#include "windows.h"
+#else
+#include <stdio.h>
+#endif
+
+#if !defined(_WIN32)
+#if TIME_WITH_SYS_TIME
+#include <sys/time.h>
+#include <time.h> /* for clock_gettime */
+#else
+#if HAVE_SYS_TIME_H
+#include <sys/time.h>
+#elif defined(HAVE_TIME_H)
+#include <time.h>
+#endif
+#endif
+#endif
+
+#if defined(HAVE_ASM_MSR_H) && defined(HAVE_RDTSCLL)
+#include <asm/msr.h> /* for rdtscll */
+#endif
+
+#if defined(HAVE_SYS_TIMEB_H) && defined(HAVE_FTIME)
+#include <sys/timeb.h> /* for ftime */
+#endif
+
+#if defined(HAVE_SYS_TIMES_H) && defined(HAVE_TIMES)
+#include <sys/times.h> /* for times */
+#endif
+
+#if defined(__NETWARE__)
+#include <nks/time.h> /* for NXGetTime */
+#endif
+
+#if defined(__INTEL_COMPILER) && defined(__ia64__) && defined(HAVE_IA64INTRIN_H)
+#include <ia64intrin.h> /* for __GetReg */
+#endif
+
+#if defined(__APPLE__) && defined(__MACH__)
+#include <mach/mach_time.h>
+#endif
+
+#if defined(__SUNPRO_CC) && defined(__sparcv9) && defined(_LP64) && !defined(__SunOS_5_7)
+extern "C" ulonglong my_timer_cycles_il_sparc64();
+#elif defined(__SUNPRO_CC) && defined(__sparcv8plus) && defined(_ILP32) && !defined(__SunOS_5_7)
+extern "C" ulonglong my_timer_cycles_il_sparc32();
+#elif defined(__SUNPRO_CC) && defined(__i386) && defined(_ILP32)
+extern "C" ulonglong my_timer_cycles_il_i386();
+#elif defined(__SUNPRO_CC) && defined(__x86_64) && defined(_LP64)
+extern "C" ulonglong my_timer_cycles_il_x86_64();
+#elif defined(__SUNPRO_C) && defined(__sparcv9) && defined(_LP64) && !defined(__SunOS_5_7)
+ulonglong my_timer_cycles_il_sparc64();
+#elif defined(__SUNPRO_C) && defined(__sparcv8plus) && defined(_ILP32) && !defined(__SunOS_5_7)
+ulonglong my_timer_cycles_il_sparc32();
+#elif defined(__SUNPRO_C) && defined(__i386) && defined(_ILP32)
+ulonglong my_timer_cycles_il_i386();
+#elif defined(__SUNPRO_C) && defined(__x86_64) && defined(_LP64)
+ulonglong my_timer_cycles_il_x86_64();
+#endif
+
+#if defined(__INTEL_COMPILER)
+/*
+ icc warning #1011 is:
+ missing return statement at end of non-void function
+*/
+#pragma warning (disable:1011)
+#endif
+
+/*
+ For cycles, we depend on RDTSC for x86 platforms,
+ or on time buffer (which is not really a cycle count
+ but a separate counter with less than nanosecond
+ resolution) for most PowerPC platforms, or on
+ gethrtime which is okay for hpux and solaris, or on
+ clock_gettime(CLOCK_SGI_CYCLE) for Irix platforms,
+ or on read_real_time for aix platforms. There is
+ nothing for Alpha platforms, they would be tricky.
+*/
+
+ulonglong my_timer_cycles(void)
+{
+#if defined(__GNUC__) && defined(__i386__)
+ /* This works much better if compiled with "gcc -O3". */
+ ulonglong result;
+ __asm__ __volatile__ ("rdtsc" : "=A" (result));
+ return result;
+#elif defined(__SUNPRO_C) && defined(__i386)
+ __asm("rdtsc");
+#elif defined(__GNUC__) && defined(__x86_64__)
+ ulonglong result;
+ __asm__ __volatile__ ("rdtsc\n\t" \
+ "shlq $32,%%rdx\n\t" \
+ "orq %%rdx,%%rax"
+ : "=a" (result) :: "%edx");
+ return result;
+#elif defined(HAVE_ASM_MSR_H) && defined(HAVE_RDTSCLL)
+ {
+ ulonglong result;
+ rdtscll(result);
+ return result;
+ }
+#elif defined(_WIN32) && defined(_M_IX86)
+ __asm {rdtsc};
+#elif defined(_WIN64) && defined(_M_X64)
+ /* For 64-bit Windows: unsigned __int64 __rdtsc(); */
+ return __rdtsc();
+#elif defined(__INTEL_COMPILER) && defined(__ia64__) && defined(HAVE_IA64INTRIN_H)
+ return (ulonglong) __getReg(_IA64_REG_AR_ITC); /* (3116) */
+#elif defined(__GNUC__) && defined(__ia64__)
+ {
+ ulonglong result;
+ __asm __volatile__ ("mov %0=ar.itc" : "=r" (result));
+ return result;
+ }
+#elif defined(__GNUC__) && (defined(__powerpc__) || defined(__POWERPC__) || (defined(_POWER) && defined(_AIX52))) && (defined(__64BIT__) || defined(_ARCH_PPC64))
+ {
+ ulonglong result;
+ __asm __volatile__ ("mftb %0" : "=r" (result));
+ return result;
+ }
+#elif defined(__GNUC__) && (defined(__powerpc__) || defined(__POWERPC__) || (defined(_POWER) && defined(_AIX52))) && (!defined(__64BIT__) && !defined(_ARCH_PPC64))
+ {
+ /*
+ mftbu means "move from time-buffer-upper to result".
+ The loop is saying: x1=upper, x2=lower, x3=upper,
+ if x1!=x3 there was an overflow so repeat.
+ */
+ unsigned int x1, x2, x3;
+ ulonglong result;
+ for (;;)
+ {
+ __asm __volatile__ ( "mftbu %0" : "=r"(x1) );
+ __asm __volatile__ ( "mftb %0" : "=r"(x2) );
+ __asm __volatile__ ( "mftbu %0" : "=r"(x3) );
+ if (x1 == x3) break;
+ }
+ result = x1;
+ return ( result << 32 ) | x2;
+ }
+#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__sparcv9) && defined(_LP64) && !defined(__SunOS_5_7)
+ return (my_timer_cycles_il_sparc64());
+#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__sparcv8plus) && defined(_ILP32) && !defined(__SunOS_5_7)
+ return (my_timer_cycles_il_sparc32());
+#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__i386) && defined(_ILP32)
+ /* This is probably redundant for __SUNPRO_C. */
+ return (my_timer_cycles_il_i386());
+#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__x86_64) && defined(_LP64)
+ return (my_timer_cycles_il_x86_64());
+#elif defined(__GNUC__) && defined(__sparcv9) && defined(_LP64) && (__GNUC__>2)
+ {
+ ulonglong result;
+ __asm __volatile__ ("rd %%tick,%0" : "=r" (result));
+ return result;
+ }
+#elif defined(__GNUC__) && defined(__sparc__) && !defined(_LP64) && (__GNUC__>2)
+ {
+ union {
+ ulonglong wholeresult;
+ struct {
+ ulong high;
+ ulong low;
+ } splitresult;
+ } result;
+ __asm __volatile__ ("rd %%tick,%1; srlx %1,32,%0" : "=r" (result.splitresult.high), "=r" (result.splitresult.low));
+ return result.wholeresult;
+ }
+#elif defined(__sgi) && defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE)
+ {
+ struct timespec tp;
+ clock_gettime(CLOCK_SGI_CYCLE, &tp);
+ return (ulonglong) tp.tv_sec * 1000000000 + (ulonglong) tp.tv_nsec;
+ }
+#elif defined(HAVE_SYS_TIMES_H) && defined(HAVE_GETHRTIME)
+ /* gethrtime may appear as either cycle or nanosecond counter */
+ return (ulonglong) gethrtime();
+#else
+ return 0;
+#endif
+}
+
+#if defined(__INTEL_COMPILER)
+/* re-enable warning#1011 which was only for my_timer_cycles() */
+/* There may be an icc bug which means we must leave disabled. */
+#pragma warning (default:1011)
+#endif
+
+/*
+ For nanoseconds, most platforms have nothing available that
+ (a) doesn't require bringing in a 40-kb librt.so library
+ (b) really has nanosecond resolution.
+*/
+
+ulonglong my_timer_nanoseconds(void)
+{
+#if defined(HAVE_READ_REAL_TIME)
+ {
+ timebasestruct_t tr;
+ read_real_time(&tr, TIMEBASE_SZ);
+ return (ulonglong) tr.tb_high * 1000000000 + (ulonglong) tr.tb_low;
+ }
+#elif defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_REALTIME)
+ {
+ struct timespec tp;
+ clock_gettime(CLOCK_REALTIME, &tp);
+ return (ulonglong) tp.tv_sec * 1000000000 + (ulonglong) tp.tv_nsec;
+ }
+#elif defined(HAVE_SYS_TIMES_H) && defined(HAVE_GETHRTIME)
+ /* SunOS 5.10+, Solaris, HP-UX: hrtime_t gethrtime(void) */
+ return (ulonglong) gethrtime();
+#elif defined(__NETWARE__)
+ {
+ NXTime_t tm;
+ NXGetTime(NX_SINCE_1970, NX_NSECONDS, &tm);
+ return (ulonglong) tm;
+ }
+#elif defined(__APPLE__) && defined(__MACH__)
+ {
+ ulonglong tm;
+ static mach_timebase_info_data_t timebase_info= {0,0};
+ if (timebase_info.denom == 0)
+ (void) mach_timebase_info(&timebase_info);
+ tm= mach_absolute_time();
+ return (tm * timebase_info.numer) / timebase_info.denom;
+ }
+#else
+ return 0;
+#endif
+}
+
+/*
+ For microseconds, gettimeofday() is available on
+ almost all platforms. On Windows we use
+ QueryPerformanceCounter which will usually tick over
+ 3.5 million times per second, and we don't throw
+ away the extra precision. (On Windows Server 2003
+ the frequency is same as the cycle frequency.)
+*/
+
+ulonglong my_timer_microseconds(void)
+{
+#if defined(HAVE_GETTIMEOFDAY)
+ {
+ static ulonglong last_value= 0;
+ struct timeval tv;
+ if (gettimeofday(&tv, NULL) == 0)
+ last_value= (ulonglong) tv.tv_sec * 1000000 + (ulonglong) tv.tv_usec;
+ else
+ {
+ /*
+ There are reports that gettimeofday(2) can have intermittent failures
+ on some platform, see for example Bug#36819.
+ We are not trying again or looping, just returning the best value possible
+ under the circumstances ...
+ */
+ last_value++;
+ }
+ return last_value;
+ }
+#elif defined(_WIN32)
+ {
+ /* QueryPerformanceCounter usually works with about 1/3 microsecond. */
+ LARGE_INTEGER t_cnt;
+
+ QueryPerformanceCounter(&t_cnt);
+ return (ulonglong) t_cnt.QuadPart;
+ }
+#elif defined(__NETWARE__)
+ {
+ NXTime_t tm;
+ NXGetTime(NX_SINCE_1970, NX_USECONDS, &tm);
+ return (ulonglong) tm;
+ }
+#else
+ return 0;
+#endif
+}
+
+/*
+ For milliseconds, we use ftime() if it's supported
+ or time()*1000 if it's not. With modern versions of
+ Windows and with HP Itanium, resolution is 10-15
+ milliseconds.
+*/
+
+ulonglong my_timer_milliseconds(void)
+{
+#if defined(HAVE_SYS_TIMEB_H) && defined(HAVE_FTIME)
+ /* ftime() is obsolete but maybe the platform is old */
+ struct timeb ft;
+ ftime(&ft);
+ return (ulonglong)ft.time * 1000 + (ulonglong)ft.millitm;
+#elif defined(HAVE_TIME)
+ return (ulonglong) time(NULL) * 1000;
+#elif defined(_WIN32)
+ FILETIME ft;
+ GetSystemTimeAsFileTime( &ft );
+ return ((ulonglong)ft.dwLowDateTime +
+ (((ulonglong)ft.dwHighDateTime) << 32))/10000;
+#elif defined(__NETWARE__)
+ {
+ NXTime_t tm;
+ NXGetTime(NX_SINCE_1970, NX_MSECONDS, &tm);
+ return (ulonglong)tm;
+ }
+#else
+ return 0;
+#endif
+}
+
+/*
+ For ticks, which we handle with times(), the frequency
+ is usually 100/second and the overhead is surprisingly
+ bad, sometimes even worse than gettimeofday's overhead.
+*/
+
+ulonglong my_timer_ticks(void)
+{
+#if defined(HAVE_SYS_TIMES_H) && defined(HAVE_TIMES)
+ {
+ struct tms times_buf;
+ return (ulonglong) times(&times_buf);
+ }
+#elif defined(__NETWARE__)
+ {
+ NXTime_t tm;
+ NXGetTime(NX_SINCE_BOOT, NX_TICKS, &tm);
+ return (ulonglong) tm;
+ }
+#elif defined(_WIN32)
+ return (ulonglong) GetTickCount();
+#else
+ return 0;
+#endif
+}
+
+/*
+ The my_timer_init() function and its sub-functions
+ have several loops which call timers. If there's
+ something wrong with a timer -- which has never
+ happened in tests -- we want the loop to end after
+ an arbitrary number of iterations, and my_timer_info
+ will show a discouraging result. The arbitrary
+ number is 1,000,000.
+*/
+#define MY_TIMER_ITERATIONS 1000000
+
+/*
+ Calculate overhead. Called from my_timer_init().
+ Usually best_timer_overhead = cycles.overhead or
+ nanoseconds.overhead, so returned amount is in
+ cycles or nanoseconds. We repeat the calculation
+ ten times, so that we can disregard effects of
+ caching or interrupts. Result is quite consistent
+ for cycles, at least. But remember it's a minimum.
+*/
+
+static void my_timer_init_overhead(ulonglong *overhead,
+ ulonglong (*cycle_timer)(void),
+ ulonglong (*this_timer)(void),
+ ulonglong best_timer_overhead)
+{
+ ulonglong time1, time2;
+ int i;
+
+ /* *overhead, least of 20 calculations - cycles.overhead */
+ for (i= 0, *overhead= 1000000000; i < 20; ++i)
+ {
+ time1= cycle_timer();
+ this_timer(); /* rather than 'time_tmp= timer();' */
+ time2= cycle_timer() - time1;
+ if (*overhead > time2)
+ *overhead= time2;
+ }
+ *overhead-= best_timer_overhead;
+}
+
+/*
+ Calculate Resolution. Called from my_timer_init().
+ If a timer goes up by jumps, e.g. 1050, 1075, 1100, ...
+ then the best resolution is the minimum jump, e.g. 25.
+ If it's always divisible by 1000 then it's just a
+ result of multiplication of a lower-precision timer
+ result, e.g. nanoseconds are often microseconds * 1000.
+ If the minimum jump is less than an arbitrary passed
+ figure (a guess based on maximum overhead * 2), ignore.
+ Usually we end up with nanoseconds = 1 because it's too
+ hard to detect anything <= 100 nanoseconds.
+ Often GetTickCount() has resolution = 15.
+ We don't check with ticks because they take too long.
+*/
+static ulonglong my_timer_init_resolution(ulonglong (*this_timer)(void),
+ ulonglong overhead_times_2)
+{
+ ulonglong time1, time2;
+ ulonglong best_jump;
+ int i, jumps, divisible_by_1000, divisible_by_1000000;
+
+ divisible_by_1000= divisible_by_1000000= 0;
+ best_jump= 1000000;
+ for (i= jumps= 0; jumps < 3 && i < MY_TIMER_ITERATIONS * 10; ++i)
+ {
+ time1= this_timer();
+ time2= this_timer();
+ time2-= time1;
+ if (time2)
+ {
+ ++jumps;
+ if (!(time2 % 1000))
+ {
+ ++divisible_by_1000;
+ if (!(time2 % 1000000))
+ ++divisible_by_1000000;
+ }
+ if (best_jump > time2)
+ best_jump= time2;
+ /* For milliseconds, one jump is enough. */
+ if (overhead_times_2 == 0)
+ break;
+ }
+ }
+ if (jumps == 3)
+ {
+ if (jumps == divisible_by_1000000)
+ return 1000000;
+ if (jumps == divisible_by_1000)
+ return 1000;
+ }
+ if (best_jump > overhead_times_2)
+ return best_jump;
+ return 1;
+}
+
+/*
+ Calculate cycle frequency by seeing how many cycles pass
+ in a 200-microsecond period. I tried with 10-microsecond
+ periods originally, and the result was often very wrong.
+*/
+
+static ulonglong my_timer_init_frequency(MY_TIMER_INFO *mti)
+{
+ int i;
+ ulonglong time1, time2, time3, time4;
+ time1= my_timer_cycles();
+ time2= my_timer_microseconds();
+ time3= time2; /* Avoids a Microsoft/IBM compiler warning */
+ for (i= 0; i < MY_TIMER_ITERATIONS; ++i)
+ {
+ time3= my_timer_microseconds();
+ if (time3 - time2 > 200) break;
+ }
+ time4= my_timer_cycles() - mti->cycles.overhead;
+ time4-= mti->microseconds.overhead;
+ return (mti->microseconds.frequency * (time4 - time1)) / (time3 - time2);
+}
+
+/*
+ Call my_timer_init before the first call to my_timer_xxx().
+ If something must be initialized, it happens here.
+ Set: what routine is being used e.g. "asm_x86"
+ Set: function, overhead, actual frequency, resolution.
+*/
+
+void my_timer_init(MY_TIMER_INFO *mti)
+{
+ ulonglong (*best_timer)(void);
+ ulonglong best_timer_overhead;
+ ulonglong time1, time2;
+ int i;
+
+ /* cycles */
+ mti->cycles.frequency= 1000000000;
+#if defined(__GNUC__) && defined(__i386__)
+ mti->cycles.routine= MY_TIMER_ROUTINE_ASM_X86;
+#elif defined(__SUNPRO_C) && defined(__i386)
+ mti->cycles.routine= MY_TIMER_ROUTINE_ASM_X86;
+#elif defined(__GNUC__) && defined(__x86_64__)
+ mti->cycles.routine= MY_TIMER_ROUTINE_ASM_X86_64;
+#elif defined(HAVE_ASM_MSR_H) && defined(HAVE_RDTSCLL)
+ mti->cycles.routine= MY_TIMER_ROUTINE_RDTSCLL;
+#elif defined(_WIN32) && defined(_M_IX86)
+ mti->cycles.routine= MY_TIMER_ROUTINE_ASM_X86_WIN;
+#elif defined(_WIN64) && defined(_M_X64)
+ mti->cycles.routine= MY_TIMER_ROUTINE_RDTSC;
+#elif defined(__INTEL_COMPILER) && defined(__ia64__) && defined(HAVE_IA64INTRIN_H)
+ mti->cycles.routine= MY_TIMER_ROUTINE_ASM_IA64;
+#elif defined(__GNUC__) && defined(__ia64__)
+ mti->cycles.routine= MY_TIMER_ROUTINE_ASM_IA64;
+#elif defined(__GNUC__) && (defined(__powerpc__) || defined(__POWERPC__) || (defined(_POWER) && defined(_AIX52))) && (defined(__64BIT__) || defined(_ARCH_PPC64))
+ mti->cycles.routine= MY_TIMER_ROUTINE_ASM_PPC64;
+#elif defined(__GNUC__) && (defined(__powerpc__) || defined(__POWERPC__) || (defined(_POWER) && defined(_AIX52))) && (!defined(__64BIT__) && !defined(_ARCH_PPC64))
+ mti->cycles.routine= MY_TIMER_ROUTINE_ASM_PPC;
+#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__sparcv9) && defined(_LP64) && !defined(__SunOS_5_7)
+ mti->cycles.routine= MY_TIMER_ROUTINE_ASM_SUNPRO_SPARC64;
+#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__sparcv8plus) && defined(_ILP32) && !defined(__SunOS_5_7)
+ mti->cycles.routine= MY_TIMER_ROUTINE_ASM_SUNPRO_SPARC32;
+#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__i386) && defined(_ILP32)
+ mti->cycles.routine= MY_TIMER_ROUTINE_ASM_SUNPRO_I386;
+#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__x86_64) && defined(_LP64)
+ mti->cycles.routine= MY_TIMER_ROUTINE_ASM_SUNPRO_X86_64;
+#elif defined(__GNUC__) && defined(__sparcv9) && defined(_LP64) && (__GNUC__>2)
+ mti->cycles.routine= MY_TIMER_ROUTINE_ASM_GCC_SPARC64;
+#elif defined(__GNUC__) && defined(__sparc__) && !defined(_LP64) && (__GNUC__>2)
+ mti->cycles.routine= MY_TIMER_ROUTINE_ASM_GCC_SPARC32;
+#elif defined(__sgi) && defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE)
+ mti->cycles.routine= MY_TIMER_ROUTINE_SGI_CYCLE;
+#elif defined(HAVE_SYS_TIMES_H) && defined(HAVE_GETHRTIME)
+ mti->cycles.routine= MY_TIMER_ROUTINE_GETHRTIME;
+#else
+ mti->cycles.routine= 0;
+#endif
+
+ if (!mti->cycles.routine || !my_timer_cycles())
+ {
+ mti->cycles.routine= 0;
+ mti->cycles.resolution= 0;
+ mti->cycles.frequency= 0;
+ mti->cycles.overhead= 0;
+ }
+
+ /* nanoseconds */
+ mti->nanoseconds.frequency= 1000000000; /* initial assumption */
+#if defined(HAVE_READ_REAL_TIME)
+ mti->nanoseconds.routine= MY_TIMER_ROUTINE_READ_REAL_TIME;
+#elif defined(HAVE_CLOCK_GETTIME)
+ mti->nanoseconds.routine= MY_TIMER_ROUTINE_CLOCK_GETTIME;
+#elif defined(HAVE_SYS_TIMES_H) && defined(HAVE_GETHRTIME)
+ mti->nanoseconds.routine= MY_TIMER_ROUTINE_GETHRTIME;
+#elif defined(__NETWARE__)
+ mti->nanoseconds.routine= MY_TIMER_ROUTINE_NXGETTIME;
+#elif defined(__APPLE__) && defined(__MACH__)
+ mti->nanoseconds.routine= MY_TIMER_ROUTINE_MACH_ABSOLUTE_TIME;
+#else
+ mti->nanoseconds.routine= 0;
+#endif
+ if (!mti->nanoseconds.routine || !my_timer_nanoseconds())
+ {
+ mti->nanoseconds.routine= 0;
+ mti->nanoseconds.resolution= 0;
+ mti->nanoseconds.frequency= 0;
+ mti->nanoseconds.overhead= 0;
+ }
+
+ /* microseconds */
+ mti->microseconds.frequency= 1000000; /* initial assumption */
+#if defined(HAVE_GETTIMEOFDAY)
+ mti->microseconds.routine= MY_TIMER_ROUTINE_GETTIMEOFDAY;
+#elif defined(_WIN32)
+ {
+ LARGE_INTEGER li;
+ /* Windows: typical frequency = 3579545, actually 1/3 microsecond. */
+ if (!QueryPerformanceFrequency(&li))
+ mti->microseconds.routine= 0;
+ else
+ {
+ mti->microseconds.frequency= li.QuadPart;
+ mti->microseconds.routine= MY_TIMER_ROUTINE_QUERYPERFORMANCECOUNTER;
+ }
+ }
+#elif defined(__NETWARE__)
+ mti->microseconds.routine= MY_TIMER_ROUTINE_NXGETTIME;
+#else
+ mti->microseconds.routine= 0;
+#endif
+ if (!mti->microseconds.routine || !my_timer_microseconds())
+ {
+ mti->microseconds.routine= 0;
+ mti->microseconds.resolution= 0;
+ mti->microseconds.frequency= 0;
+ mti->microseconds.overhead= 0;
+ }
+
+ /* milliseconds */
+ mti->milliseconds.frequency= 1000; /* initial assumption */
+#if defined(HAVE_SYS_TIMEB_H) && defined(HAVE_FTIME)
+ mti->milliseconds.routine= MY_TIMER_ROUTINE_FTIME;
+#elif defined(_WIN32)
+ mti->milliseconds.routine= MY_TIMER_ROUTINE_GETSYSTEMTIMEASFILETIME;
+#elif defined(__NETWARE__)
+ mti->milliseconds.routine= MY_TIMER_ROUTINE_NXGETTIME;
+#elif defined(HAVE_TIME)
+ mti->milliseconds.routine= MY_TIMER_ROUTINE_TIME;
+#else
+ mti->milliseconds.routine= 0;
+#endif
+ if (!mti->milliseconds.routine || !my_timer_milliseconds())
+ {
+ mti->milliseconds.routine= 0;
+ mti->milliseconds.resolution= 0;
+ mti->milliseconds.frequency= 0;
+ mti->milliseconds.overhead= 0;
+ }
+
+ /* ticks */
+ mti->ticks.frequency= 100; /* permanent assumption */
+#if defined(HAVE_SYS_TIMES_H) && defined(HAVE_TIMES)
+ mti->ticks.routine= MY_TIMER_ROUTINE_TIMES;
+#elif defined(__NETWARE__)
+ mti->ticks.routine= MY_TIMER_ROUTINE_NXGETTIME;
+#elif defined(_WIN32)
+ mti->ticks.routine= MY_TIMER_ROUTINE_GETTICKCOUNT;
+#else
+ mti->ticks.routine= 0;
+#endif
+ if (!mti->ticks.routine || !my_timer_ticks())
+ {
+ mti->ticks.routine= 0;
+ mti->ticks.resolution= 0;
+ mti->ticks.frequency= 0;
+ mti->ticks.overhead= 0;
+ }
+
+ /*
+ Calculate overhead in terms of the timer that
+ gives the best resolution: cycles or nanoseconds.
+ I doubt it ever will be as bad as microseconds.
+ */
+ if (mti->cycles.routine)
+ best_timer= &my_timer_cycles;
+ else
+ {
+ if (mti->nanoseconds.routine)
+ {
+ best_timer= &my_timer_nanoseconds;
+ }
+ else
+ best_timer= &my_timer_microseconds;
+ }
+
+ /* best_timer_overhead = least of 20 calculations */
+ for (i= 0, best_timer_overhead= 1000000000; i < 20; ++i)
+ {
+ time1= best_timer();
+ time2= best_timer() - time1;
+ if (best_timer_overhead > time2)
+ best_timer_overhead= time2;
+ }
+ if (mti->cycles.routine)
+ my_timer_init_overhead(&mti->cycles.overhead,
+ best_timer,
+ &my_timer_cycles,
+ best_timer_overhead);
+ if (mti->nanoseconds.routine)
+ my_timer_init_overhead(&mti->nanoseconds.overhead,
+ best_timer,
+ &my_timer_nanoseconds,
+ best_timer_overhead);
+ if (mti->microseconds.routine)
+ my_timer_init_overhead(&mti->microseconds.overhead,
+ best_timer,
+ &my_timer_microseconds,
+ best_timer_overhead);
+ if (mti->milliseconds.routine)
+ my_timer_init_overhead(&mti->milliseconds.overhead,
+ best_timer,
+ &my_timer_milliseconds,
+ best_timer_overhead);
+ if (mti->ticks.routine)
+ my_timer_init_overhead(&mti->ticks.overhead,
+ best_timer,
+ &my_timer_ticks,
+ best_timer_overhead);
+
+/*
+ Calculate resolution for nanoseconds or microseconds
+ or milliseconds, by seeing if it's always divisible
+ by 1000, and by noticing how much jumping occurs.
+ For ticks, just assume the resolution is 1.
+*/
+ if (mti->cycles.routine)
+ mti->cycles.resolution= 1;
+ if (mti->nanoseconds.routine)
+ mti->nanoseconds.resolution=
+ my_timer_init_resolution(&my_timer_nanoseconds, 20000);
+ if (mti->microseconds.routine)
+ mti->microseconds.resolution=
+ my_timer_init_resolution(&my_timer_microseconds, 20);
+ if (mti->milliseconds.routine)
+ {
+ if (mti->milliseconds.routine == MY_TIMER_ROUTINE_TIME)
+ mti->milliseconds.resolution= 1000;
+ else
+ mti->milliseconds.resolution=
+ my_timer_init_resolution(&my_timer_milliseconds, 0);
+ }
+ if (mti->ticks.routine)
+ mti->ticks.resolution= 1;
+
+/*
+ Calculate cycles frequency,
+ if we have both a cycles routine and a microseconds routine.
+ In tests, this usually results in a figure within 2% of
+ what "cat /proc/cpuinfo" says.
+ If the microseconds routine is QueryPerformanceCounter
+ (i.e. it's Windows), and the microseconds frequency is >
+ 500,000,000 (i.e. it's Windows Server so it uses RDTSC)
+ and the microseconds resolution is > 100 (i.e. dreadful),
+ then calculate cycles frequency = microseconds frequency.
+*/
+ if (mti->cycles.routine
+ && mti->microseconds.routine)
+ {
+ if (mti->microseconds.routine ==
+ MY_TIMER_ROUTINE_QUERYPERFORMANCECOUNTER
+ && mti->microseconds.frequency > 500000000
+ && mti->microseconds.resolution > 100)
+ mti->cycles.frequency= mti->microseconds.frequency;
+ else
+ {
+ ulonglong time1, time2;
+ time1= my_timer_init_frequency(mti);
+ /* Repeat once in case there was an interruption. */
+ time2= my_timer_init_frequency(mti);
+ if (time1 < time2) mti->cycles.frequency= time1;
+ else mti->cycles.frequency= time2;
+ }
+ }
+
+/*
+ Calculate milliseconds frequency =
+ (cycles-frequency/#-of-cycles) * #-of-milliseconds,
+ if we have both a milliseconds routine and a cycles
+ routine.
+ This will be inaccurate if milliseconds resolution > 1.
+ This is probably only useful when testing new platforms.
+*/
+ if (mti->milliseconds.routine
+ && mti->milliseconds.resolution < 1000
+ && mti->microseconds.routine
+ && mti->cycles.routine)
+ {
+ int i;
+ ulonglong time1, time2, time3, time4;
+ time1= my_timer_cycles();
+ time2= my_timer_milliseconds();
+ time3= time2; /* Avoids a Microsoft/IBM compiler warning */
+ for (i= 0; i < MY_TIMER_ITERATIONS * 1000; ++i)
+ {
+ time3= my_timer_milliseconds();
+ if (time3 - time2 > 10) break;
+ }
+ time4= my_timer_cycles();
+ mti->milliseconds.frequency=
+ (mti->cycles.frequency * (time3 - time2)) / (time4 - time1);
+ }
+
+/*
+ Calculate ticks.frequency =
+ (cycles-frequency/#-of-cycles * #-of-ticks,
+ if we have both a ticks routine and a cycles
+ routine,
+ This is probably only useful when testing new platforms.
+*/
+ if (mti->ticks.routine
+ && mti->microseconds.routine
+ && mti->cycles.routine)
+ {
+ int i;
+ ulonglong time1, time2, time3, time4;
+ time1= my_timer_cycles();
+ time2= my_timer_ticks();
+ time3= time2; /* Avoids a Microsoft/IBM compiler warning */
+ for (i= 0; i < MY_TIMER_ITERATIONS * 1000; ++i)
+ {
+ time3= my_timer_ticks();
+ if (time3 - time2 > 10) break;
+ }
+ time4= my_timer_cycles();
+ mti->ticks.frequency=
+ (mti->cycles.frequency * (time3 - time2)) / (time4 - time1);
+ }
+}
+
+/*
+ Additional Comments
+ -------------------
+
+ This is for timing, i.e. finding out how long a piece of code
+ takes. If you want time of day matching a wall clock, the
+ my_timer_xxx functions won't help you.
+
+ The best timer is the one with highest frequency, lowest
+ overhead, and resolution=1. The my_timer_info() routine will tell
+ you at runtime which timer that is. Usually it will be
+ my_timer_cycles() but be aware that, although it's best,
+ it has possible flaws and dangers. Depending on platform:
+ - The frequency might change. We don't test for this. It
+ happens on laptops for power saving, and on blade servers
+ for avoiding overheating.
+ - The overhead that my_timer_init() returns is the minimum.
+ In fact it could be slightly greater because of caching or
+ because you call the routine by address, as recommended.
+ It could be hugely greater if there's an interrupt.
+ - The x86 cycle counter, RDTSC doesn't "serialize". That is,
+ if there is out-of-order execution, rdtsc might be processed
+ after an instruction that logically follows it.
+ (We could force serialization, but that would be slower.)
+ - It is possible to set a flag which renders RDTSC
+ inoperative. Somebody responsible for the kernel
+ of the operating system would have to make this
+ decision. For the platforms we've tested with, there's
+ no such problem.
+ - With a multi-processor arrangement, it's possible
+ to get the cycle count from one processor in
+ thread X, and the cycle count from another processor
+ in thread Y. They may not always be in synch.
+ - You can't depend on a cycle counter being available for
+ all platforms. On Alphas, the
+ cycle counter is only 32-bit, so it would overflow quickly,
+ so we don't bother with it. On platforms that we haven't
+ tested, there might be some if/endif combination that we
+ didn't expect, or some assembler routine that we didn't
+ supply.
+
+ The recommended way to use the timer routines is:
+ 1. Somewhere near the beginning of the program, call
+ my_timer_init(). This should only be necessary once,
+ although you can call it again if you think that the
+ frequency has changed.
+ 2. Determine the best timer based on frequency, resolution,
+ overhead -- all things that my_timer_init() returns.
+ Preserve the address of the timer and the my_timer_into
+ results in an easily-accessible place.
+ 3. Instrument the code section that you're monitoring, thus:
+ time1= my_timer_xxx();
+ Instrumented code;
+ time2= my_timer_xxx();
+ elapsed_time= (time2 - time1) - overhead;
+ If the timer is always on, then overhead is always there,
+ so don't subtract it.
+ 4. Save the elapsed time, or add it to a totaller.
+ 5. When all timing processes are complete, transfer the
+ saved / totalled elapsed time to permanent storage.
+ Optionally you can convert cycles to microseconds at
+ this point. (Don't do so every time you calculate
+ elapsed_time! That would waste time and lose precision!)
+ For converting cycles to microseconds, use the frequency
+ that my_timer_init() returns. You'll also need to convert
+ if the my_timer_microseconds() function is the Windows
+ function QueryPerformanceCounter(), since that's sometimes
+ a counter with precision slightly better than microseconds.
+
+ Since we recommend calls by function pointer, we supply
+ no inline functions.
+
+ Some comments on the many candidate routines for timing ...
+
+ clock() -- We don't use because it would overflow frequently.
+
+ clock_gettime() -- Often we don't use this even when it exists.
+ In configure.in, we use AC_CHECK_FUNCS(clock_gettime). Not
+ AC_CHECK_LIB(rc,clock_gettime)
+ AC_CHECK_FUNCS(clock_gettime)
+ If we had the above lines in configure.in, we'd have to use
+ /usr/lib/librt.so or /usr/lib64/librt.so when linking, and
+ the size of librt.so is 40KB. In tests, clock_gettime often
+ had resolution = 1000.
+
+ ftime() -- A "man ftime" says: "This function is obsolete.
+ Don't use it." On every platform that we tested, if ftime()
+ was available, then so was gettimeofday(), and gettimeofday()
+ overhead was always at least as good as ftime() overhead.
+
+ gettimeofday() -- available on most platforms, though not
+ on Windows. There is a hardware timer (sometimes a Programmable
+ Interrupt Timer or "PIT") (sometimes a "HPET") used for
+ interrupt generation. When it interrupts (a "tick" or "jiffy",
+ typically 1 centisecond) it sets xtime. For gettimeofday, a
+ Linux kernel routine usually gets xtime and then gets rdtsc
+ to get elapsed nanoseconds since the last tick. On Red Hat
+ Enterprise Linux 3, there was once a bug which caused the
+ resolution to be 1000, i.e. one centisecond. We never check
+ for time-zone change.
+
+ getnstimeofday() -- something to watch for in future Linux
+
+ do_gettimeofday() -- exists on Linux but not for "userland"
+
+ get_cycles() -- a multi-platform function, worth watching
+ in future Linux versions. But we found platform-specific
+ functions which were better documented in operating-system
+ manuals. And get_cycles() can fail or return a useless
+ 32-bit number. It might be available on some platforms,
+ such as arm, which we didn't test. Using
+ "include <linux/timex.h>" or "include <asm/timex.h>"
+ can lead to autoconf or compile errors, depending on system.
+
+ rdtsc, __rdtsc, rdtscll: available for x86 with Linux BSD,
+ Solaris, Windows. See "possible flaws and dangers" comments.
+
+ times(): what we use for ticks. Should just read the last
+ (xtime) tick count, therefore should be fast, but usually
+ isn't.
+
+ GetTickCount(): we use this for my_timer_ticks() on
+ Windows. Actually it really is a tick counter, so resolution
+ >= 10 milliseconds unless you have a very old Windows version.
+ With Windows 95 or 98 or ME, timeGetTime() has better resolution than
+ GetTickCount (1ms rather than 55ms). But with Windows NT or XP or 2000,
+ they're both getting from a variable in the Process Environment Block
+ (PEB), and the variable is set by the programmable interrupt timer, so
+ the resolution is the same (usually 10-15 milliseconds). Also timeGetTime
+ is slower on old machines:
+ http://www.doumo.jp/aon-java/jsp/postgretips/tips.jsp?tips=74.
+ Also timeGetTime requires linking winmm.lib,
+ Therefore we use GetTickCount.
+ It will overflow every 49 days because the return is 32-bit.
+ There is also a GetTickCount64 but it requires Vista or Windows Server 2008.
+ (As for GetSystemTimeAsFileTime, its precision is spurious, it
+ just reads the tick variable like the other functions do.
+ However, we don't expect it to overflow every 49 days, so we
+ will prefer it for my_timer_milliseconds().)
+
+ QueryPerformanceCounter() we use this for my_timer_microseconds()
+ on Windows. 1-PIT-tick (often 1/3-microsecond). Usually reads
+ the PIT so it's slow. On some Windows variants, uses RDTSC.
+
+ GetLocalTime() this is available on Windows but we don't use it.
+
+ getclock(): documented for Alpha, but not found during tests.
+
+ mach_absolute_time() and UpTime() are recommended for Apple.
+ Inititally they weren't tried, because asm_ppc seems to do the job.
+ But now we use mach_absolute_time for nanoseconds.
+
+ Any clock-based timer can be affected by NPT (ntpd program),
+ which means:
+ - full-second correction can occur for leap second
+ - tiny corrections can occcur approimately every 11 minutes
+ (but I think they only affect the RTC which isn't the PIT).
+
+ We define "precision" as "frequency" and "high precision" is
+ "frequency better than 1 microsecond". We define "resolution"
+ as a synonym for "granularity". We define "accuracy" as
+ "closeness to the truth" as established by some authoritative
+ clock, but we can't measure accuracy.
+
+ Do not expect any of our timers to be monotonic; we
+ won't guarantee that they return constantly-increasing
+ unique numbers.
+
+ We tested with AIX, Solaris (x86 + Sparc), Linux (x86 +
+ Itanium), Windows, 64-bit Windows, QNX, FreeBSD, HPUX,
+ Irix, Mac. We didn't test with NetWare or SCO.
+
+*/
+