diff options
author | Marc Alff <marc.alff@sun.com> | 2009-11-24 16:36:31 -0700 |
---|---|---|
committer | Marc Alff <marc.alff@sun.com> | 2009-11-24 16:36:31 -0700 |
commit | 6e01be7552aa5c5fb854d30d5d8cb61c2729b292 (patch) | |
tree | 06a753b068f7fb21faaa840cf0808981acaaabb2 /mysys/my_rdtsc.c | |
parent | bd6467805a4d232718589f8ba0f2b84493843aba (diff) | |
download | mariadb-git-6e01be7552aa5c5fb854d30d5d8cb61c2729b292.tar.gz |
WL#2373 Use cycle counter for timing
Diffstat (limited to 'mysys/my_rdtsc.c')
-rw-r--r-- | mysys/my_rdtsc.c | 1004 |
1 files changed, 1004 insertions, 0 deletions
diff --git a/mysys/my_rdtsc.c b/mysys/my_rdtsc.c new file mode 100644 index 00000000000..4227498b92c --- /dev/null +++ b/mysys/my_rdtsc.c @@ -0,0 +1,1004 @@ +/* Copyright (C) 2008, 2009 Sun Microsystems, Inc + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + rdtsc3 -- multi-platform timer code + pgulutzan@mysql.com, 2005-08-29 + modified 2008-11-02 + + Functions: + + my_timer_cycles ulonglong cycles + my_timer_nanoseconds ulonglong nanoseconds + my_timer_microseconds ulonglong "microseconds" + my_timer_milliseconds ulonglong milliseconds + my_timer_ticks ulonglong ticks + my_timer_init initialization / test + + We'll call the first 5 functions (the ones that return + a ulonglong) "my_timer_xxx" functions. + Each my_timer_xxx function returns a 64-bit timing value + since an arbitrary 'epoch' start. Since the only purpose + is to determine elapsed times, wall-clock time-of-day + is not known and not relevant. + + The my_timer_init function is necessary for initializing. + It returns information (underlying routine name, + frequency, resolution, overhead) about all my_timer_xxx + functions. A program should call my_timer_init once, + use the information to decide what my_timer_xxx function + to use, and subsequently call that function by function + pointer. + + A typical use would be: + my_timer_init() ... once, at program start + ... + time1= my_timer_xxx() ... time before start + [code that's timed] + time2= my_timer_xxx() ... time after end + elapsed_time= (time2 - time1) - overhead +*/ + +#include "my_global.h" +#include "my_rdtsc.h" + +#if defined(_WIN32) +#include <stdio.h> +#include "windows.h" +#else +#include <stdio.h> +#endif + +#if !defined(_WIN32) +#if TIME_WITH_SYS_TIME +#include <sys/time.h> +#include <time.h> /* for clock_gettime */ +#else +#if HAVE_SYS_TIME_H +#include <sys/time.h> +#elif defined(HAVE_TIME_H) +#include <time.h> +#endif +#endif +#endif + +#if defined(HAVE_ASM_MSR_H) && defined(HAVE_RDTSCLL) +#include <asm/msr.h> /* for rdtscll */ +#endif + +#if defined(HAVE_SYS_TIMEB_H) && defined(HAVE_FTIME) +#include <sys/timeb.h> /* for ftime */ +#endif + +#if defined(HAVE_SYS_TIMES_H) && defined(HAVE_TIMES) +#include <sys/times.h> /* for times */ +#endif + +#if defined(__NETWARE__) +#include <nks/time.h> /* for NXGetTime */ +#endif + +#if defined(__INTEL_COMPILER) && defined(__ia64__) && defined(HAVE_IA64INTRIN_H) +#include <ia64intrin.h> /* for __GetReg */ +#endif + +#if defined(__APPLE__) && defined(__MACH__) +#include <mach/mach_time.h> +#endif + +#if defined(__SUNPRO_CC) && defined(__sparcv9) && defined(_LP64) && !defined(__SunOS_5_7) +extern "C" ulonglong my_timer_cycles_il_sparc64(); +#elif defined(__SUNPRO_CC) && defined(__sparcv8plus) && defined(_ILP32) && !defined(__SunOS_5_7) +extern "C" ulonglong my_timer_cycles_il_sparc32(); +#elif defined(__SUNPRO_CC) && defined(__i386) && defined(_ILP32) +extern "C" ulonglong my_timer_cycles_il_i386(); +#elif defined(__SUNPRO_CC) && defined(__x86_64) && defined(_LP64) +extern "C" ulonglong my_timer_cycles_il_x86_64(); +#elif defined(__SUNPRO_C) && defined(__sparcv9) && defined(_LP64) && !defined(__SunOS_5_7) +ulonglong my_timer_cycles_il_sparc64(); +#elif defined(__SUNPRO_C) && defined(__sparcv8plus) && defined(_ILP32) && !defined(__SunOS_5_7) +ulonglong my_timer_cycles_il_sparc32(); +#elif defined(__SUNPRO_C) && defined(__i386) && defined(_ILP32) +ulonglong my_timer_cycles_il_i386(); +#elif defined(__SUNPRO_C) && defined(__x86_64) && defined(_LP64) +ulonglong my_timer_cycles_il_x86_64(); +#endif + +#if defined(__INTEL_COMPILER) +/* + icc warning #1011 is: + missing return statement at end of non-void function +*/ +#pragma warning (disable:1011) +#endif + +/* + For cycles, we depend on RDTSC for x86 platforms, + or on time buffer (which is not really a cycle count + but a separate counter with less than nanosecond + resolution) for most PowerPC platforms, or on + gethrtime which is okay for hpux and solaris, or on + clock_gettime(CLOCK_SGI_CYCLE) for Irix platforms, + or on read_real_time for aix platforms. There is + nothing for Alpha platforms, they would be tricky. +*/ + +ulonglong my_timer_cycles(void) +{ +#if defined(__GNUC__) && defined(__i386__) + /* This works much better if compiled with "gcc -O3". */ + ulonglong result; + __asm__ __volatile__ ("rdtsc" : "=A" (result)); + return result; +#elif defined(__SUNPRO_C) && defined(__i386) + __asm("rdtsc"); +#elif defined(__GNUC__) && defined(__x86_64__) + ulonglong result; + __asm__ __volatile__ ("rdtsc\n\t" \ + "shlq $32,%%rdx\n\t" \ + "orq %%rdx,%%rax" + : "=a" (result) :: "%edx"); + return result; +#elif defined(HAVE_ASM_MSR_H) && defined(HAVE_RDTSCLL) + { + ulonglong result; + rdtscll(result); + return result; + } +#elif defined(_WIN32) && defined(_M_IX86) + __asm {rdtsc}; +#elif defined(_WIN64) && defined(_M_X64) + /* For 64-bit Windows: unsigned __int64 __rdtsc(); */ + return __rdtsc(); +#elif defined(__INTEL_COMPILER) && defined(__ia64__) && defined(HAVE_IA64INTRIN_H) + return (ulonglong) __getReg(_IA64_REG_AR_ITC); /* (3116) */ +#elif defined(__GNUC__) && defined(__ia64__) + { + ulonglong result; + __asm __volatile__ ("mov %0=ar.itc" : "=r" (result)); + return result; + } +#elif defined(__GNUC__) && (defined(__powerpc__) || defined(__POWERPC__) || (defined(_POWER) && defined(_AIX52))) && (defined(__64BIT__) || defined(_ARCH_PPC64)) + { + ulonglong result; + __asm __volatile__ ("mftb %0" : "=r" (result)); + return result; + } +#elif defined(__GNUC__) && (defined(__powerpc__) || defined(__POWERPC__) || (defined(_POWER) && defined(_AIX52))) && (!defined(__64BIT__) && !defined(_ARCH_PPC64)) + { + /* + mftbu means "move from time-buffer-upper to result". + The loop is saying: x1=upper, x2=lower, x3=upper, + if x1!=x3 there was an overflow so repeat. + */ + unsigned int x1, x2, x3; + ulonglong result; + for (;;) + { + __asm __volatile__ ( "mftbu %0" : "=r"(x1) ); + __asm __volatile__ ( "mftb %0" : "=r"(x2) ); + __asm __volatile__ ( "mftbu %0" : "=r"(x3) ); + if (x1 == x3) break; + } + result = x1; + return ( result << 32 ) | x2; + } +#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__sparcv9) && defined(_LP64) && !defined(__SunOS_5_7) + return (my_timer_cycles_il_sparc64()); +#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__sparcv8plus) && defined(_ILP32) && !defined(__SunOS_5_7) + return (my_timer_cycles_il_sparc32()); +#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__i386) && defined(_ILP32) + /* This is probably redundant for __SUNPRO_C. */ + return (my_timer_cycles_il_i386()); +#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__x86_64) && defined(_LP64) + return (my_timer_cycles_il_x86_64()); +#elif defined(__GNUC__) && defined(__sparcv9) && defined(_LP64) && (__GNUC__>2) + { + ulonglong result; + __asm __volatile__ ("rd %%tick,%0" : "=r" (result)); + return result; + } +#elif defined(__GNUC__) && defined(__sparc__) && !defined(_LP64) && (__GNUC__>2) + { + union { + ulonglong wholeresult; + struct { + ulong high; + ulong low; + } splitresult; + } result; + __asm __volatile__ ("rd %%tick,%1; srlx %1,32,%0" : "=r" (result.splitresult.high), "=r" (result.splitresult.low)); + return result.wholeresult; + } +#elif defined(__sgi) && defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) + { + struct timespec tp; + clock_gettime(CLOCK_SGI_CYCLE, &tp); + return (ulonglong) tp.tv_sec * 1000000000 + (ulonglong) tp.tv_nsec; + } +#elif defined(HAVE_SYS_TIMES_H) && defined(HAVE_GETHRTIME) + /* gethrtime may appear as either cycle or nanosecond counter */ + return (ulonglong) gethrtime(); +#else + return 0; +#endif +} + +#if defined(__INTEL_COMPILER) +/* re-enable warning#1011 which was only for my_timer_cycles() */ +/* There may be an icc bug which means we must leave disabled. */ +#pragma warning (default:1011) +#endif + +/* + For nanoseconds, most platforms have nothing available that + (a) doesn't require bringing in a 40-kb librt.so library + (b) really has nanosecond resolution. +*/ + +ulonglong my_timer_nanoseconds(void) +{ +#if defined(HAVE_READ_REAL_TIME) + { + timebasestruct_t tr; + read_real_time(&tr, TIMEBASE_SZ); + return (ulonglong) tr.tb_high * 1000000000 + (ulonglong) tr.tb_low; + } +#elif defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_REALTIME) + { + struct timespec tp; + clock_gettime(CLOCK_REALTIME, &tp); + return (ulonglong) tp.tv_sec * 1000000000 + (ulonglong) tp.tv_nsec; + } +#elif defined(HAVE_SYS_TIMES_H) && defined(HAVE_GETHRTIME) + /* SunOS 5.10+, Solaris, HP-UX: hrtime_t gethrtime(void) */ + return (ulonglong) gethrtime(); +#elif defined(__NETWARE__) + { + NXTime_t tm; + NXGetTime(NX_SINCE_1970, NX_NSECONDS, &tm); + return (ulonglong) tm; + } +#elif defined(__APPLE__) && defined(__MACH__) + { + ulonglong tm; + static mach_timebase_info_data_t timebase_info= {0,0}; + if (timebase_info.denom == 0) + (void) mach_timebase_info(&timebase_info); + tm= mach_absolute_time(); + return (tm * timebase_info.numer) / timebase_info.denom; + } +#else + return 0; +#endif +} + +/* + For microseconds, gettimeofday() is available on + almost all platforms. On Windows we use + QueryPerformanceCounter which will usually tick over + 3.5 million times per second, and we don't throw + away the extra precision. (On Windows Server 2003 + the frequency is same as the cycle frequency.) +*/ + +ulonglong my_timer_microseconds(void) +{ +#if defined(HAVE_GETTIMEOFDAY) + { + static ulonglong last_value= 0; + struct timeval tv; + if (gettimeofday(&tv, NULL) == 0) + last_value= (ulonglong) tv.tv_sec * 1000000 + (ulonglong) tv.tv_usec; + else + { + /* + There are reports that gettimeofday(2) can have intermittent failures + on some platform, see for example Bug#36819. + We are not trying again or looping, just returning the best value possible + under the circumstances ... + */ + last_value++; + } + return last_value; + } +#elif defined(_WIN32) + { + /* QueryPerformanceCounter usually works with about 1/3 microsecond. */ + LARGE_INTEGER t_cnt; + + QueryPerformanceCounter(&t_cnt); + return (ulonglong) t_cnt.QuadPart; + } +#elif defined(__NETWARE__) + { + NXTime_t tm; + NXGetTime(NX_SINCE_1970, NX_USECONDS, &tm); + return (ulonglong) tm; + } +#else + return 0; +#endif +} + +/* + For milliseconds, we use ftime() if it's supported + or time()*1000 if it's not. With modern versions of + Windows and with HP Itanium, resolution is 10-15 + milliseconds. +*/ + +ulonglong my_timer_milliseconds(void) +{ +#if defined(HAVE_SYS_TIMEB_H) && defined(HAVE_FTIME) + /* ftime() is obsolete but maybe the platform is old */ + struct timeb ft; + ftime(&ft); + return (ulonglong)ft.time * 1000 + (ulonglong)ft.millitm; +#elif defined(HAVE_TIME) + return (ulonglong) time(NULL) * 1000; +#elif defined(_WIN32) + FILETIME ft; + GetSystemTimeAsFileTime( &ft ); + return ((ulonglong)ft.dwLowDateTime + + (((ulonglong)ft.dwHighDateTime) << 32))/10000; +#elif defined(__NETWARE__) + { + NXTime_t tm; + NXGetTime(NX_SINCE_1970, NX_MSECONDS, &tm); + return (ulonglong)tm; + } +#else + return 0; +#endif +} + +/* + For ticks, which we handle with times(), the frequency + is usually 100/second and the overhead is surprisingly + bad, sometimes even worse than gettimeofday's overhead. +*/ + +ulonglong my_timer_ticks(void) +{ +#if defined(HAVE_SYS_TIMES_H) && defined(HAVE_TIMES) + { + struct tms times_buf; + return (ulonglong) times(×_buf); + } +#elif defined(__NETWARE__) + { + NXTime_t tm; + NXGetTime(NX_SINCE_BOOT, NX_TICKS, &tm); + return (ulonglong) tm; + } +#elif defined(_WIN32) + return (ulonglong) GetTickCount(); +#else + return 0; +#endif +} + +/* + The my_timer_init() function and its sub-functions + have several loops which call timers. If there's + something wrong with a timer -- which has never + happened in tests -- we want the loop to end after + an arbitrary number of iterations, and my_timer_info + will show a discouraging result. The arbitrary + number is 1,000,000. +*/ +#define MY_TIMER_ITERATIONS 1000000 + +/* + Calculate overhead. Called from my_timer_init(). + Usually best_timer_overhead = cycles.overhead or + nanoseconds.overhead, so returned amount is in + cycles or nanoseconds. We repeat the calculation + ten times, so that we can disregard effects of + caching or interrupts. Result is quite consistent + for cycles, at least. But remember it's a minimum. +*/ + +static void my_timer_init_overhead(ulonglong *overhead, + ulonglong (*cycle_timer)(void), + ulonglong (*this_timer)(void), + ulonglong best_timer_overhead) +{ + ulonglong time1, time2; + int i; + + /* *overhead, least of 20 calculations - cycles.overhead */ + for (i= 0, *overhead= 1000000000; i < 20; ++i) + { + time1= cycle_timer(); + this_timer(); /* rather than 'time_tmp= timer();' */ + time2= cycle_timer() - time1; + if (*overhead > time2) + *overhead= time2; + } + *overhead-= best_timer_overhead; +} + +/* + Calculate Resolution. Called from my_timer_init(). + If a timer goes up by jumps, e.g. 1050, 1075, 1100, ... + then the best resolution is the minimum jump, e.g. 25. + If it's always divisible by 1000 then it's just a + result of multiplication of a lower-precision timer + result, e.g. nanoseconds are often microseconds * 1000. + If the minimum jump is less than an arbitrary passed + figure (a guess based on maximum overhead * 2), ignore. + Usually we end up with nanoseconds = 1 because it's too + hard to detect anything <= 100 nanoseconds. + Often GetTickCount() has resolution = 15. + We don't check with ticks because they take too long. +*/ +static ulonglong my_timer_init_resolution(ulonglong (*this_timer)(void), + ulonglong overhead_times_2) +{ + ulonglong time1, time2; + ulonglong best_jump; + int i, jumps, divisible_by_1000, divisible_by_1000000; + + divisible_by_1000= divisible_by_1000000= 0; + best_jump= 1000000; + for (i= jumps= 0; jumps < 3 && i < MY_TIMER_ITERATIONS * 10; ++i) + { + time1= this_timer(); + time2= this_timer(); + time2-= time1; + if (time2) + { + ++jumps; + if (!(time2 % 1000)) + { + ++divisible_by_1000; + if (!(time2 % 1000000)) + ++divisible_by_1000000; + } + if (best_jump > time2) + best_jump= time2; + /* For milliseconds, one jump is enough. */ + if (overhead_times_2 == 0) + break; + } + } + if (jumps == 3) + { + if (jumps == divisible_by_1000000) + return 1000000; + if (jumps == divisible_by_1000) + return 1000; + } + if (best_jump > overhead_times_2) + return best_jump; + return 1; +} + +/* + Calculate cycle frequency by seeing how many cycles pass + in a 200-microsecond period. I tried with 10-microsecond + periods originally, and the result was often very wrong. +*/ + +static ulonglong my_timer_init_frequency(MY_TIMER_INFO *mti) +{ + int i; + ulonglong time1, time2, time3, time4; + time1= my_timer_cycles(); + time2= my_timer_microseconds(); + time3= time2; /* Avoids a Microsoft/IBM compiler warning */ + for (i= 0; i < MY_TIMER_ITERATIONS; ++i) + { + time3= my_timer_microseconds(); + if (time3 - time2 > 200) break; + } + time4= my_timer_cycles() - mti->cycles.overhead; + time4-= mti->microseconds.overhead; + return (mti->microseconds.frequency * (time4 - time1)) / (time3 - time2); +} + +/* + Call my_timer_init before the first call to my_timer_xxx(). + If something must be initialized, it happens here. + Set: what routine is being used e.g. "asm_x86" + Set: function, overhead, actual frequency, resolution. +*/ + +void my_timer_init(MY_TIMER_INFO *mti) +{ + ulonglong (*best_timer)(void); + ulonglong best_timer_overhead; + ulonglong time1, time2; + int i; + + /* cycles */ + mti->cycles.frequency= 1000000000; +#if defined(__GNUC__) && defined(__i386__) + mti->cycles.routine= MY_TIMER_ROUTINE_ASM_X86; +#elif defined(__SUNPRO_C) && defined(__i386) + mti->cycles.routine= MY_TIMER_ROUTINE_ASM_X86; +#elif defined(__GNUC__) && defined(__x86_64__) + mti->cycles.routine= MY_TIMER_ROUTINE_ASM_X86_64; +#elif defined(HAVE_ASM_MSR_H) && defined(HAVE_RDTSCLL) + mti->cycles.routine= MY_TIMER_ROUTINE_RDTSCLL; +#elif defined(_WIN32) && defined(_M_IX86) + mti->cycles.routine= MY_TIMER_ROUTINE_ASM_X86_WIN; +#elif defined(_WIN64) && defined(_M_X64) + mti->cycles.routine= MY_TIMER_ROUTINE_RDTSC; +#elif defined(__INTEL_COMPILER) && defined(__ia64__) && defined(HAVE_IA64INTRIN_H) + mti->cycles.routine= MY_TIMER_ROUTINE_ASM_IA64; +#elif defined(__GNUC__) && defined(__ia64__) + mti->cycles.routine= MY_TIMER_ROUTINE_ASM_IA64; +#elif defined(__GNUC__) && (defined(__powerpc__) || defined(__POWERPC__) || (defined(_POWER) && defined(_AIX52))) && (defined(__64BIT__) || defined(_ARCH_PPC64)) + mti->cycles.routine= MY_TIMER_ROUTINE_ASM_PPC64; +#elif defined(__GNUC__) && (defined(__powerpc__) || defined(__POWERPC__) || (defined(_POWER) && defined(_AIX52))) && (!defined(__64BIT__) && !defined(_ARCH_PPC64)) + mti->cycles.routine= MY_TIMER_ROUTINE_ASM_PPC; +#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__sparcv9) && defined(_LP64) && !defined(__SunOS_5_7) + mti->cycles.routine= MY_TIMER_ROUTINE_ASM_SUNPRO_SPARC64; +#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__sparcv8plus) && defined(_ILP32) && !defined(__SunOS_5_7) + mti->cycles.routine= MY_TIMER_ROUTINE_ASM_SUNPRO_SPARC32; +#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__i386) && defined(_ILP32) + mti->cycles.routine= MY_TIMER_ROUTINE_ASM_SUNPRO_I386; +#elif (defined(__SUNPRO_CC) || defined(__SUNPRO_C)) && defined(__x86_64) && defined(_LP64) + mti->cycles.routine= MY_TIMER_ROUTINE_ASM_SUNPRO_X86_64; +#elif defined(__GNUC__) && defined(__sparcv9) && defined(_LP64) && (__GNUC__>2) + mti->cycles.routine= MY_TIMER_ROUTINE_ASM_GCC_SPARC64; +#elif defined(__GNUC__) && defined(__sparc__) && !defined(_LP64) && (__GNUC__>2) + mti->cycles.routine= MY_TIMER_ROUTINE_ASM_GCC_SPARC32; +#elif defined(__sgi) && defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) + mti->cycles.routine= MY_TIMER_ROUTINE_SGI_CYCLE; +#elif defined(HAVE_SYS_TIMES_H) && defined(HAVE_GETHRTIME) + mti->cycles.routine= MY_TIMER_ROUTINE_GETHRTIME; +#else + mti->cycles.routine= 0; +#endif + + if (!mti->cycles.routine || !my_timer_cycles()) + { + mti->cycles.routine= 0; + mti->cycles.resolution= 0; + mti->cycles.frequency= 0; + mti->cycles.overhead= 0; + } + + /* nanoseconds */ + mti->nanoseconds.frequency= 1000000000; /* initial assumption */ +#if defined(HAVE_READ_REAL_TIME) + mti->nanoseconds.routine= MY_TIMER_ROUTINE_READ_REAL_TIME; +#elif defined(HAVE_CLOCK_GETTIME) + mti->nanoseconds.routine= MY_TIMER_ROUTINE_CLOCK_GETTIME; +#elif defined(HAVE_SYS_TIMES_H) && defined(HAVE_GETHRTIME) + mti->nanoseconds.routine= MY_TIMER_ROUTINE_GETHRTIME; +#elif defined(__NETWARE__) + mti->nanoseconds.routine= MY_TIMER_ROUTINE_NXGETTIME; +#elif defined(__APPLE__) && defined(__MACH__) + mti->nanoseconds.routine= MY_TIMER_ROUTINE_MACH_ABSOLUTE_TIME; +#else + mti->nanoseconds.routine= 0; +#endif + if (!mti->nanoseconds.routine || !my_timer_nanoseconds()) + { + mti->nanoseconds.routine= 0; + mti->nanoseconds.resolution= 0; + mti->nanoseconds.frequency= 0; + mti->nanoseconds.overhead= 0; + } + + /* microseconds */ + mti->microseconds.frequency= 1000000; /* initial assumption */ +#if defined(HAVE_GETTIMEOFDAY) + mti->microseconds.routine= MY_TIMER_ROUTINE_GETTIMEOFDAY; +#elif defined(_WIN32) + { + LARGE_INTEGER li; + /* Windows: typical frequency = 3579545, actually 1/3 microsecond. */ + if (!QueryPerformanceFrequency(&li)) + mti->microseconds.routine= 0; + else + { + mti->microseconds.frequency= li.QuadPart; + mti->microseconds.routine= MY_TIMER_ROUTINE_QUERYPERFORMANCECOUNTER; + } + } +#elif defined(__NETWARE__) + mti->microseconds.routine= MY_TIMER_ROUTINE_NXGETTIME; +#else + mti->microseconds.routine= 0; +#endif + if (!mti->microseconds.routine || !my_timer_microseconds()) + { + mti->microseconds.routine= 0; + mti->microseconds.resolution= 0; + mti->microseconds.frequency= 0; + mti->microseconds.overhead= 0; + } + + /* milliseconds */ + mti->milliseconds.frequency= 1000; /* initial assumption */ +#if defined(HAVE_SYS_TIMEB_H) && defined(HAVE_FTIME) + mti->milliseconds.routine= MY_TIMER_ROUTINE_FTIME; +#elif defined(_WIN32) + mti->milliseconds.routine= MY_TIMER_ROUTINE_GETSYSTEMTIMEASFILETIME; +#elif defined(__NETWARE__) + mti->milliseconds.routine= MY_TIMER_ROUTINE_NXGETTIME; +#elif defined(HAVE_TIME) + mti->milliseconds.routine= MY_TIMER_ROUTINE_TIME; +#else + mti->milliseconds.routine= 0; +#endif + if (!mti->milliseconds.routine || !my_timer_milliseconds()) + { + mti->milliseconds.routine= 0; + mti->milliseconds.resolution= 0; + mti->milliseconds.frequency= 0; + mti->milliseconds.overhead= 0; + } + + /* ticks */ + mti->ticks.frequency= 100; /* permanent assumption */ +#if defined(HAVE_SYS_TIMES_H) && defined(HAVE_TIMES) + mti->ticks.routine= MY_TIMER_ROUTINE_TIMES; +#elif defined(__NETWARE__) + mti->ticks.routine= MY_TIMER_ROUTINE_NXGETTIME; +#elif defined(_WIN32) + mti->ticks.routine= MY_TIMER_ROUTINE_GETTICKCOUNT; +#else + mti->ticks.routine= 0; +#endif + if (!mti->ticks.routine || !my_timer_ticks()) + { + mti->ticks.routine= 0; + mti->ticks.resolution= 0; + mti->ticks.frequency= 0; + mti->ticks.overhead= 0; + } + + /* + Calculate overhead in terms of the timer that + gives the best resolution: cycles or nanoseconds. + I doubt it ever will be as bad as microseconds. + */ + if (mti->cycles.routine) + best_timer= &my_timer_cycles; + else + { + if (mti->nanoseconds.routine) + { + best_timer= &my_timer_nanoseconds; + } + else + best_timer= &my_timer_microseconds; + } + + /* best_timer_overhead = least of 20 calculations */ + for (i= 0, best_timer_overhead= 1000000000; i < 20; ++i) + { + time1= best_timer(); + time2= best_timer() - time1; + if (best_timer_overhead > time2) + best_timer_overhead= time2; + } + if (mti->cycles.routine) + my_timer_init_overhead(&mti->cycles.overhead, + best_timer, + &my_timer_cycles, + best_timer_overhead); + if (mti->nanoseconds.routine) + my_timer_init_overhead(&mti->nanoseconds.overhead, + best_timer, + &my_timer_nanoseconds, + best_timer_overhead); + if (mti->microseconds.routine) + my_timer_init_overhead(&mti->microseconds.overhead, + best_timer, + &my_timer_microseconds, + best_timer_overhead); + if (mti->milliseconds.routine) + my_timer_init_overhead(&mti->milliseconds.overhead, + best_timer, + &my_timer_milliseconds, + best_timer_overhead); + if (mti->ticks.routine) + my_timer_init_overhead(&mti->ticks.overhead, + best_timer, + &my_timer_ticks, + best_timer_overhead); + +/* + Calculate resolution for nanoseconds or microseconds + or milliseconds, by seeing if it's always divisible + by 1000, and by noticing how much jumping occurs. + For ticks, just assume the resolution is 1. +*/ + if (mti->cycles.routine) + mti->cycles.resolution= 1; + if (mti->nanoseconds.routine) + mti->nanoseconds.resolution= + my_timer_init_resolution(&my_timer_nanoseconds, 20000); + if (mti->microseconds.routine) + mti->microseconds.resolution= + my_timer_init_resolution(&my_timer_microseconds, 20); + if (mti->milliseconds.routine) + { + if (mti->milliseconds.routine == MY_TIMER_ROUTINE_TIME) + mti->milliseconds.resolution= 1000; + else + mti->milliseconds.resolution= + my_timer_init_resolution(&my_timer_milliseconds, 0); + } + if (mti->ticks.routine) + mti->ticks.resolution= 1; + +/* + Calculate cycles frequency, + if we have both a cycles routine and a microseconds routine. + In tests, this usually results in a figure within 2% of + what "cat /proc/cpuinfo" says. + If the microseconds routine is QueryPerformanceCounter + (i.e. it's Windows), and the microseconds frequency is > + 500,000,000 (i.e. it's Windows Server so it uses RDTSC) + and the microseconds resolution is > 100 (i.e. dreadful), + then calculate cycles frequency = microseconds frequency. +*/ + if (mti->cycles.routine + && mti->microseconds.routine) + { + if (mti->microseconds.routine == + MY_TIMER_ROUTINE_QUERYPERFORMANCECOUNTER + && mti->microseconds.frequency > 500000000 + && mti->microseconds.resolution > 100) + mti->cycles.frequency= mti->microseconds.frequency; + else + { + ulonglong time1, time2; + time1= my_timer_init_frequency(mti); + /* Repeat once in case there was an interruption. */ + time2= my_timer_init_frequency(mti); + if (time1 < time2) mti->cycles.frequency= time1; + else mti->cycles.frequency= time2; + } + } + +/* + Calculate milliseconds frequency = + (cycles-frequency/#-of-cycles) * #-of-milliseconds, + if we have both a milliseconds routine and a cycles + routine. + This will be inaccurate if milliseconds resolution > 1. + This is probably only useful when testing new platforms. +*/ + if (mti->milliseconds.routine + && mti->milliseconds.resolution < 1000 + && mti->microseconds.routine + && mti->cycles.routine) + { + int i; + ulonglong time1, time2, time3, time4; + time1= my_timer_cycles(); + time2= my_timer_milliseconds(); + time3= time2; /* Avoids a Microsoft/IBM compiler warning */ + for (i= 0; i < MY_TIMER_ITERATIONS * 1000; ++i) + { + time3= my_timer_milliseconds(); + if (time3 - time2 > 10) break; + } + time4= my_timer_cycles(); + mti->milliseconds.frequency= + (mti->cycles.frequency * (time3 - time2)) / (time4 - time1); + } + +/* + Calculate ticks.frequency = + (cycles-frequency/#-of-cycles * #-of-ticks, + if we have both a ticks routine and a cycles + routine, + This is probably only useful when testing new platforms. +*/ + if (mti->ticks.routine + && mti->microseconds.routine + && mti->cycles.routine) + { + int i; + ulonglong time1, time2, time3, time4; + time1= my_timer_cycles(); + time2= my_timer_ticks(); + time3= time2; /* Avoids a Microsoft/IBM compiler warning */ + for (i= 0; i < MY_TIMER_ITERATIONS * 1000; ++i) + { + time3= my_timer_ticks(); + if (time3 - time2 > 10) break; + } + time4= my_timer_cycles(); + mti->ticks.frequency= + (mti->cycles.frequency * (time3 - time2)) / (time4 - time1); + } +} + +/* + Additional Comments + ------------------- + + This is for timing, i.e. finding out how long a piece of code + takes. If you want time of day matching a wall clock, the + my_timer_xxx functions won't help you. + + The best timer is the one with highest frequency, lowest + overhead, and resolution=1. The my_timer_info() routine will tell + you at runtime which timer that is. Usually it will be + my_timer_cycles() but be aware that, although it's best, + it has possible flaws and dangers. Depending on platform: + - The frequency might change. We don't test for this. It + happens on laptops for power saving, and on blade servers + for avoiding overheating. + - The overhead that my_timer_init() returns is the minimum. + In fact it could be slightly greater because of caching or + because you call the routine by address, as recommended. + It could be hugely greater if there's an interrupt. + - The x86 cycle counter, RDTSC doesn't "serialize". That is, + if there is out-of-order execution, rdtsc might be processed + after an instruction that logically follows it. + (We could force serialization, but that would be slower.) + - It is possible to set a flag which renders RDTSC + inoperative. Somebody responsible for the kernel + of the operating system would have to make this + decision. For the platforms we've tested with, there's + no such problem. + - With a multi-processor arrangement, it's possible + to get the cycle count from one processor in + thread X, and the cycle count from another processor + in thread Y. They may not always be in synch. + - You can't depend on a cycle counter being available for + all platforms. On Alphas, the + cycle counter is only 32-bit, so it would overflow quickly, + so we don't bother with it. On platforms that we haven't + tested, there might be some if/endif combination that we + didn't expect, or some assembler routine that we didn't + supply. + + The recommended way to use the timer routines is: + 1. Somewhere near the beginning of the program, call + my_timer_init(). This should only be necessary once, + although you can call it again if you think that the + frequency has changed. + 2. Determine the best timer based on frequency, resolution, + overhead -- all things that my_timer_init() returns. + Preserve the address of the timer and the my_timer_into + results in an easily-accessible place. + 3. Instrument the code section that you're monitoring, thus: + time1= my_timer_xxx(); + Instrumented code; + time2= my_timer_xxx(); + elapsed_time= (time2 - time1) - overhead; + If the timer is always on, then overhead is always there, + so don't subtract it. + 4. Save the elapsed time, or add it to a totaller. + 5. When all timing processes are complete, transfer the + saved / totalled elapsed time to permanent storage. + Optionally you can convert cycles to microseconds at + this point. (Don't do so every time you calculate + elapsed_time! That would waste time and lose precision!) + For converting cycles to microseconds, use the frequency + that my_timer_init() returns. You'll also need to convert + if the my_timer_microseconds() function is the Windows + function QueryPerformanceCounter(), since that's sometimes + a counter with precision slightly better than microseconds. + + Since we recommend calls by function pointer, we supply + no inline functions. + + Some comments on the many candidate routines for timing ... + + clock() -- We don't use because it would overflow frequently. + + clock_gettime() -- Often we don't use this even when it exists. + In configure.in, we use AC_CHECK_FUNCS(clock_gettime). Not + AC_CHECK_LIB(rc,clock_gettime) + AC_CHECK_FUNCS(clock_gettime) + If we had the above lines in configure.in, we'd have to use + /usr/lib/librt.so or /usr/lib64/librt.so when linking, and + the size of librt.so is 40KB. In tests, clock_gettime often + had resolution = 1000. + + ftime() -- A "man ftime" says: "This function is obsolete. + Don't use it." On every platform that we tested, if ftime() + was available, then so was gettimeofday(), and gettimeofday() + overhead was always at least as good as ftime() overhead. + + gettimeofday() -- available on most platforms, though not + on Windows. There is a hardware timer (sometimes a Programmable + Interrupt Timer or "PIT") (sometimes a "HPET") used for + interrupt generation. When it interrupts (a "tick" or "jiffy", + typically 1 centisecond) it sets xtime. For gettimeofday, a + Linux kernel routine usually gets xtime and then gets rdtsc + to get elapsed nanoseconds since the last tick. On Red Hat + Enterprise Linux 3, there was once a bug which caused the + resolution to be 1000, i.e. one centisecond. We never check + for time-zone change. + + getnstimeofday() -- something to watch for in future Linux + + do_gettimeofday() -- exists on Linux but not for "userland" + + get_cycles() -- a multi-platform function, worth watching + in future Linux versions. But we found platform-specific + functions which were better documented in operating-system + manuals. And get_cycles() can fail or return a useless + 32-bit number. It might be available on some platforms, + such as arm, which we didn't test. Using + "include <linux/timex.h>" or "include <asm/timex.h>" + can lead to autoconf or compile errors, depending on system. + + rdtsc, __rdtsc, rdtscll: available for x86 with Linux BSD, + Solaris, Windows. See "possible flaws and dangers" comments. + + times(): what we use for ticks. Should just read the last + (xtime) tick count, therefore should be fast, but usually + isn't. + + GetTickCount(): we use this for my_timer_ticks() on + Windows. Actually it really is a tick counter, so resolution + >= 10 milliseconds unless you have a very old Windows version. + With Windows 95 or 98 or ME, timeGetTime() has better resolution than + GetTickCount (1ms rather than 55ms). But with Windows NT or XP or 2000, + they're both getting from a variable in the Process Environment Block + (PEB), and the variable is set by the programmable interrupt timer, so + the resolution is the same (usually 10-15 milliseconds). Also timeGetTime + is slower on old machines: + http://www.doumo.jp/aon-java/jsp/postgretips/tips.jsp?tips=74. + Also timeGetTime requires linking winmm.lib, + Therefore we use GetTickCount. + It will overflow every 49 days because the return is 32-bit. + There is also a GetTickCount64 but it requires Vista or Windows Server 2008. + (As for GetSystemTimeAsFileTime, its precision is spurious, it + just reads the tick variable like the other functions do. + However, we don't expect it to overflow every 49 days, so we + will prefer it for my_timer_milliseconds().) + + QueryPerformanceCounter() we use this for my_timer_microseconds() + on Windows. 1-PIT-tick (often 1/3-microsecond). Usually reads + the PIT so it's slow. On some Windows variants, uses RDTSC. + + GetLocalTime() this is available on Windows but we don't use it. + + getclock(): documented for Alpha, but not found during tests. + + mach_absolute_time() and UpTime() are recommended for Apple. + Inititally they weren't tried, because asm_ppc seems to do the job. + But now we use mach_absolute_time for nanoseconds. + + Any clock-based timer can be affected by NPT (ntpd program), + which means: + - full-second correction can occur for leap second + - tiny corrections can occcur approimately every 11 minutes + (but I think they only affect the RTC which isn't the PIT). + + We define "precision" as "frequency" and "high precision" is + "frequency better than 1 microsecond". We define "resolution" + as a synonym for "granularity". We define "accuracy" as + "closeness to the truth" as established by some authoritative + clock, but we can't measure accuracy. + + Do not expect any of our timers to be monotonic; we + won't guarantee that they return constantly-increasing + unique numbers. + + We tested with AIX, Solaris (x86 + Sparc), Linux (x86 + + Itanium), Windows, 64-bit Windows, QNX, FreeBSD, HPUX, + Irix, Mac. We didn't test with NetWare or SCO. + +*/ + |