/*
* Copyright © 2012 - 2017 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see .
*
*/
#include "ocl_math_common.h"
#include "ocl_float.h"
#include "ocl_relational.h"
#include "ocl_common.h"
#include "ocl_integer.h"
#include "ocl_convert.h"
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunSoft, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
extern constant int __ocl_math_fastpath_flag;
CONST float __gen_ocl_fabs(float x) __asm("llvm.fabs" ".f32");
CONST float __gen_ocl_sin(float x) __asm("llvm.sin" ".f32");
CONST float __gen_ocl_cos(float x) __asm("llvm.cos" ".f32");
CONST float __gen_ocl_sqrt(float x) __asm("llvm.sqrt" ".f32");
PURE CONST float __gen_ocl_rsqrt(float x);
CONST float __gen_ocl_log(float x) __asm("llvm.log2" ".f32");
CONST float __gen_ocl_exp(float x) __asm("llvm.exp2" ".f32");
PURE CONST float __gen_ocl_pow(float x, float y) __asm("llvm.pow" ".f32");
PURE CONST float __gen_ocl_rcp(float x);
CONST float __gen_ocl_rndz(float x) __asm("llvm.trunc" ".f32");
CONST float __gen_ocl_rnde(float x) __asm("llvm.rint" ".f32");
CONST float __gen_ocl_rndu(float x) __asm("llvm.ceil" ".f32");
CONST float __gen_ocl_rndd(float x) __asm("llvm.floor" ".f32");
/* native functions */
OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
OVERLOADABLE float native_log(float x) {
return native_log2(x) * 0.6931472002f;
}
OVERLOADABLE float native_log10(float x) {
return native_log2(x) * 0.3010299956f;
}
OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }
OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
OVERLOADABLE float native_tan(float x) {
return native_sin(x) / native_cos(x);
}
OVERLOADABLE float native_exp2(float x) { return __gen_ocl_exp(x); }
OVERLOADABLE float native_exp(float x) { return __gen_ocl_exp(M_LOG2E_F*x); }
OVERLOADABLE float native_exp10(float x) { return __gen_ocl_exp(M_LOG210_F*x); }
OVERLOADABLE float native_divide(float x, float y) { return x/y; }
/* Fast path */
OVERLOADABLE float __gen_ocl_internal_fastpath_acosh (float x) {
return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
}
OVERLOADABLE float __gen_ocl_internal_fastpath_asinh (float x) {
return native_log(x + native_sqrt(x * x + 1));
}
OVERLOADABLE float __gen_ocl_internal_fastpath_atanh (float x) {
return 0.5f * native_log((1 + x) / (1 - x));
}
OVERLOADABLE float __gen_ocl_internal_fastpath_cbrt (float x) {
return __gen_ocl_pow(x, 0.3333333333f);
}
OVERLOADABLE float __gen_ocl_internal_fastpath_cos (float x) {
return native_cos(x);
}
OVERLOADABLE float __gen_ocl_internal_fastpath_cosh (float x) {
return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
}
OVERLOADABLE float __gen_ocl_internal_fastpath_cospi (float x) {
return __gen_ocl_cos(x * M_PI_F);
}
OVERLOADABLE float __gen_ocl_internal_fastpath_exp (float x) {
return native_exp(x);
}
OVERLOADABLE float __gen_ocl_internal_fastpath_exp10 (float x) {
return native_exp10(x);
}
OVERLOADABLE float __gen_ocl_internal_fastpath_expm1 (float x) {
return __gen_ocl_pow(M_E_F, x) - 1;
}
OVERLOADABLE float __gen_ocl_internal_fastpath_fmod (float x, float y) {
return x-y*__gen_ocl_rndz(x/y);
}
OVERLOADABLE float __gen_ocl_internal_fastpath_hypot (float x, float y) {
return __gen_ocl_sqrt(x*x + y*y);
}
OVERLOADABLE int __gen_ocl_internal_fastpath_ilogb (float x) {
return __gen_ocl_rndd(native_log2(x));
}
OVERLOADABLE float __gen_ocl_internal_fastpath_ldexp (float x, int n) {
return __gen_ocl_pow(2, n) * x;
}
OVERLOADABLE float __gen_ocl_internal_fastpath_log (float x) {
return native_log(x);
}
OVERLOADABLE float __gen_ocl_internal_fastpath_log2 (float x) {
return native_log2(x);
}
OVERLOADABLE float __gen_ocl_internal_fastpath_log10 (float x) {
return native_log10(x);
}
OVERLOADABLE float __gen_ocl_internal_fastpath_log1p (float x) {
return native_log(x + 1);
}
OVERLOADABLE float __gen_ocl_internal_fastpath_logb (float x) {
return __gen_ocl_rndd(native_log2(x));
}
OVERLOADABLE float __gen_ocl_internal_fastpath_remainder (float x, float y) {
return x-y*__gen_ocl_rnde(x/y);
}
OVERLOADABLE float __gen_ocl_internal_fastpath_rootn(float x, int n) {
return __gen_ocl_pow(x, 1.f / n);
}
OVERLOADABLE float __gen_ocl_internal_fastpath_sin (float x) {
return native_sin(x);
}
OVERLOADABLE float __gen_ocl_internal_fastpath_sinh (float x) {
return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
}
OVERLOADABLE float __gen_ocl_internal_fastpath_sinpi (float x) {
return __gen_ocl_sin(x * M_PI_F);
}
OVERLOADABLE float __gen_ocl_internal_fastpath_tan (float x) {
return native_tan(x);
}
OVERLOADABLE float __gen_ocl_internal_fastpath_tanh (float x) {
float y = native_exp(-2 * x);
return (1 - y) / (1 + y);
}
/* Internal implement, high accuracy. */
OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
union { unsigned u; float f; } ux, uy;
ux.f = x;
uy.f = y;
ux.u = (ux.u & 0x7fffffff) | (uy.u & 0x80000000u);
return ux.f;
}
OVERLOADABLE float inline __gen_ocl_internal_log_valid(float x) {
/*
* Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
union { unsigned int i; float f; } u;
const float
ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
two25 = 3.355443200e+07, /* 0x4c000000 */
Lg1 = 6.6666668653e-01, /* 3F2AAAAB */
Lg2 = 4.0000000596e-01, /* 3ECCCCCD */
Lg3 = 2.8571429849e-01, /* 3E924925 */
Lg4 = 2.2222198546e-01; /* 3E638E29 */
const float zero = 0.0;
float fsq, f, s, z, R, w, t1, t2, partial;
int k, ix, i, j;
u.f = x; ix = u.i;
k = 0;
k += (ix>>23) - 127;
ix &= 0x007fffff;
i = (ix + (0x95f64<<3)) & 0x800000;
u.i = ix | (i^0x3f800000); x = u.f;
k += (i>>23);
f = x - 1.0f;
fsq = f * f;
if((0x007fffff & (15 + ix)) < 16) { /* |f| < 2**-20 */
R = fsq * (0.5f - 0.33333333333333333f * f);
return k * ln2_hi + k * ln2_lo + f - R;
}
s = f / (2.0f + f);
z = s * s;
i = ix - (0x6147a << 3);
w = z * z;
j = (0x6b851 << 3) - ix;
t1= w * mad(w, Lg4, Lg2);
t2= z * mad(w, Lg3, Lg1);
i |= j;
R = t2 + t1;
partial = (i > 0) ? -mad(s, 0.5f * fsq, -0.5f * fsq) : (s * f);
return mad(s, R, f) - partial + k * ln2_hi + k * ln2_lo;;
}
OVERLOADABLE float __gen_ocl_internal_log(float x)
{
union { unsigned int i; float f; } u;
u.f = x;
int ix = u.i;
if (ix < 0 )
return NAN; /* log(-#) = NaN */
if (ix >= 0x7f800000)
return NAN;
return __gen_ocl_internal_log_valid(x);
}
OVERLOADABLE float __gen_ocl_internal_log10(float x)
{
union { float f; unsigned i; } u;
const float
ivln10 = 4.3429449201e-01, /* 0x3ede5bd9 */
log10_2hi = 3.0102920532e-01, /* 0x3e9a2080 */
log10_2lo = 7.9034151668e-07; /* 0x355427db */
float y, z;
int i, k, hx;
u.f = x; hx = u.i;
if (hx<0)
return NAN; /* log(-#) = NaN */
if (hx >= 0x7f800000)
return NAN;
k = (hx >> 23) - 127;
i = ((unsigned)k & 0x80000000) >> 31;
hx = (hx&0x007fffff) | ((0x7f-i) << 23);
y = (float)(k + i);
u.i = hx; x = u.f;
return y * log10_2lo + y * log10_2hi + ivln10 * __gen_ocl_internal_log_valid(x);
}
OVERLOADABLE float __gen_ocl_internal_log2(float x)
{
const float zero = 0.0,
invln2 = 0x1.715476p+0f;
int ix;
union { float f; int i; } u;
u.f = x; ix = u.i;
if (ix < 0)
return NAN; /** log(-#) = NaN */
if (ix >= 0x7f800000)
return NAN;
return invln2 * __gen_ocl_internal_log_valid(x);
}
float __gen_ocl_scalbnf (float x, int n){
/* copy from fdlibm */
float two25 = 3.355443200e+07, /* 0x4c000000 */
twom25 = 2.9802322388e-08, /* 0x33000000 */
huge = 1.0e+30,
tiny = 1.0e-30;
int k,ix;
GEN_OCL_GET_FLOAT_WORD(ix,x);
k = (ix&0x7f800000)>>23; /* extract exponent */
if (k==0) { /* 0 or subnormal x */
if ((ix&0x7fffffff)==0) return x; /* +-0 */
x *= two25;
GEN_OCL_GET_FLOAT_WORD(ix,x);
k = ((ix&0x7f800000)>>23) - 25;
}
if (k==0xff) return x+x; /* NaN or Inf */
if (n< -50000)
return tiny*__gen_ocl_internal_copysign(tiny,x); /*underflow*/
if (n> 50000 || k+n > 0xfe)
return huge*__gen_ocl_internal_copysign(huge,x); /* overflow */
/* Now k and n are bounded we know that k = k+n does not overflow. */
k = k+n;
if (k > 0) { /* normal result */
GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
return x;
}
if (k <= -25)
return tiny*__gen_ocl_internal_copysign(tiny,x); /*underflow*/
k += 25; /* subnormal result */
GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
return x*twom25;
}
const __constant unsigned int two_over_pi[] = {
0, 0, 0xA2F, 0x983, 0x6E4, 0xe44, 0x152, 0x9FC,
0x275, 0x7D1, 0xF53, 0x4DD, 0xC0D, 0xB62,
0x959, 0x93C, 0x439, 0x041, 0xFE5, 0x163,
};
// The main idea is from "Radian Reduction for Trigonometric Functions"
// written by Mary H. Payne and Robert N. Hanek. Also another reference
// is "A Continued-Fraction Analysis of Trigonometric Argument Reduction"
// written by Roger Alan Smith, who gave the worst case in this paper.
// for single float, worst x = 0x1.47d0fep34, and there are 29 bit
// leading zeros in the fraction part of x*(2.0/pi). so we need at least
// 29 (leading zero)+ 24 (fraction )+12 (integer) + guard bits. that is,
// 65 + guard bits, as we calculate in 12*7 = 84bits, which means we have
// about 19 guard bits. If we need further precision, we may need more
// guard bits
// Note we place two 0 in two_over_pi, which is used to handle input less
// than 0x1.0p23
int payne_hanek(float x, float *y) {
union { float f; unsigned u;} ieee;
ieee.f = x;
unsigned u = ieee.u;
int k = ((u & 0x7f800000) >> 23)-127;
int ma = (u & 0x7fffff) | 0x800000;
unsigned high, low;
high = (ma & 0xfff000) >> 12;
low = ma & 0xfff;
// Two tune below macro, you need to fully understand the algorithm
#define CALC_BLOCKS 7
#define ZERO_BITS 2
unsigned result[CALC_BLOCKS];
// round down, note we need 2 bits integer precision
int index = (k-23-2) < 0 ? (k-23-2-11)/12 : (k-23-2)/12;
for (int i = 0; i < CALC_BLOCKS; i++) {
result[i] = low * two_over_pi[index+i+ZERO_BITS] ;
result[i] += high * two_over_pi[index+i+1+ZERO_BITS];
}
for (int i = CALC_BLOCKS-1; i > 0; i--) {
int temp = result[i] >> 12;
result[i] -= temp << 12;
result[i-1] += temp;
}
#undef CALC_BLOCKS
#undef ZERO_BITS
// get number of integer digits in result[0], note we only consider 12 valid bits
// and also it means the fraction digits in result[0] is (12-intDigit)
int intDigit = index*(-12) + (k-23);
// As the integer bits may be all included in result[0], and also maybe
// some bits in result[0], and some in result[1]. So we merge succesive bits,
// which makes easy coding.
unsigned b0 = (result[0] << 12) | result[1];
unsigned b1 = (result[2] << 12) | result[3];
unsigned b2 = (result[4] << 12) | result[5];
unsigned b3 = (result[6] << 12);
unsigned intPart = b0 >> (24-intDigit);
unsigned fract1 = ((b0 << intDigit) | (b1 >> (24-intDigit))) & 0xffffff;
unsigned fract2 = ((b1 << intDigit) | (b2 >> (24-intDigit))) & 0xffffff;
unsigned fract3 = ((b2 << intDigit) | (b3 >> (24-intDigit))) & 0xffffff;
// larger than 0.5? which mean larger than pi/4, we need
// transform from [0,pi/2] to [-pi/4, pi/4] through -(1.0-fract)
int largerPiBy4 = ((fract1 & 0x800000) != 0);
int sign = largerPiBy4 ? 1 : 0;
intPart = largerPiBy4 ? (intPart+1) : intPart;
fract1 = largerPiBy4 ? (fract1 ^ 0x00ffffff) : fract1;
fract2 = largerPiBy4 ? (fract2 ^ 0x00ffffff) : fract2;
fract3 = largerPiBy4 ? (fract3 ^ 0x00ffffff) : fract3;
int leadingZero = (fract1 == 0);
// +1 is for the hidden bit 1 in floating-point format
int exponent = leadingZero ? -(24+1) : -(0+1);
fract1 = leadingZero ? fract2 : fract1;
fract2 = leadingZero ? fract3 : fract2;
// fract1 may have leading zeros, add it
int shift = clz(fract1)-8;
exponent += -shift;
float pio2 = 0x1.921fb6p+0;
unsigned fdigit = ((fract1 << shift) | (fract2 >> (24-shift))) & 0xffffff;
// we know that denormal number will not appear here
ieee.u = (sign << 31) | ((exponent+127) << 23) | (fdigit & 0x7fffff);
*y = ieee.f * pio2;
return intPart;
}
int argumentReduceSmall(float x, float * remainder) {
float halfPi = 2.0f/3.14159265f;
// pi/2 = 0.C90FDAA22168C234C4p+1;
float halfPi_p1 = (float) 0xC908/0x1.0p15,
halfPi_p2 = (float) 0x7DAA/0x1.0p27,
halfPi_p3 = (float) 0x22168C/0x1.0p51,
halfPi_p4 = (float) 0x234C4/0x1.0p71;
uint iy = (uint)mad(halfPi, x, 0.5f);
float y = (float)iy;
float rem = mad(y, -halfPi_p1, x);
rem = mad(y, -halfPi_p2, rem);
rem = mad(y, -halfPi_p3, rem);
*remainder = rem;
return iy;
}
int __ieee754_rem_pio2f(float x, float *y) {
if (x < 2.5e2) {
return argumentReduceSmall(x, y);
} else {
return payne_hanek(x, y);
}
}
OVERLOADABLE float __kernel_sinf(float x)
{
/* copied from fdlibm */
const float
S1 = -1.6666667163e-01, /* 0xbe2aaaab */
S2 = 8.3333337680e-03, /* 0x3c088889 */
S3 = -1.9841270114e-04, /* 0xb9500d01 */
S4 = 2.7557314297e-06; /* 0x3638ef1b */
float z,r,v;
z = x*x;
v = z*x;
r = mad(z, mad(z, mad(z, S4, S3), S2), S1);
return mad(v, r, x);
}
float __kernel_cosf(float x, float y)
{
/* copied from fdlibm */
const float
one = 1.0000000000e+00, /* 0x3f800000 */
C1 = 4.1666667908e-02, /* 0x3d2aaaab */
C2 = -1.3888889225e-03, /* 0xbab60b61 */
C3 = 2.4801587642e-05; /* 0x37d00d01 */
float a,hz,z,r,qx;
int ix;
GEN_OCL_GET_FLOAT_WORD(ix,x);
ix &= 0x7fffffff; /* ix = |x|'s high word*/
z = x*x;
r = z * mad(z, mad(z, C3, C2), C1);
if(ix < 0x3e99999a) /* if |x| < 0.3 */
return one - ((float)0.5*z - (z*r - x*y));
else {
GEN_OCL_SET_FLOAT_WORD(qx,ix-0x01000000); /* x/4 */
hz = (float)0.5*z-qx;
a = one-qx;
return a - (hz - (z*r-x*y));
}
}
OVERLOADABLE float sin(float x)
{
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_sin(x);
float y;
float na ;
uint n, ix;
float negative = x < 0.0f? -1.0f : 1.0f;
x = fabs(x);
/* cos(Inf or NaN) is NaN */
na = x -x;
uint n0, n1;
float v;
n = __ieee754_rem_pio2f(x,&y);
float s = __kernel_sinf(y);
float c = sqrt(mad(-s, s, 1.0f));
n0 = (n&0x1);
n1 = (n&0x2);
v = (n0)?c:s;
v = (n1)?-v:v;
/* n&3 return
0 sin(y)
1 cos(y)
2 -sin(y)
3 -cos(y)
*/
return mad(v, negative, na);
}
OVERLOADABLE float cos(float x)
{
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_cos(x);
float y;
float na ;
uint n, ix;
x = __gen_ocl_fabs(x);
/* cos(Inf or NaN) is NaN */
na = x -x;
uint n0, n1;
float v;
n = __ieee754_rem_pio2f(x,&y);
float s = __kernel_sinf(y);
float c = sqrt(fabs(mad(s, s, -1.0f)));
n0 = (n&0x1);
n1 = (n&0x2);
float ss = n1 - 1.0f;
v = (n0)?s:-c;
/* n&3 return
0 cos(y)
1 -sin(y)
2 -cos(y)
3 sin(y)
*/
return mad(v, ss, na);
}
float __kernel_tanf(float x, float y, int iy)
{
/* copied from fdlibm */
float z,r,v,w,s;
int ix,hx;
const float
one = 1.0000000000e+00, /* 0x3f800000 */
pio4 = 7.8539812565e-01, /* 0x3f490fda */
pio4lo= 3.7748947079e-08; /* 0x33222168 */
float T[13];// = {
T[0] = 3.3333334327e-01; /* 0x3eaaaaab */
T[1] = 1.3333334029e-01; /* 0x3e088889 */
T[2] = 5.3968254477e-02; /* 0x3d5d0dd1 */
T[3] = 2.1869488060e-02; /* 0x3cb327a4 */
T[4] = 8.8632395491e-03; /* 0x3c11371f */
T[5] = 3.5920790397e-03; /* 0x3b6b6916 */
T[6] = 1.4562094584e-03; /* 0x3abede48 */
T[7] = 5.8804126456e-04; /* 0x3a1a26c8 */
GEN_OCL_GET_FLOAT_WORD(hx,x);
ix = hx&0x7fffffff; /* high word of |x| */
if(ix<0x31800000) /* x < 2**-28 */
{if((int)x==0) { /* generate inexact */
if((ix|(iy+1))==0) return one/__gen_ocl_fabs(x);
else return (iy==1)? x: -one/x;
}
}
if(ix>=0x3f2ca140) { /* |x|>=0.6744 */
if(hx<0) {x = -x; y = -y;}
z = pio4-x;
w = pio4lo-y;
x = z+w; y = 0.0;
}
z = x*x;
w = z*z;
/* Break x^5*(T[1]+x^2*T[2]+...) into
* x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
* x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
*/
r = mad(w, mad(w, mad(w, T[7], T[5]), T[3]), T[1]);
v = z* mad(w, mad(w, T[6], T[4]), T[2]);
s = z*x;
r = mad(z, mad(s, r + v, y), y);
r += T[0]*s;
w = x+r;
if(ix>=0x3f2ca140) {
v = (float)iy;
return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r)));
}
if(iy==1) return w;
else
return -1.0/(x+r);
}
/*Author : David Defour, Catherine Daramy, Florent de Dinechin, Christoph Lauter Contact :
David.Defour@ens-lyon.fr, catherine_daramy@ens-lyon.fr
This program is free software; you can redistribute it and/or modify it under the terms
of the GNU Lesser General Public License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version. This program is distributed
in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU Lesser General Public License along with this program;
if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA*/
/* 23 */
/* tan(x) ~ x + T1*x + ... + T13*x */
float __kernel_tanf_fast(float x, float y, int iy)
{
float x2 = x*x;
float sum;
sum = mad(0.0000392783222196f, x2, 0.0000969153770711f);
sum = mad(sum, x2, 0.0002391291200183f);
sum = mad(sum, x2, 0.0005900274263695f);
sum = mad(sum, x2, 0.0014558343682438f);
sum = mad(sum, x2, 0.0035921279340982f);
sum = mad(sum, x2, 0.0088632358238101f);
sum = mad(sum, x2, 0.0218694880604744f);
sum = mad(sum, x2, 0.0539682544767857f);
sum = mad(sum, x2, 0.1333333402872086f);
sum = mad(sum, x2, 0.3333333432674408f);
sum = sum*x2;
sum = mad(sum, x, x);
if(iy == 1)
{
sum = 1.0f/sum;
sum = -sum;
}
return sum;
}
OVERLOADABLE float tan(float x)
{
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_tan(x);
float y,na=0.0;
int n, ix;
float negative = x < 0.0f? -1.0f : 1.0f;
x = fabs(x);
na = x -x;
n = __ieee754_rem_pio2f(x,&y);
return mad(negative , __kernel_tanf_fast(y, 0.0f, (n&1)), na);
}
OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
int ix;
if(isinf(x) || isnan(x)) { return NAN; }
if(x < 0.0f) { x = -x; }
GEN_OCL_GET_FLOAT_WORD(ix, x);
if(x> 0x1.0p24) return 1.0f;
float m = __gen_ocl_internal_floor(x);
ix = (int)m;
m = x-m;
if((ix&0x1) != 0) m+=1.0f;
ix = __gen_ocl_internal_floor(m*4.0f);
switch(ix) {
case 0:
return __kernel_cosf(m*M_PI_F, 0.0f);
case 1:
case 2:
return __kernel_sinf((0.5f-m)*M_PI_F);
case 3:
case 4:
return -__kernel_cosf((m-1.0f)*M_PI_F, 0.0f);
case 5:
case 6:
return __kernel_sinf((m-1.5f)*M_PI_F);
default:
return __kernel_cosf((2.0f-m)*M_PI_F, 0.0f);
}
}
OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
float sign = 1.0f;
int ix;
if(isinf(x)) return NAN;
if(x < 0.0f) { x = -x; sign = -1.0f; }
GEN_OCL_GET_FLOAT_WORD(ix, x);
if(x> 0x1.0p24) return 0.0f;
float m = __gen_ocl_internal_floor(x);
ix = (int)m;
m = x-m;
if((ix&0x1) != 0) m+=1.0f;
ix = __gen_ocl_internal_floor(m*4.0f);
switch(ix) {
case 0:
return sign*__kernel_sinf(m*M_PI_F);
case 1:
case 2:
return sign*__kernel_cosf((m-0.5f)*M_PI_F, 0.0f);
case 3:
case 4:
return -sign*__kernel_sinf((m-1.0f)*M_PI_F);
case 5:
case 6:
return -sign*__kernel_cosf((m-1.5f)*M_PI_F, 0.0f);
default:
return -sign*__kernel_sinf((2.0f-m)*M_PI_F);
}
}
OVERLOADABLE float lgamma(float x) {
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
const float
zero= 0.,
one = 1.0000000000e+00,
pi = 3.1415927410e+00,
a0 = 7.7215664089e-02,
a1 = 3.2246702909e-01,
a2 = 6.7352302372e-02,
a3 = 2.0580807701e-02,
a4 = 7.3855509982e-03,
a5 = 2.8905137442e-03,
a6 = 1.1927076848e-03,
a7 = 5.1006977446e-04,
a8 = 2.2086278477e-04,
a9 = 1.0801156895e-04,
a10 = 2.5214456400e-05,
a11 = 4.4864096708e-05,
tc = 1.4616321325e+00,
tf = -1.2148628384e-01,
tt = 6.6971006518e-09,
t0 = 4.8383611441e-01,
t1 = -1.4758771658e-01,
t2 = 6.4624942839e-02,
t3 = -3.2788541168e-02,
t4 = 1.7970675603e-02,
t5 = -1.0314224288e-02,
t6 = 6.1005386524e-03,
t7 = -3.6845202558e-03,
t8 = 2.2596477065e-03,
t9 = -1.4034647029e-03,
t10 = 8.8108185446e-04,
t11 = -5.3859531181e-04,
t12 = 3.1563205994e-04,
t13 = -3.1275415677e-04,
t14 = 3.3552918467e-04,
u0 = -7.7215664089e-02,
u1 = 6.3282704353e-01,
u2 = 1.4549225569e+00,
u3 = 9.7771751881e-01,
u4 = 2.2896373272e-01,
u5 = 1.3381091878e-02,
v1 = 2.4559779167e+00,
v2 = 2.1284897327e+00,
v3 = 7.6928514242e-01,
v4 = 1.0422264785e-01,
v5 = 3.2170924824e-03,
s0 = -7.7215664089e-02,
s1 = 2.1498242021e-01,
s2 = 3.2577878237e-01,
s3 = 1.4635047317e-01,
s4 = 2.6642270386e-02,
s5 = 1.8402845599e-03,
s6 = 3.1947532989e-05,
r1 = 1.3920053244e+00,
r2 = 7.2193557024e-01,
r3 = 1.7193385959e-01,
r4 = 1.8645919859e-02,
r5 = 7.7794247773e-04,
r6 = 7.3266842264e-06,
w0 = 4.1893854737e-01,
w1 = 8.3333335817e-02,
w2 = -2.7777778450e-03,
w3 = 7.9365057172e-04,
w4 = -5.9518753551e-04,
w5 = 8.3633989561e-04,
w6 = -1.6309292987e-03;
float t, y, z, nadj, p, p1, p2, p3, q, r, w;
int i, hx, ix;
nadj = 0;
hx = *(int *)&x;
ix = hx & 0x7fffffff;
if (ix >= 0x7f800000)
return x * x;
if (ix == 0)
return ((x + one) / zero);
if (ix < 0x1c800000) {
if (hx < 0) {
return -native_log(-x);
} else
return -native_log(x);
}
if (hx < 0) {
if (ix >= 0x4b000000)
return ((-x) / zero);
t = __gen_ocl_internal_sinpi(x);
if (t == zero)
return ((-x) / zero);
nadj = native_log(pi / __gen_ocl_fabs(t * x));
x = -x;
}
if (ix == 0x3f800000 || ix == 0x40000000)
r = 0;
else if (ix < 0x40000000) {
if (ix <= 0x3f666666) {
r = -native_log(x);
if (ix >= 0x3f3b4a20) {
y = one - x;
i = 0;
} else if (ix >= 0x3e6d3308) {
y = x - (tc - one);
i = 1;
} else {
y = x;
i = 2;
}
} else {
r = zero;
if (ix >= 0x3fdda618) {
y = (float) 2.0 - x;
i = 0;
}
else if (ix >= 0x3F9da620) {
y = x - tc;
i = 1;
}
else {
y = x - one;
i = 2;
}
}
switch (i) {
case 0:
z = y * y;
p1 = mad(z, mad(z, mad(z, mad(z, mad(z, a10, a8), a6), a4), a2), a0);
p2 = z * mad(z, mad(z, mad(z, mad(z, mad(z, a11, a9), a7), a5), a3), a1);
p = mad(y, p1, p2);
r += (p - (float) 0.5 * y);
break;
case 1:
z = y * y;
w = z * y;
p1 = mad(w, mad(w, mad(w, mad(w, t12, t9), t6), t3), t0);
p2 = mad(w, mad(w, mad(w, mad(w, t13, t10), t7), t4), t1);
p3 = mad(w, mad(w, mad(w, mad(w, t14, t11), t8), t5), t2);
p = mad(p1, z, mad(w, mad(y, p3, p2), -tt));
r += (tf + p);
break;
case 2:
p1 = y * mad(y, mad(y, mad(y, mad(y, mad(y, u5, u4), u3), u2), u1), u0);
p2 = mad(y, mad(y, mad(y, mad(y, mad(y, v5, v4), v3), v2), v1), one);
r += (-(float) 0.5 * y + p1 / p2);
}
} else if (ix < 0x41000000) {
i = (int) x;
t = zero;
y = x - (float) i;
p =y * mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, s6, s5), s4), s3), s2), s1), s0);
q = mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, r6, r5), r4), r3), r2), r1), one);
r = .5f * y + p / q;
z = one;
switch (i) {
case 7:
z *= (y + 6.0f);
case 6:
z *= (y + 5.0f);
case 5:
z *= (y + 4.0f);
case 4:
z *= (y + 3.0f);
case 3:
z *= (y + 2.0f);
r += native_log(z);
break;
}
} else if (ix < 0x5c800000) {
t = native_log(x);
z = one / x;
y = z * z;
w = mad(z, mad(y, mad(y, mad(y, mad(y, mad(y, w6, w5), w4), w3), w2), w1), w0);
r = (x - .5f) * (t - one) + w;
} else
r = x * (native_log(x) - one);
if (hx < 0)
r = nadj - r;
return r;
}
OVERLOADABLE float log1p(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_log1p(x);
/*
* Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
const float
ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
two25 = 3.355443200e+07, /* 0x4c000000 */
Lp1 = 6.6666668653e-01, /* 3F2AAAAB */
Lp2 = 4.0000000596e-01, /* 3ECCCCCD */
Lp3 = 2.8571429849e-01, /* 3E924925 */
Lp4 = 2.2222198546e-01; /* 3E638E29 */
const float zero = 0.0;
float hfsq,f,c,s,z,R,u;
int k,hx,hu,ax;
union {float f; unsigned i;} un;
un.f = x; hx = un.i;
ax = hx&0x7fffffff;
k = 1;
if (hx < 0x3ed413d7) { /* x < 0.41422 */
if(ax>=0x3f800000) { /* x <= -1.0 */
if(x==(float)-1.0) return -two25/zero; /* log1p(-1)=+inf */
else return (x-x)/(x-x); /* log1p(x<-1)=NaN */
}
if(ax<0x31000000) { /* |x| < 2**-29 */
if(two25+x>zero /* raise inexact */
&&ax<0x24800000) /* |x| < 2**-54 */
return x;
else
return x - x*x*(float)0.5;
}
if(hx>0||hx<=((int)0xbe95f61f)) {
k=0;f=x;hu=1;} /* -0.2929= 0x7f800000) return x+x;
if(k!=0) {
if(hx<0x5a000000) {
u = (float)1.0+x;
un.f = u; hu = un.i;
k = (hu>>23)-127;
/* correction term */
c = (k>0)? (float)1.0-(u-x):x-(u-(float)1.0);
c /= u;
} else {
u = x;
un.f = u; hu = un.i;
k = (hu>>23)-127;
c = 0;
}
hu &= 0x007fffff;
if(hu<0x3504f7) {
un.i = hu|0x3f800000; u = un.f;/* normalize u */
} else {
k += 1;
un.i = hu|0x3f000000; u = un.f; /* normalize u/2 */
hu = (0x00800000-hu)>>2;
}
f = u-(float)1.0;
}
hfsq=(float)0.5*f*f;
if(hu==0)
{ /* |f| < 2**-20 */
if(f==zero)
{
if(k==0) return zero;
else {c = mad(k , ln2_lo, c); return mad(k, ln2_hi, c);}
}
R = mad(hfsq, 1.0f, -0.66666666666666666f * f);
if(k==0) return f-R; else
return k * ln2_hi - (R - mad(k, ln2_lo, c) - f);
}
s = f/((float)2.0+f);
z = s*s;
R = z * mad(z, mad(z, mad(z, Lp4, Lp3), Lp2), Lp1);
if(k==0)
return f + mad(hfsq + R, s, -hfsq);
else
return k*ln2_hi-( (hfsq - mad(s, hfsq + R, mad(k, ln2_lo, c))) - f);
}
OVERLOADABLE float logb(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_logb(x);
union {float f; unsigned i;} u;
u.f = x;
int e = ((u.i & 0x7f800000) >> 23);
float r1 = e-127;
float r2 = -INFINITY;
float r3 = x*x;
/* sub normal or +/-0 */
float r = e == 0 ? r2 : r1;
/* inf & nan */
return e == 0xff ? r3 : r;
}
OVERLOADABLE int ilogb(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_ilogb(x);
union { int i; float f; } u;
if (isnan(x))
return FP_ILOGBNAN;
if (isinf(x))
return 0x7FFFFFFF;
u.f = x;
u.i &= 0x7fffffff;
if (u.i == 0)
return FP_ILOGB0;
if (u.i >= 0x800000)
return (u.i >> 23) - 127;
int r = -126;
int a = u.i & 0x7FFFFF;
while(a < 0x800000) {
a <<= 1;
r --;
}
return r;
}
OVERLOADABLE float nan(uint code) {
return NAN;
}
OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
float sign = 1.0f;
int ix;
if(isinf(x)) return NAN;
if(x < 0.0f) { x = -x; sign = -1.0f; }
GEN_OCL_GET_FLOAT_WORD(ix, x);
if(x> 0x1.0p24) return 0.0f;
float m = __gen_ocl_internal_floor(x);
ix = (int)m;
m = x-m;
int n = __gen_ocl_internal_floor(m*4.0f);
if(m == 0.5f) {
return (ix&0x1) == 0 ? sign*INFINITY : sign*-INFINITY;
}
if(m == 0.0f) {
return (ix&0x1) == 0 ? 0.0f : -0.0f;
}
switch(n) {
case 0:
return sign * __kernel_tanf(m*M_PI_F, 0.0f, 1);
case 1:
return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);
case 2:
return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);
default:
return sign * -1.0f*__kernel_tanf((1.0f-m)*M_PI_F, 0.0f, 1);
}
}
OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
/* copied from fdlibm */
const unsigned
B1 = 709958130, /* B1 = (84+2/3-0.03306235651)*2**23 */
B2 = 642849266; /* B2 = (76+2/3-0.03306235651)*2**23 */
const float
C = 5.4285717010e-01, /* 19/35 = 0x3f0af8b0 */
D = -7.0530611277e-01, /* -864/1225 = 0xbf348ef1 */
E = 1.4142856598e+00, /* 99/70 = 0x3fb50750 */
F = 1.6071428061e+00, /* 45/28 = 0x3fcdb6db */
G = 3.5714286566e-01; /* 5/14 = 0x3eb6db6e */
float r,s,t, w;
int hx;
uint sign;
uint high;
GEN_OCL_GET_FLOAT_WORD(hx,x);
sign=hx&0x80000000; /* sign= sign(x) */
hx ^=sign;
if(hx>=0x7f800000) return(x+x); /* cbrt(NaN,INF) is itself */
if(hx==0)
return(x); /* cbrt(0) is itself */
GEN_OCL_SET_FLOAT_WORD(x,hx); /* x <- |x| */
/* rough cbrt to 5 bits */
if(hx<0x00800000) /* subnormal number */
{
//SET_FLOAT_WORD(t,0x4b800000); /* set t= 2**24 */
//t*=x; GET_FLOAT_WORD(high,t); SET_FLOAT_WORD(t,high/3+B2);
t = (sign = 0) ? 0.0f : -0.0f;
return t;
}
else
GEN_OCL_SET_FLOAT_WORD(t,hx/3+B1);
/* new cbrt to 23 bits */
r=t*t/x;
s=mad(r, t, C);
t*=G+F/(s+E+D/s);
/* one step newton iteration to 53 bits with error less than 0.667 ulps */
s=t*t; /* t*t is exact */
r=x/s;
w=t+t;
r=(r-t)/(w+r); /* r-s is exact */
t=mad(t, r, t);
/* retore the sign bit */
GEN_OCL_GET_FLOAT_WORD(high,t);
GEN_OCL_SET_FLOAT_WORD(t,high|sign);
return(t);
}
INLINE float __gen_ocl_asin_util(float x) {
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunSoft, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
float
pS0 = 1.66666666666666657415e-01,
pS1 = -3.25565818622400915405e-01,
pS2 = 2.01212532134862925881e-01,
pS3 = -4.00555345006794114027e-02,
pS4 = 7.91534994289814532176e-04,
qS1 = -2.40339491173441421878e+00,
qS2 = 2.02094576023350569471e+00,
qS3 = -6.88283971605453293030e-01,
qS4 = 7.70381505559019352791e-02;
float t = x*x;
float p = t * mad(t, mad(t, mad(t, mad(t, pS4, pS3), pS2), pS1), pS0);
float q = mad(t, mad(t, mad(t, mad(t, qS4, qS3), qS2), qS1), 1.0f);
float w = p / q;
return mad(x, w, x);
}
OVERLOADABLE float __gen_ocl_internal_asin(float x) {
float asinX2 =__gen_ocl_asin_util(x);
float absX = fabs(x);
float asinX1 = mad(2.0f , __gen_ocl_asin_util(native_sqrt(mad(-0.5f, absX, 0.5f))) , -M_PI_2_F);
float retVal = (x < 0.0f)?asinX1:-asinX1;
retVal = (absX > 0.5f)?retVal:asinX2;
retVal = (absX > 1.0f)?NAN:retVal;
return retVal;
}
OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
return __gen_ocl_internal_asin(x) / M_PI_F;
}
OVERLOADABLE float __gen_ocl_internal_acos(float x) {
float absX = fabs(x);
float asinX2 =__gen_ocl_asin_util(x);
float tmp = __gen_ocl_asin_util(native_sqrt(mad(-0.5f, absX, 0.5f)));
float asinX1 = mad(2.0f ,tmp, -M_PI_2_F);
float retVal = (x < 0.0f)?asinX1:-asinX1;
retVal = (absX > 0.5f)?retVal:asinX2;
retVal = (x <= 0.5f) ? M_PI_2_F - retVal:2.0f*tmp;
return retVal;
}
OVERLOADABLE float __gen_ocl_internal_acospi(float x) {
return __gen_ocl_internal_acos(x) / M_PI_F;
}
__constant float atanhi[4] = {
4.6364760399e-01, /* atan(0.5)hi 0x3eed6338 */
7.8539812565e-01, /* atan(1.0)hi 0x3f490fda */
9.8279368877e-01, /* atan(1.5)hi 0x3f7b985e */
1.5707962513e+00, /* atan(inf)hi 0x3fc90fda */
};
__constant float atanlo[4] = {
5.0121582440e-09, /* atan(0.5)lo 0x31ac3769 */
3.7748947079e-08, /* atan(1.0)lo 0x33222168 */
3.4473217170e-08, /* atan(1.5)lo 0x33140fb4 */
7.5497894159e-08, /* atan(inf)lo 0x33a22168 */
};
OVERLOADABLE float __gen_ocl_internal_atan(float x) {
/* copied from fdlibm */
float aT[11];
aT[0] = 3.3333334327e-01; /* 0x3eaaaaaa */
aT[1] = -2.0000000298e-01; /* 0xbe4ccccd */
aT[2] = 1.4285714924e-01; /* 0x3e124925 */
aT[3] = -1.1111110449e-01; /* 0xbde38e38 */
aT[4] = 9.0908870101e-02; /* 0x3dba2e6e */
aT[5] = -7.6918758452e-02; /* 0xbd9d8795 */
aT[6] = 6.6610731184e-02; /* 0x3d886b35 */
const float one = 1.0f, huge = 1.0e30;
float w,s1,s2,z;
int ix,hx;
float extraVal = 0.0f;
GEN_OCL_GET_FLOAT_WORD(hx,x);
ix = hx&0x7fffffff;
if (ix >= 0x3ee00000)
{
x = __gen_ocl_fabs(x);
if (ix < 0x3f980000)
{ /* |x| < 1.1875 */
if (ix < 0x3f300000)
{ /* 7/16 <=|x|<11/16 */
extraVal = 0.4636476040f;
x = mad(2.0f, x, -1.0f)/(2.0f+x);
}
else
{ /* 11/16<=|x|< 19/16 */
extraVal = 0.7853981853f;
x = (x-one)/(x+one);
}
}
else
{
if (ix < 0x401c0000)
{ /* |x| < 2.4375 */
extraVal = 0.9827937484f;
x = (x-1.5f)/mad(1.5f, x, one);
}
else
{ /* 2.4375 <= |x| < 2^66 */
extraVal = 1.5707963705f;
x = -1.0f/x;
}
}
}
/* end of argument reduction */
z = x*x;
w = z*z;
/* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
s1 = z * mad(w, mad(w, mad(w, aT[6], aT[4]), aT[2]), aT[0]);
s2 = w * mad(w, mad(w, aT[5], aT[3]), aT[1]);
float retVal = mad(x, (-s1-s2), extraVal + x);
float retVal1 = (hx<0)? -retVal:retVal;
return (extraVal == 0.0) ? retVal:retVal1;
}
OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
return __gen_ocl_internal_atan(x) / M_PI_F;
}
// XXX work-around PTX profile
OVERLOADABLE float sqrt(float x) { return native_sqrt(x); }
OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
/*
* Copyright (c) 2014 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
OVERLOADABLE float __gen_ocl_internal_atan2(float y, float x) {
const float pi = 0x1.921fb6p+1f;
const float piby2 = 0x1.921fb6p+0f;
const float piby4 = 0x1.921fb6p-1f;
const float threepiby4 = 0x1.2d97c8p+1f;
float ax = fabs(x);
float ay = fabs(y);
float v = min(ax, ay);
float u = max(ax, ay);
// Scale since u could be large, as in "regular" divide
float s = u > 0x1.0p+96f ? 0x1.0p-32f : 1.0f;
float vbyu = s * v/ (s*u);
float vbyu2 = vbyu * vbyu;
float p = mad(vbyu2, mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), -0x1.5554d0p-2f) * vbyu2 * vbyu;
float q = mad(vbyu2, mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f);
// Octant 0 result
float a = mad(p, 1.0f/q, vbyu);
// Fix up 3 other octants
float at = piby2 - a;
a = ay > ax ? at : a;
at = pi - a;
a = x < 0.0F ? at : a;
// y == 0 => 0 for x >= 0, pi for x < 0
int hx = as_int(x);
at = (hx < 0) ? pi : 0.0f;
a = y == 0.0f ? at : a;
at = x > 0.0f ? piby4 : threepiby4;
a = ax == INFINITY & ay == INFINITY ? at : a;
// x or y is NaN
a = isnan(x) | isnan(y) ? NAN : a;
// Fixup sign and return
return copysign(a, y);
}
OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) {
return __gen_ocl_internal_atan2(y, x) / M_PI_F;
}
OVERLOADABLE float __gen_ocl_internal_fabs(float x) { return __gen_ocl_fabs(x); }
OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
OVERLOADABLE float __gen_ocl_internal_round(float x) {
float y = __gen_ocl_rndz(x);
if (__gen_ocl_fabs(x - y) >= 0.5f)
y += __gen_ocl_internal_copysign(1.f, x);
return y;
}
OVERLOADABLE float __gen_ocl_internal_ceil(float x) { return __gen_ocl_rndu(x); }
OVERLOADABLE float __gen_ocl_internal_rint(float x) {
return __gen_ocl_rnde(x);
}
OVERLOADABLE float __gen_ocl_internal_exp(float x) {
float o_threshold = 8.8721679688e+01, /* 0x42b17180 */
u_threshold = -1.0397208405e+02, /* 0xc2cff1b5 */
twom100 = 7.8886090522e-31, /* 2**-100=0x0d800000 */
ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
one = 1.0,
huge = 1.0e+30,
P1 = 1.6666667163e-01, /* 0x3e2aaaab */
P2 = -2.7777778450e-03; /* 0xbb360b61 */
float y,hi=0.0,lo=0.0,c,t;
int k=0,xsb;
unsigned hx;
float ln2HI_0 = 6.9313812256e-01; /* 0x3f317180 */
float ln2HI_1 = -6.9313812256e-01; /* 0xbf317180 */
float ln2LO_0 = 9.0580006145e-06; /* 0x3717f7d1 */
float ln2LO_1 = -9.0580006145e-06; /* 0xb717f7d1 */
float half_0 = 0.5;
float half_1 = -0.5;
GEN_OCL_GET_FLOAT_WORD(hx,x);
xsb = (hx>>31)&1; /* sign bit of x */
hx &= 0x7fffffff; /* high word of |x| */
/* filter out non-finite argument */
if(hx >= 0x42b17218) { /* if |x|>=88.721... */
if(hx>0x7f800000)
return x+x; /* NaN */
if(hx==0x7f800000)
return (xsb==0)? x:0.0; /* exp(+-inf)={inf,0} */
if(x > o_threshold) return huge*huge; /* overflow */
if(x < u_threshold) return twom100*twom100; /* underflow */
}
/* argument reduction */
if(hx > 0x3eb17218) { /* if |x| > 0.5 ln2 */
if(hx < 0x3F851592) { /* and |x| < 1.5 ln2 */
hi = x-(xsb ==1 ? ln2HI_1 : ln2HI_0); lo= xsb == 1? ln2LO_1 : ln2LO_0; k = 1-xsb-xsb;
} else {
float tmp = xsb == 1 ? half_1 : half_0;
k = ivln2*x+tmp;
t = k;
hi = x - t*ln2HI_0; /* t*ln2HI is exact here */
lo = t*ln2LO_0;
}
x = hi - lo;
}
else if(hx < 0x31800000) { /* when |x|<2**-28 */
if(huge+x>one) return one+x;/* trigger inexact */
}
else k = 0;
/* x is now in primary range */
t = x*x;
c = x - t*(P1+t*P2);
if(k==0)
return one-((x*c)/(c-(float)2.0)-x);
else
y = one-((lo-(x*c)/((float)2.0-c))-hi);
if(k >= -125) {
unsigned hy;
GEN_OCL_GET_FLOAT_WORD(hy,y);
GEN_OCL_SET_FLOAT_WORD(y,hy+(k<<23)); /* add k to y's exponent */
return y;
} else {
unsigned hy;
GEN_OCL_GET_FLOAT_WORD(hy,y);
GEN_OCL_SET_FLOAT_WORD(y,hy+((k+100)<<23)); /* add k to y's exponent */
return y*twom100;
}
}
OVERLOADABLE float __gen_ocl_internal_simple_exp(float x) {
float o_threshold = 8.8721679688e+01, /* 0x42b17180 */
u_threshold = -1.0397208405e+02, /* 0xc2cff1b5 */
twom100 = 7.8886090522e-31, /* 2**-100=0x0d800000 */
ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
one = 1.0, P1 = 1.6666667163e-01, /* 0x3e2aaaab */
P2 = -2.7777778450e-03; /* 0xbb360b61 */
float y, hi = 0.0, lo = 0.0, c, t;
int k = 0;
unsigned hx;
float ln2HI_0 = 6.9313812256e-01; /* 0x3f317180 */
float ln2HI_1 = -6.9313812256e-01; /* 0xbf317180 */
float ln2LO_0 = 9.0580006145e-06; /* 0x3717f7d1 */
float ln2LO_1 = -9.0580006145e-06; /* 0xb717f7d1 */
float half_0 = 0.5;
float half_1 = -0.5;
float retVal = -1.0f;
hx = as_uint(fabs(x));
/* filter out non-finite argument */
/* if |x|>=88.721... */
if (hx >= 0x42b17218) {
float tmp = (x > 0) ? x : 0.0;
retVal = (x > 0) ? INFINITY : retVal; /* overflow */
retVal = (hx > 0x7f800000) ? NAN : retVal;
retVal = (hx == 0x7f800000) ? tmp : retVal;
retVal = (x < u_threshold) ? 0.0 : retVal; /* underflow */
if (retVal != -1.0f)
return retVal;
}
/* argument reduction */
float tmp = (x < 0) ? half_1 : half_0;
k = mad(ivln2, x, tmp);
t = k;
hi = mad(t, -ln2HI_0, x); /* t*ln2HI is exact here */
lo = t * ln2LO_0;
x = hi - lo;
/* x is now in primary range */
t = x * x;
c = mad(t, mad(t, -P2, -P1), x);
y = one - ((lo + (x * c) / (c - 2.0f)) - hi);
unsigned hy;
GEN_OCL_GET_FLOAT_WORD(hy, y);
float factor = (k >= -125) ? 1.0f : twom100;
k = (k >= -125) ? k : k + 100;
GEN_OCL_SET_FLOAT_WORD(y, hy + (k << 23)); /* add k to y's exponent */
return y * factor;
}
/* erf,erfc from glibc s_erff.c -- float version of s_erf.c.
* Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
*/
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
INLINE_OVERLOADABLE float __gen_ocl_internal_erf(float x) {
/*...*/
const float
tiny = 1.0e-30,
half_val= 5.0000000000e-01, /* 0x3F000000 */
one = 1.0000000000e+00, /* 0x3F800000 */
two = 2.0000000000e+00, /* 0x40000000 */
/* c = (subfloat)0.84506291151 */
erx = 8.4506291151e-01, /* 0x3f58560b */
/*
* Coefficients for approximation to erf on [0,0.84375]
*/
efx = 1.2837916613e-01, /* 0x3e0375d4 */
efx8= 1.0270333290e+00, /* 0x3f8375d4 */
pp0 = 1.2837916613e-01, /* 0x3e0375d4 */
pp1 = -3.2504209876e-01, /* 0xbea66beb */
pp2 = -2.8481749818e-02, /* 0xbce9528f */
pp3 = -5.7702702470e-03, /* 0xbbbd1489 */
pp4 = -2.3763017452e-05, /* 0xb7c756b1 */
qq1 = 3.9791721106e-01, /* 0x3ecbbbce */
qq2 = 6.5022252500e-02, /* 0x3d852a63 */
qq3 = 5.0813062117e-03, /* 0x3ba68116 */
qq4 = 1.3249473704e-04, /* 0x390aee49 */
qq5 = -3.9602282413e-06, /* 0xb684e21a */
/*
* Coefficients for approximation to erf in [0.84375,1.25]
*/
pa0 = -2.3621185683e-03, /* 0xbb1acdc6 */
pa1 = 4.1485610604e-01, /* 0x3ed46805 */
pa2 = -3.7220788002e-01, /* 0xbebe9208 */
pa3 = 3.1834661961e-01, /* 0x3ea2fe54 */
pa4 = -1.1089469492e-01, /* 0xbde31cc2 */
pa5 = 3.5478305072e-02, /* 0x3d1151b3 */
pa6 = -2.1663755178e-03, /* 0xbb0df9c0 */
qa1 = 1.0642088205e-01, /* 0x3dd9f331 */
qa2 = 5.4039794207e-01, /* 0x3f0a5785 */
qa3 = 7.1828655899e-02, /* 0x3d931ae7 */
qa4 = 1.2617121637e-01, /* 0x3e013307 */
qa5 = 1.3637083583e-02, /* 0x3c5f6e13 */
qa6 = 1.1984500103e-02, /* 0x3c445aa3 */
/*
* Coefficients for approximation to erfc in [1.25,1/0.35]
*/ra0 = -9.8649440333e-03, /* 0xbc21a093 */
ra1 = -6.9385856390e-01, /* 0xbf31a0b7 */
ra2 = -1.0558626175e+01, /* 0xc128f022 */
ra3 = -6.2375331879e+01, /* 0xc2798057 */
ra4 = -1.6239666748e+02, /* 0xc322658c */
ra5 = -1.8460508728e+02, /* 0xc3389ae7 */
ra6 = -8.1287437439e+01, /* 0xc2a2932b */
ra7 = -9.8143291473e+00, /* 0xc11d077e */
sa1 = 1.9651271820e+01, /* 0x419d35ce */
sa2 = 1.3765776062e+02, /* 0x4309a863 */
sa3 = 4.3456588745e+02, /* 0x43d9486f */
sa4 = 6.4538726807e+02, /* 0x442158c9 */
sa5 = 4.2900814819e+02, /* 0x43d6810b */
sa6 = 1.0863500214e+02, /* 0x42d9451f */
sa7 = 6.5702495575e+00, /* 0x40d23f7c */
sa8 = -6.0424413532e-02, /* 0xbd777f97 */
/*
* Coefficients for approximation to erfc in [1/.35,28]
*/
rb0 = -9.8649431020e-03, /* 0xbc21a092 */
rb1 = -7.9928326607e-01, /* 0xbf4c9dd4 */
rb2 = -1.7757955551e+01, /* 0xc18e104b */
rb3 = -1.6063638306e+02, /* 0xc320a2ea */
rb4 = -6.3756646729e+02, /* 0xc41f6441 */
rb5 = -1.0250950928e+03, /* 0xc480230b */
rb6 = -4.8351919556e+02, /* 0xc3f1c275 */
sb1 = 3.0338060379e+01, /* 0x41f2b459 */
sb2 = 3.2579251099e+02, /* 0x43a2e571 */
sb3 = 1.5367296143e+03, /* 0x44c01759 */
sb4 = 3.1998581543e+03, /* 0x4547fdbb */
sb5 = 2.5530502930e+03, /* 0x451f90ce */
sb6 = 4.7452853394e+02, /* 0x43ed43a7 */
sb7 = -2.2440952301e+01; /* 0xc1b38712 */
int hx,ix,i;
float R,S,P,Q,s,y,z,r;
GEN_OCL_GET_FLOAT_WORD(hx,x);
ix = hx&0x7fffffff;
if(ix>=0x7f800000) { /* erf(nan)=nan */
i = ((unsigned int)hx>>31)<<1;
return (float)(1-i)+one/x; /* erf(+-inf)=+-1 */
}
if(ix < 0x3f580000) { /* |x|<0.84375 */
if(ix < 0x31800000) { /* |x|<2**-28 */
if (ix < 0x04000000)
/*avoid underflow */
return (float)0.125*((float)8.0*x+efx8*x);
return x + efx*x;
}
z = x*x;
r = mad(z, mad(z, mad(z, mad(z, pp4, pp3), pp2), pp1), pp0);
s = mad(z, mad(z, mad(z, mad(z, mad(z, qq5,qq4), qq3), qq2), qq1), one);
y = r / s;
return mad(x, y, x);
}
if(ix < 0x3fa00000) { /* 0.84375 <= |x| < 1.25 */
s = __gen_ocl_internal_fabs(x)-one;
P = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
Q = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, qa6, qa5), qa4), qa3), qa2), qa1), one);
if(hx>=0) return erx + P/Q; else return -erx - P/Q;
}
if (ix >= 0x40c00000) { /* inf>|x|>=6 */
if(hx>=0) return one-tiny; else return tiny-one;
}
x = __gen_ocl_internal_fabs(x);
s = one/(x*x);
if(ix< 0x4036DB6E) { /* |x| < 1/0.35 */
R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1), one);
} else { /* |x| >= 1/0.35 */
R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
rb6, rb5), rb4), rb3), rb2), rb1), rb0);
S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
sb7, sb6), sb5), sb4), sb3), sb2), sb1), one);
}
GEN_OCL_GET_FLOAT_WORD(ix,x);
GEN_OCL_SET_FLOAT_WORD(z,ix&0xfffff000);
r = __gen_ocl_internal_exp(-z*z-(float)0.5625)*__gen_ocl_internal_exp((z-x)*(z+x)+R/S);
if(hx>=0) return one-r/x; else return r/x-one;
}
INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
/*...*/
const float
tiny = 1.0e-30,
half_val= 5.0000000000e-01, /* 0x3F000000 */
one = 1.0000000000e+00, /* 0x3F800000 */
two = 2.0000000000e+00, /* 0x40000000 */
/* c = (subfloat)0.84506291151 */
erx = 8.4506291151e-01, /* 0x3f58560b */
/*
* Coefficients for approximation to erf on [0,0.84375]
*/
efx = 1.2837916613e-01, /* 0x3e0375d4 */
efx8= 1.0270333290e+00, /* 0x3f8375d4 */
pp0 = 1.2837916613e-01, /* 0x3e0375d4 */
pp1 = -3.2504209876e-01, /* 0xbea66beb */
pp2 = -2.8481749818e-02, /* 0xbce9528f */
pp3 = -5.7702702470e-03, /* 0xbbbd1489 */
pp4 = -2.3763017452e-05, /* 0xb7c756b1 */
qq1 = 3.9791721106e-01, /* 0x3ecbbbce */
qq2 = 6.5022252500e-02, /* 0x3d852a63 */
qq3 = 5.0813062117e-03, /* 0x3ba68116 */
qq4 = 1.3249473704e-04, /* 0x390aee49 */
qq5 = -3.9602282413e-06, /* 0xb684e21a */
/*
* Coefficients for approximation to erf in [0.84375,1.25]
*/
pa0 = -2.3621185683e-03, /* 0xbb1acdc6 */
pa1 = 4.1485610604e-01, /* 0x3ed46805 */
pa2 = -3.7220788002e-01, /* 0xbebe9208 */
pa3 = 3.1834661961e-01, /* 0x3ea2fe54 */
pa4 = -1.1089469492e-01, /* 0xbde31cc2 */
pa5 = 3.5478305072e-02, /* 0x3d1151b3 */
pa6 = -2.1663755178e-03, /* 0xbb0df9c0 */
qa1 = 1.0642088205e-01, /* 0x3dd9f331 */
qa2 = 5.4039794207e-01, /* 0x3f0a5785 */
qa3 = 7.1828655899e-02, /* 0x3d931ae7 */
qa4 = 1.2617121637e-01, /* 0x3e013307 */
qa5 = 1.3637083583e-02, /* 0x3c5f6e13 */
qa6 = 1.1984500103e-02, /* 0x3c445aa3 */
/*
* Coefficients for approximation to erfc in [1.25,1/0.35]
*/ra0 = -9.8649440333e-03, /* 0xbc21a093 */
ra1 = -6.9385856390e-01, /* 0xbf31a0b7 */
ra2 = -1.0558626175e+01, /* 0xc128f022 */
ra3 = -6.2375331879e+01, /* 0xc2798057 */
ra4 = -1.6239666748e+02, /* 0xc322658c */
ra5 = -1.8460508728e+02, /* 0xc3389ae7 */
ra6 = -8.1287437439e+01, /* 0xc2a2932b */
ra7 = -9.8143291473e+00, /* 0xc11d077e */
sa1 = 1.9651271820e+01, /* 0x419d35ce */
sa2 = 1.3765776062e+02, /* 0x4309a863 */
sa3 = 4.3456588745e+02, /* 0x43d9486f */
sa4 = 6.4538726807e+02, /* 0x442158c9 */
sa5 = 4.2900814819e+02, /* 0x43d6810b */
sa6 = 1.0863500214e+02, /* 0x42d9451f */
sa7 = 6.5702495575e+00, /* 0x40d23f7c */
sa8 = -6.0424413532e-02, /* 0xbd777f97 */
/*
* Coefficients for approximation to erfc in [1/.35,28]
*/
rb0 = -9.8649431020e-03, /* 0xbc21a092 */
rb1 = -7.9928326607e-01, /* 0xbf4c9dd4 */
rb2 = -1.7757955551e+01, /* 0xc18e104b */
rb3 = -1.6063638306e+02, /* 0xc320a2ea */
rb4 = -6.3756646729e+02, /* 0xc41f6441 */
rb5 = -1.0250950928e+03, /* 0xc480230b */
rb6 = -4.8351919556e+02, /* 0xc3f1c275 */
sb1 = 3.0338060379e+01, /* 0x41f2b459 */
sb2 = 3.2579251099e+02, /* 0x43a2e571 */
sb3 = 1.5367296143e+03, /* 0x44c01759 */
sb4 = 3.1998581543e+03, /* 0x4547fdbb */
sb5 = 2.5530502930e+03, /* 0x451f90ce */
sb6 = 4.7452853394e+02, /* 0x43ed43a7 */
sb7 = -2.2440952301e+01; /* 0xc1b38712 */
int hx,ix;
float R,S,P,Q,s,y,z,r;
GEN_OCL_GET_FLOAT_WORD(hx,x);
ix = hx&0x7fffffff;
if(ix>=0x7f800000) { /* erfc(nan)=nan */
/* erfc(+-inf)=0,2 */
return (float)(((unsigned int)hx>>31)<<1)+one/x;
}
if(ix < 0x3f580000) { /* |x|<0.84375 */
if(ix < 0x23800000) /* |x|<2**-56 */
return one-x;
z = x*x;
r = mad(z, mad(z, mad(z, mad(z, pp4, pp3), pp2), pp1), pp0);
s = mad(z, mad(z, mad(z, mad(z, mad(z, qq5, qq4), qq3), qq2), qq1), one);
y = r/s;
if(hx < 0x3e800000) { /* x<1/4 */
return one-(x+x*y);
} else {
r = x*y;
r += (x-half_val);
return half_val - r ;
}
}
if(ix < 0x3fa00000) { /* 0.84375 <= |x| < 1.25 */
s = __gen_ocl_internal_fabs(x)-one;
P = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
Q = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, qa6, qa5), qa4), qa3), qa2), qa1), one);
if(hx>=0) {
z = one-erx; return z - P/Q;
} else {
z = erx+P/Q; return one+z;
}
}
if (ix < 0x41e00000) { /* |x|<28 */
x = __gen_ocl_internal_fabs(x);
s = one/(x*x);
if(ix< 0x4036DB6D) { /* |x| < 1/.35 ~ 2.857143*/
R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1), one);
} else { /* |x| >= 1/.35 ~ 2.857143 */
if(hx<0&&ix>=0x40c00000) return two-tiny;/* x < -6 */
R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
rb6, rb5), rb4), rb3), rb2), rb1), rb0);
S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
sb7, sb6), sb5), sb4), sb3), sb2), sb1), one);
}
GEN_OCL_GET_FLOAT_WORD(ix,x);
GEN_OCL_SET_FLOAT_WORD(z,ix&0xffffe000);
r = __gen_ocl_internal_exp(-z*z-(float)0.5625)*
__gen_ocl_internal_exp((z-x)*(z+x)+R/S);
if(hx>0) {
float ret = r/x;
return ret;
} else
return two-r/x;
} else {
if(hx>0) {
return tiny*tiny;
} else
return two-tiny;
}
}
OVERLOADABLE float __gen_ocl_internal_fmod (float x, float y) {
//return x-y*__gen_ocl_rndz(x/y);
float one = 1.0;
float Zero[2];
int n,hx,hy,hz,ix,iy,sx,i;
Zero[0] = 0.0;
Zero[1] = -0.0;
GEN_OCL_GET_FLOAT_WORD(hx,x);
GEN_OCL_GET_FLOAT_WORD(hy,y);
sx = hx&0x80000000; /* sign of x */
hx ^=sx; /* |x| */
hy &= 0x7fffffff; /* |y| */
/* purge off exception values */
if(hy==0||(hx>=0x7f800000)|| /* y=0,or x not finite */
(hy>0x7f800000)) /* or y is NaN */
return (x*y)/(x*y);
if(hx>31]; /* |x|=|y| return x*0*/
/* determine ix = ilogb(x) */
if(hx<0x00800000) { /* subnormal x */
for (ix = -126,i=(hx<<8); i>0; i<<=1) ix -=1;
} else ix = (hx>>23)-127;
/* determine iy = ilogb(y) */
if(hy<0x00800000) { /* subnormal y */
for (iy = -126,i=(hy<<8); i>=0; i<<=1) iy -=1;
} else iy = (hy>>23)-127;
/* set up {hx,lx}, {hy,ly} and align y to x */
if(ix >= -126)
hx = 0x00800000|(0x007fffff&hx);
else { /* subnormal x, shift x to normal */
n = -126-ix;
hx = hx<= -126)
hy = 0x00800000|(0x007fffff&hy);
else { /* subnormal y, shift y to normal */
n = -126-iy;
hy = hy<>31];
hx = hz+hz;
}
}
hz=hx-hy;
if(hz>=0) {hx=hz;}
/* convert back to floating value and restore the sign */
if(hx==0) /* return sign(x)*0 */
return Zero[(unsigned)sx>>31];
while(hx<0x00800000) { /* normalize x */
hx = hx+hx;
iy -= 1;
}
if(iy>= -126) { /* normalize output */
hx = ((hx-0x00800000)|((iy+127)<<23));
GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
} else { /* subnormal output */
n = -126 - iy;
hx >>= n;
GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
x *= one; /* create necessary signal */
}
return x; /* exact output */
}
OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
//return __gen_ocl_pow(M_E_F, x) - 1;
float Q1 = -3.3333335072e-02, /* 0xbd088889 */
ln2_hi = 6.9313812256e-01, /* 0x3f317180 */
ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */
Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
huge = 1.0e30,
tiny = 1.0e-30,
ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
one = 1.0,
o_threshold= 8.8721679688e+01; /* 0x42b17180 */
float y,hi,lo,c,t,e,hxs,hfx,r1;
int k,xsb;
int hx;
GEN_OCL_GET_FLOAT_WORD(hx,x);
xsb = hx&0x80000000;
/* sign bit of x */
//if(xsb==0)
//y=x;
//else
//y= -x; /* y = |x| */
y = __gen_ocl_internal_fabs(x);
hx &= 0x7fffffff; /* high word of |x| */
/* filter out huge and non-finite argument */
if(hx >= 0x4195b844) { /* if |x|>=27*ln2 */
if(hx >= 0x42b17218) { /* if |x|>=88.721... */
if(hx>0x7f800000)
return x+x; /* NaN */
if(hx==0x7f800000)
return (xsb==0)? x:-1.0;/* exp(+-inf)={inf,-1} */
if(x > o_threshold)
return huge*huge; /* overflow */
}
if(xsb!=0) { /* x < -27*ln2, return -1.0 with inexact */
if(x+tiny<(float)0.0) /* raise inexact */
return tiny-one; /* return -1 */
}
}
/* argument reduction */
if(hx > 0x3eb17218) {/* if |x| > 0.5 ln2 */
if(hx < 0x3F851592) {/* and |x| < 1.5 ln2 */
if(xsb==0){
hi = x - ln2_hi; lo = ln2_lo; k = 1;
} else {
hi = x + ln2_hi; lo = -ln2_lo; k = -1;
}
} else {
k = ivln2*x+((xsb==0)?(float)0.5:(float)-0.5);
t = k;
hi = x - t*ln2_hi;/* t*ln2_hi is exact here */
lo = t*ln2_lo;
}
x = hi - lo;
c = (hi-x)-lo;
} else if(hx < 0x33000000) { /* when |x|<2**-25, return x */
//t = huge+x; /* return x with inexact flags when x!=0 */
//return x - (t-(huge+x));
return x;
} else k = 0;
/* x is now in primary range */
hfx = (float)0.5*x;
hxs = x*hfx;
r1 = one+hxs*(Q1+hxs*Q2);
t = (float)3.0-r1*hfx;
e = hxs*((r1-t)/((float)6.0 - x*t));
if(k==0)
return x - (x*e-hxs); /* c is 0 */
else{
e = (x*(e-c)-c);
e -= hxs;
if(k== -1)return (float)0.5*(x-e)-(float)0.5;
if(k==1){
if(x < (float)-0.25)
return -(float)2.0*(e-(x+(float)0.5));
else
return (one+(float)2.0*(x-e));
}
if (k <= -2 || k>56) { /* suffice to return exp(x)-1 */
int i;
y = one-(e-x);
GEN_OCL_GET_FLOAT_WORD(i,y);
GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
return y-one;
}
t = one;
if(k<23) {
int i;
GEN_OCL_SET_FLOAT_WORD(t,0x3f800000 - (0x1000000>>k)); /* t=1-2^-k */
y = t-(e-x);
GEN_OCL_GET_FLOAT_WORD(i,y);
GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
} else {
int i;
GEN_OCL_SET_FLOAT_WORD(t,((0x7f-k)<<23)); /* 2^-k */
y = x-(e+t);
y += one;
GEN_OCL_GET_FLOAT_WORD(i,y);
GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23)); /* add k to y's exponent */
}
}
return y;
}
OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
//return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
float one = 1.0,
ln2 = 6.9314718246e-01;/* 0x3f317218 */
float t;
int hx;
GEN_OCL_GET_FLOAT_WORD(hx,x);
if(hx<0x3f800000) { /* x < 1 */
return (x-x)/(x-x);
} else if(hx >=0x4d800000) { /* x > 2**28 */
if(hx >=0x7f800000) {/* x is inf of NaN */
return x+x;
} else
return __gen_ocl_internal_log(x)+ln2;/* acosh(huge)=log(2x) */
} else if (hx==0x3f800000) {
return 0.0; /* acosh(1) = 0 */
} else if (hx > 0x40000000) { /* 2**28 > x > 2 */
t=x*x;
return __gen_ocl_internal_log((float)2.0*x-one/(x+__gen_ocl_sqrt(t-one)));
} else { /* 1one) return x; /* return x inexact except 0 */
}
if(ix>0x47000000) {/* |x| > 2**14 */
if(ix>=0x7f800000) return x+x;/* x is inf or NaN */
w = __gen_ocl_internal_log(__gen_ocl_internal_fabs(x))+ln2;
} else {
float xa = __gen_ocl_internal_fabs(x);
if (ix>0x40000000) {/* 2**14 > |x| > 2.0 */
w = __gen_ocl_internal_log(mad(xa, 2.0f, one / (__gen_ocl_sqrt(mad(xa, xa, one)) + xa)));
} else { /* 2.0 > |x| > 2**-14 */
float t = xa*xa;
w =log1p(xa+t/(one+__gen_ocl_sqrt(one+t)));
}
}
return __gen_ocl_internal_copysign(w, x);
}
OVERLOADABLE float __gen_ocl_internal_sinh(float x){
//return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
float one = 1.0,
shuge = 1.0e37;
float t,w,h;
int ix,jx;
GEN_OCL_GET_FLOAT_WORD(jx,x);
ix = jx&0x7fffffff;
/* x is INF or NaN */
if(ix>=0x7f800000) return x+x;
h = 0.5;
if (jx<0) h = -h;
/* |x| in [0,22], return sign(x)*0.5*(E+E/(E+1))) */
if (ix < 0x41b00000) { /* |x|<22 */
if (ix<0x31800000) /* |x|<2**-28 */
if(shuge+x>one) return x;/* sinh(tiny) = tiny with inexact */
t = __gen_ocl_internal_expm1(__gen_ocl_internal_fabs(x));
if(ix<0x3f800000) return h*((float)2.0*t-t*t/(t+one));
return h*(t+t/(t+one));
}
/* |x| in [22, log(maxdouble)] return 0.5*exp(|x|) */
if (ix < 0x42b17180) return h*__gen_ocl_internal_exp(__gen_ocl_internal_fabs(x));
/* |x| in [log(maxdouble), overflowthresold] */
if (ix<=0x42b2d4fc) {
w = __gen_ocl_internal_exp((float)0.5*__gen_ocl_internal_fabs(x));
t = h*w;
return t*w;
}
/* |x| > overflowthresold, sinh(x) overflow */
return x*shuge;
}
OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
//float y = native_exp(-2 * x);
//return (1 - y) / (1 + y);
float one=1.0, two=2.0, tiny = 1.0e-30;
float t,z;
int jx,ix;
GEN_OCL_GET_FLOAT_WORD(jx,x);
ix = jx&0x7fffffff;
/* x is INF or NaN */
if(ix>=0x7f800000) {
if (jx>=0)
return one/x+one; /* tanh(+-inf)=+-1 */
else
return one/x-one; /* tanh(NaN) = NaN */
}
if (ix < 0x41b00000) { /* |x|<22 */
if (ix == 0)
return x; /* x == +-0 */
if (ix<0x24000000) /* |x|<2**-55 */
return x*(one+x); /* tanh(small) = small */
if (ix>=0x3f800000) { /* |x|>=1 */
t = __gen_ocl_internal_expm1(two*__gen_ocl_internal_fabs(x));
z = one - two/(t+two);
} else {
t = __gen_ocl_internal_expm1(-two*__gen_ocl_internal_fabs(x));
z= -t/(t+two);
}
} else { /* |x| > 22, return +-1 */
z = one - tiny; /* raised inexact flag */
}
return (jx>=0)? z: -z;
}
OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
//return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
float halF = 0.5,
huge = 1.0e+30,
tiny = 1.0e-30,
one = 1.0;
float t,w;
int ix;
GEN_OCL_GET_FLOAT_WORD(ix,x);
ix &= 0x7fffffff;
/* |x| in [0,22] */
if (ix < 0x41b00000) {
/* |x| in [0,0.5*ln2], return 1+expm1(|x|)^2/(2*exp(|x|)) */
if(ix<0x3eb17218) {
t = __gen_ocl_internal_expm1(__gen_ocl_fabs(x));
w = one+t;
if (ix<0x24000000) return w; /* cosh(tiny) = 1 */
return one+(t*t)/(w+w);
}
/* |x| in [0.5*ln2,22], return (exp(|x|)+1/exp(|x|)/2; */
t = __gen_ocl_internal_exp(__gen_ocl_fabs(x));
return halF*t+halF/t;
}
/* |x| in [22, log(maxdouble)] return half*exp(|x|) */
if (ix < 0x42b17180) return halF*__gen_ocl_internal_exp(__gen_ocl_fabs(x));
/* |x| in [log(maxdouble), overflowthresold] */
if (ix<=0x42b2d4fc) {
w = __gen_ocl_internal_exp(halF*__gen_ocl_fabs(x));
t = halF*w;
return t*w;
}
/* x is INF or NaN */
if(ix>=0x7f800000) return x*x;
/* |x| > overflowthresold, cosh(x) overflow */
return huge*huge;
}
OVERLOADABLE float __gen_ocl_internal_remainder(float x, float p){
//return x-y*__gen_ocl_rnde(x/y);
float zero = 0.0;
int hx,hp;
unsigned sx;
float p_half;
GEN_OCL_GET_FLOAT_WORD(hx,x);
GEN_OCL_GET_FLOAT_WORD(hp,p);
sx = hx&0x80000000;
hp &= 0x7fffffff;
hx &= 0x7fffffff;
/* purge off exception values */
if(hp==0) return (x*p)/(x*p); /* p = 0 */
if((hx>=0x7f800000)|| /* x not finite */
((hp>0x7f800000))) /* p is NaN */
return (x*p)/(x*p);
if (hp<=0x7effffff) x = __gen_ocl_internal_fmod(x,p+p); /* now x < 2p */
if ((hx-hp)==0) return zero*x;
x = __gen_ocl_fabs(x);
p = __gen_ocl_fabs(p);
if (hp<0x01000000) {
if(x+x>p) {
x-=p;
if(x+x>=p) x -= p;
}
} else {
p_half = (float)0.5*p;
if(x>p_half) {
x-=p;
if(x>=p_half) x -= p;
}
}
GEN_OCL_GET_FLOAT_WORD(hx,x);
GEN_OCL_SET_FLOAT_WORD(x,hx^sx);
return x;
}
OVERLOADABLE float __gen_ocl_internal_ldexp(float x, int n) {
x = __gen_ocl_scalbnf(x,n);
return x;
}
OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
//return 0.5f * native_sqrt((1 + x) / (1 - x));
float xa = __gen_ocl_fabs (x);
float t;
if (isless (xa, 0.5f)){
if (xa < 0x1.0p-28f) return x;
t = xa + xa;
t = 0.5f * log1p (t + t * xa / (1.0f - xa));
} else if (isless (xa, 1.0f)){
t = 0.5f * log1p ((xa + xa) / (1.0f - xa));
} else{
if (isgreater (xa, 1.0f)) return (x - x) / (x - x);
return x / 0.0f;
}
return __gen_ocl_internal_copysign(t, x);
}
OVERLOADABLE float __gen_ocl_internal_exp10(float x){
float px, qx,ans;
short n;
int i;
float*p;
float MAXL10 = 38.230809449325611792;
float LOG210 = 3.32192809488736234787e0;
float LG102A = 3.00781250000000000000E-1;
float LG102B = 2.48745663981195213739E-4;
float P[6];
P[0] = 2.063216740311022E-001;
P[1] = 5.420251702225484E-001;
P[2] = 1.171292686296281E+000;
P[3] = 2.034649854009453E+000;
P[4] = 2.650948748208892E+000;
P[5] = 2.302585167056758E+000;
if( x < -MAXL10 ) return 0.0;
if( isinf(x)) return INFINITY;
/* The following is necessary because range reduction blows up: */
if( x == 0 )return 1.0;
/* Express 10**x = 10**g 2**n
* = 10**g 10**( n log10(2) )
* = 10**( g + n log10(2) )
*/
px = x * LOG210;
qx = __gen_ocl_internal_floor( px + 0.5 );
n = qx;
x -= qx * LG102A;
x -= qx * LG102B;
/* rational approximation for exponential
* of the fractional part:
* 10**x - 1 = 2x P(x**2)/( Q(x**2) - P(x**2) )
*/
p = P;
ans = *p++;
i = 5;
do{
ans = ans * x + *p++;
}
while( --i );
px = 1.0 + x * ans;
/* multiply by power of 2 */
x = __gen_ocl_internal_ldexp( px, n );
return x;
}
OVERLOADABLE float cospi(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_cospi(x);
return __gen_ocl_internal_cospi(x);
}
OVERLOADABLE float cosh(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_cosh(x);
return __gen_ocl_internal_cosh(x);
}
OVERLOADABLE float acos(float x) {
return __gen_ocl_internal_acos(x);
}
OVERLOADABLE float acospi(float x) {
return __gen_ocl_internal_acospi(x);
}
OVERLOADABLE float acosh(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_acosh(x);
return __gen_ocl_internal_acosh(x);
}
OVERLOADABLE float sinpi(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_sinpi(x);
return __gen_ocl_internal_sinpi(x);
}
OVERLOADABLE float sinh(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_sinh(x);
return __gen_ocl_internal_sinh(x);
}
OVERLOADABLE float asin(float x) {
return __gen_ocl_internal_asin(x);
}
OVERLOADABLE float asinpi(float x) {
return __gen_ocl_internal_asinpi(x);
}
OVERLOADABLE float asinh(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_asinh(x);
return __gen_ocl_internal_asinh(x);
}
OVERLOADABLE float tanpi(float x) {
return __gen_ocl_internal_tanpi(x);
}
OVERLOADABLE float tanh(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_tanh(x);
return __gen_ocl_internal_tanh(x);
}
OVERLOADABLE float atan(float x) {
return __gen_ocl_internal_atan(x);
}
OVERLOADABLE float atan2(float y, float x) {
return __gen_ocl_internal_atan2(y, x);
}
OVERLOADABLE float atan2pi(float y, float x) {
return __gen_ocl_internal_atan2pi(y, x);
}
OVERLOADABLE float atanpi(float x) {
return __gen_ocl_internal_atanpi(x);
}
OVERLOADABLE float atanh(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_atanh(x);
return __gen_ocl_internal_atanh(x);
}
OVERLOADABLE float cbrt(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_cbrt(x);
return __gen_ocl_internal_cbrt(x);
}
OVERLOADABLE float rint(float x) {
return __gen_ocl_internal_rint(x);
}
OVERLOADABLE float copysign(float x, float y) {
return __gen_ocl_internal_copysign(x, y);
}
OVERLOADABLE float erf(float x) {
return __gen_ocl_internal_erf(x);
}
OVERLOADABLE float erfc(float x) {
return __gen_ocl_internal_erfc(x);
}
OVERLOADABLE float fmod (float x, float y) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_fmod(x, y);
return __gen_ocl_internal_fmod(x, y);
}
OVERLOADABLE float remainder(float x, float p) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_remainder(x, p);
return __gen_ocl_internal_remainder(x, p);
}
OVERLOADABLE float ldexp(float x, int n) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_ldexp(x, n);
if (x == (float)0.0f) x = 0.0f;
return __gen_ocl_internal_ldexp(x, n);
}
CONST OVERLOADABLE float __gen_ocl_mad(float a, float b, float c) __asm("llvm.fma" ".f32");
CONST OVERLOADABLE half __gen_ocl_mad(half a, half b, half c) __asm("llvm.fma" ".f16");
PURE CONST float __gen_ocl_fmax(float a, float b);
PURE CONST float __gen_ocl_fmin(float a, float b);
OVERLOADABLE float mad(float a, float b, float c) {
return __gen_ocl_mad(a, b, c);
}
OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) { return max(a,b); }
OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) { return min(a,b); }
OVERLOADABLE float __gen_ocl_internal_fmax(half a, half b) { return max(a,b); }
OVERLOADABLE float __gen_ocl_internal_fmin(half a, half b) { return min(a,b); }
OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
return a > b ? x : b > a ? y : max(x, y);
}
OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
return a < b ? x : b < a ? y : min(x, y);
}
OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
if(isnan(x))
return x;
if(isnan(y))
return y;
return x > y ? (x - y) : +0.f;
}
/*
* the pow/pown high precision implementation are copied from msun library.
* Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
*/
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
float z,ax,z_h,z_l,p_h,p_l;
float y1,t1,t2,r,s,sn,t,u,v,w;
int i,j,k,yisint,n;
int hy,ix,iy,is;
unsigned int hx;
float bp,dp_h,dp_l,
zero = 0.0,
one = 1.0,
two = 2.0,
two24 = 16777216.0, /* 0x4b800000 */
huge = 1.0e30,
tiny = 1.0e-30,
/* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
L1 = 6.0000002384e-01, /* 0x3f19999a */
L2 = 4.2857143283e-01, /* 0x3edb6db7 */
P1 = 1.6666667163e-01, /* 0x3e2aaaab */
P2 = -2.7777778450e-03, /* 0xbb360b61 */
lg2 = 6.9314718246e-01, /* 0x3f317218 */
lg2_h = 6.93145752e-01, /* 0x3f317200 */
lg2_l = 1.42860654e-06, /* 0x35bfbe8c */
ovt = 4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
cp = 9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
cp_h = 9.6179199219e-01, /* 0x3f763800 =head of cp */
cp_l = 4.7017383622e-06, /* 0x369dc3a0 =tail of cp_h */
ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
ivln2_h = 1.4426879883e+00, /* 0x3fb8aa00 =16b 1/ln2*/
ivln2_l = 7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
bp = 1.0;//,bp1 = 1.5,
dp_h = 0.0;//,dp_h[1] = 5.84960938e-01,
dp_l = 0.0;//,dp_l[1] = 1.56322085e-06;
float retVal = 0.0f;
bool bRet = false;
hx = as_uint(x);
GEN_OCL_GET_FLOAT_WORD(hy,y);
ax = __gen_ocl_fabs(x);
ix = as_int(ax); iy = as_int(fabs(y));
if(iy < 0x00800000)
{
bRet = true;
retVal = one;
}
else if (iy > 0x7f800000)
{
bRet = true;
retVal = NAN;
}
/* determine if y is an odd int when x < 0
* yisint = 0 ... y is not an integer
* yisint = 1 ... y is an odd int
* yisint = 2 ... y is an even int
*/
sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
if(hx >= 0x7f800000)
{
yisint = 0;
n = (hx>>31)-1;
if (!retVal && ix > 0x7f800000)
{
bRet = true;
retVal = NAN;
}
if(hx >= 0x80000000) {
k = (iy>>23)-0x7f; /* exponent */
j = iy>>(23-k);
yisint = (iy>=0x3f800000 && (j<<(23-k))==iy)? 2-(j&1):yisint;
yisint = (iy>=0x4b800000) ? 2:yisint;
}
/* special value of x */
if(ix==0x7f800000||ix==0||ix==0x3f800000){
z = ax; /*x is +-0,+-inf,+-1*/
z = (hy < 0)? one/z:z;
z = (((ix-0x3f800000)|yisint)==0)? NAN:z;
z = (yisint==1)? -z:z;
retVal = (bRet)? retVal:z;
bRet = true;
}
/* (x<0)**(non-int) is NaN */
if(!bRet && (n|yisint)==0)
{
bRet= true;
retVal = NAN;
}
if((n|(yisint-1))==0) sn = -one;/* (-ve)**(odd int) */
}
/* special value of x */
if((ix&0x7fffff) == 0) {
if(hx == 0x3f800000)
{
retVal = one;
bRet = true;
}
if(ix==0x7f800000||ix==0) {
z = ax; /*x is +0,+inf*/
z = (hy < 0)? one/z:z;
retVal = (bRet)? retVal:z;
bRet = true;
}
}
/* |y| is not huge */
if(iy <= 0x4d000000)
{
float s2,s_h,s_l,t_h,t_l;
n = ((ix)>>23)-0x7f;
j = ix&0x007fffff;
/* determine interval */
ix = j|0x3f800000; /* normalize ix */
if(x > 0x1.0p-6 && x < 0x1.2p7 && fabs(y) < 0x1.0p4)
{
GEN_OCL_SET_FLOAT_WORD(ax,ix);
t1 = n;
t2 = native_log2(ax);
}
else
{
n = (j >= 0x5db3d7)?n+1:n;
ix = (j >= 0x5db3d7)? ix - 0x00800000:ix;
k = (j<=0x1cc471 || j >= 0x5db3d7)? 0:1;
GEN_OCL_SET_FLOAT_WORD(ax,ix);
bp = k? 1.5:bp;
dp_h = k? 5.84960938e-01:dp_h;
dp_l = k? 1.56322085e-06:dp_l;
/* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
u = ax-bp; /* bp[0]=1.0, bp[1]=1.5 */
v = one/(ax+bp);
s = u*v;
s_h = s;
GEN_OCL_GET_FLOAT_WORD(is,s_h);
GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
/* t_h=ax+bp[k] High */
is = ((ix>>1)&0xfffff000)|0x20000000;
GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21));
t_l = ax - (t_h-bp);
s_l = v*(mad(-s_h, t_l, mad(-s_h, t_h, u)));
/* compute log(ax) */
s2 = s*s;
r = s2*s2*(mad(s2, L2, L1));
r = mad(s_l, (s_h+s), r);
t_h = mad(s_h, s_h, 3.0f)+r;
GEN_OCL_GET_FLOAT_WORD(is,t_h);
GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
t_l = r-(mad(-s_h, s_h, (t_h-3.0f)));
/* u+v = s*(1+...) */
u = s_h*t_h;
v = mad(s_l, t_h, t_l*s);
/* 2/(3log2)*(s+...) */
p_h = mad(s_h, t_h, v);
GEN_OCL_GET_FLOAT_WORD(is,p_h);
GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
p_l = v-(mad(-s_h, t_h, p_h));
z_l = mad(cp_l, p_h, mad(p_l, cp, dp_l));
/* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
t = (float)n;
t1 = ((mad(cp_h, p_h, z_l)+dp_h)+t);
GEN_OCL_GET_FLOAT_WORD(is,t1);
GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
t2 = z_l-mad(-cp_h, p_h, ((t1-t)-dp_h));
}
}
else
{ /* if |y| > 2**27 */
/* over/underflow if x is not close to one */
/* special value of y */
float b1 = (hy>=0)? y: zero;
float b2 = (hy<0)?-y: zero;
b1 = (ix > 0x3f800000)? b1:b2;
retVal = (iy==0x7f800000 && !bRet)? b1:retVal;
bRet = (iy==0x7f800000 && !bRet)? true: bRet;
b1 = (hy>0)? sn*huge*huge:0;
retVal = (ix>0x3f800007 && !bRet)? b1:retVal;
bRet = (ix>0x3f800007 && !bRet)? true:bRet;
/* now |1-x| is tiny <= 2**-20, suffice to compute
log(x) by x-x^2/2+x^3/3-x^4/4 */
t = ax-1; /* t has 20 trailing zeros */
w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f));
u = ivln2_h*t; /* ivln2_h has 16 sig. bits */
v = t*ivln2_l-w*ivln2;
t1 = u+v;
GEN_OCL_GET_FLOAT_WORD(is,t1);
GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
t2 = v-(t1-u);
}
/* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
GEN_OCL_GET_FLOAT_WORD(is,y);
GEN_OCL_SET_FLOAT_WORD(y1,is&0xffffe000);
p_l = mad((y-y1), t1, y*t2);
p_h = y1*t1;
z = mad(y1, t1, p_l);
if(fabs(z) >= 128.0f)
{
if(!bRet)
{
retVal = (fabs(z) > 150.0f)? 0.0:retVal;
retVal = (z > 128.0f) ? sn*INFINITY:retVal;
retVal = (z == 128.0f && p_l+ovt>z-p_h)? sn*INFINITY:retVal;
retVal = (z == -150.0f && p_l<=z-p_h)? 0.0:retVal;
bRet = ((fabs(z) > 150.0f) || (z == -150.0f && p_l<=z-p_h) || retVal == sn*INFINITY)? true:false;
}
}
z = exp2(p_l)*exp2(p_h);
return bRet ? retVal:sn*z;
}
OVERLOADABLE float tgamma (float x)
{
/* based on glibc __ieee754_gammaf_r by Ulrich Drepper */
unsigned int hx;
GEN_OCL_GET_FLOAT_WORD(hx,x);
if (hx == 0xff800000)
{
/* x == -Inf. According to ISO this is NaN. */
return NAN;
}
if ((hx & 0x7f800000) == 0x7f800000)
{
/* Positive infinity (return positive infinity) or NaN (return
NaN). */
return x;
}
if (x < 0.0f && __gen_ocl_internal_floor (x) == x)
{
/* integer x < 0 */
return NAN;
}
if (x >= 36.0f)
{
/* Overflow. */
return INFINITY;
}
else if (x <= 0.0f && x >= -FLT_EPSILON / 4.0f)
{
return 1.0f / x;
}
else
{
float sinpix = __gen_ocl_internal_sinpi(x);
if (x <= -42.0f)
/* Underflow. */
{return 0.0f * sinpix /*for sign*/;}
int exp2_adj = 0;
float x_abs = __gen_ocl_fabs(x);
float gam0;
if (x_abs < 4.0f) {
/* gamma = exp(lgamma) is only accurate for small lgamma */
float prod,x_adj;
if (x_abs < 0.5f) {
prod = 1.0f / x_abs;
x_adj = x_abs + 1.0f;
} else if (x_abs <= 1.5f) {
prod = 1.0f;
x_adj = x_abs;
} else if (x_abs < 2.5f) {
x_adj = x_abs - 1.0f;
prod = x_adj;
} else {
x_adj = x_abs - 2.0f;
prod = x_adj * (x_abs - 1.0f);
}
gam0 = __gen_ocl_internal_exp (lgamma (x_adj)) * prod;
}
else {
/* Compute gamma (X) using Stirling's approximation,
starting by computing pow (X, X) with a power of 2
factored out to avoid intermediate overflow. */
float x_int = __gen_ocl_internal_round (x_abs);
float x_frac = x_abs - x_int;
int x_log2;
float x_mant;
bool skip = false;
if (isnan(x_abs) || isinf(x_abs)) {
x_log2 = 0;
x_mant = x_abs;
skip = true;
}
uint u = as_uint(x_abs);
uint a = u & 0x7FFFFFFFu;
if (a == 0) {
x_log2 = 0;
x_mant = x_abs;
skip = true;
}
if (a >= 0x800000 && !skip) {
x_log2 = (a >> 23) - 126;
x_mant = as_float((u & (0x807FFFFFu)) | 0x3F000000);
skip = true;
}
int e = -126;
if(!skip)
{
while (a < 0x400000) {
e --;
a <<= 1;
}
a <<= 1;
x_log2 = e;
x_mant = as_float((a & (0x807FFFFFu)) | (u & 0x80000000u) | 0x3F000000);
}
if (x_mant < M_SQRT1_2_F)
{
x_log2--;
x_mant *= 2.0f;
}
exp2_adj = x_log2 * (int) x_int;
float ret = (__gen_ocl_internal_pow(x_mant, x_abs)
* exp2 (x_log2 * x_frac)
* __gen_ocl_internal_exp (-x_abs)
* sqrt (2.0f * M_PI_F / x_abs) );
float x2 = x_abs * x_abs;
float bsum = (0x3.403404p-12f / x2 -0xb.60b61p-12f) / x2 + 0x1.555556p-4f;
gam0 = ret + ret * __gen_ocl_internal_expm1 (bsum / x_abs);
}
if (x > 0.0f) {return __gen_ocl_internal_ldexp (gam0, exp2_adj);}
float gam1 = M_PI_F / (-x * sinpix * gam0);
return __gen_ocl_internal_ldexp (gam1, -exp2_adj);
}
}
#undef BODY
float __gen_ocl_internal_pown(float x, int y) {
const float
bp[] = {1.0, 1.5,},
dp_h[] = { 0.0, 5.84960938e-01,}, /* 0x3f15c000 */
dp_l[] = { 0.0, 1.56322085e-06,}, /* 0x35d1cfdc */
zero = 0.0,
one = 1.0,
two = 2.0,
two24 = 16777216.0, /* 0x4b800000 */
huge = 1.0e30,
tiny = 1.0e-30,
/* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
L1 = 6.0000002384e-01, /* 0x3f19999a */
L2 = 4.2857143283e-01, /* 0x3edb6db7 */
P1 = 1.6666667163e-01, /* 0x3e2aaaab */
P2 = -2.7777778450e-03, /* 0xbb360b61 */
lg2 = 6.9314718246e-01, /* 0x3f317218 */
lg2_h = 0x1.62ep-1,
lg2_l = 0x1.0bfbe8p-15,
ovt = 4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
cp = 9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
cp_h = 9.6179199219e-01, /* 0x3f763800 =head of cp */
cp_l = 4.7017383622e-06, /* 0x369dc3a0 =tail of cp_h */
ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
ivln2_h = 1.4426879883e+00, /* 0x3fb8aa00 =16b 1/ln2*/
ivln2_l = 7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
float z,ax,z_h,z_l,p_h,p_l;
float y1,t1,t2,r,s,t,u,v,w;
int i,j,k,yisint,n;
int hx,ix,iy,is;
GEN_OCL_GET_FLOAT_WORD(hx,x);
ix = hx&0x7fffffff;
iy = y > 0 ? y&0x7fffffff : (-y)&0x7fffffff;
/* y==zero: x**0 = 1 */
if(y==0) return one;
/* +-NaN return NAN */
if(ix > 0x7f800000)
return NAN;
/* determine if y is an odd int
* yisint = 1 ... y is an odd int
* yisint = 2 ... y is an even int
*/
yisint = y&1 ? 1 : 2;
if (y == 1) return x;
if (y == -1) return one/x;
if (y == 2) return x*x;
ax = __gen_ocl_fabs(x);
/* special value of x */
if(ix==0x7f800000||ix==0||ix==0x3f800000){
z = ax; /*x is +-0,+-inf,+-1*/
if(y<0) z = one/z; /* z = (1/|x|) */
if(hx<0) {
if(yisint==1)
z = -z; /* (x<0)**odd = -(|x|**odd) */
}
return z;
}
float sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
if(((((unsigned)hx>>31)-1)|(yisint-1))==0)
sn = -one; /* (-ve)**(odd int) */
/* |y| is huge */
if(iy>0x08000000) { /* if |y| > 2**27 */
/* over/underflow if x is not close to one */
if(ix<0x3f7ffff8) return (y<0)? sn*huge*huge:tiny*tiny;
if(ix>0x3f800007) return (y>0)? sn*huge*huge:tiny*tiny;
/* now |1-x| is tiny <= 2**-20, suffice to compute
log(x) by x-x^2/2+x^3/3-x^4/4 */
t = ax-1; /* t has 20 trailing zeros */
w = (t*t)*((float)0.5-t*((float)0.333333333333-t*(float)0.25));
u = ivln2_h*t; /* ivln2_h has 16 sig. bits */
v = t*ivln2_l-w*ivln2;
t1 = u+v;
GEN_OCL_GET_FLOAT_WORD(is,t1);
GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
t2 = v-(t1-u);
} else {
float s2,s_h,s_l,t_h,t_l;
n = 0;
/* take care subnormal number */
// if(ix<0x00800000)
// {ax *= two24; n -= 24; GEN_OCL_GET_FLOAT_WORD(ix,ax); }
n += ((ix)>>23)-0x7f;
j = ix&0x007fffff;
/* determine interval */
ix = j|0x3f800000; /* normalize ix */
if(j<=0x1cc471) k=0; /* |x|>1)|0x20000000)+0x00400000+(k<<21)) &0xfffff000);
t_l = ax - (t_h-bp[k]);
s_l = v*((u-s_h*t_h)-s_h*t_l);
/* compute log(ax) */
s2 = s*s;
r = s2*s2*(L1+s2*L2);
r += s_l*(s_h+s);
s2 = s_h*s_h;
t_h = (float)3.0+s2+r;
GEN_OCL_GET_FLOAT_WORD(is,t_h);
GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
t_l = r-((t_h-(float)3.0)-s2);
/* u+v = s*(1+...) */
u = s_h*t_h;
v = s_l*t_h+t_l*s;
/* 2/(3log2)*(s+...) */
p_h = u+v;
GEN_OCL_GET_FLOAT_WORD(is,p_h);
GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
p_l = v-(p_h-u);
z_h = cp_h*p_h; /* cp_h+cp_l = 2/(3*log2) */
z_l = cp_l*p_h+p_l*cp+dp_l[k];
/* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
t = (float)n;
t1 = (((z_h+z_l)+dp_h[k])+t);
GEN_OCL_GET_FLOAT_WORD(is,t1);
GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
t2 = z_l-(((t1-t)-dp_h[k])-z_h);
}
/* split up y into y1+y2+y3 and compute (y1+y2+y3)*(t1+t2) */
float fy = (float)y;
float y3 = (float)(y-(int)fy);
GEN_OCL_GET_FLOAT_WORD(is,fy);
GEN_OCL_SET_FLOAT_WORD(y1,is&0xfffff000);
p_l = (fy-y1)*t1 + y3*t1 + fy*t2 + y3*t2;
p_h = y1*t1;
z = p_l+p_h;
GEN_OCL_GET_FLOAT_WORD(j,z);
if (j>0x43000000) /* if z > 128 */
return sn*huge*huge; /* overflow */
else if (j==0x43000000) { /* if z == 128 */
if(p_l+ovt>z-p_h) return sn*huge*huge; /* overflow */
}
else if ((j&0x7fffffff)>0x43160000) /* z <= -150 */
return sn*tiny*tiny; /* underflow */
else if (j==0xc3160000){ /* z == -150 */
if(p_l<=z-p_h) return sn*tiny*tiny; /* underflow */
}
/*
* compute 2**(p_h+p_l)
*/
i = j&0x7fffffff;
k = (i>>23)-0x7f;
n = 0;
if(i>0x3f000000) { /* if |z| > 0.5, set n = [z+0.5] */
n = j+(0x00800000>>(k+1));
k = ((n&0x7fffffff)>>23)-0x7f; /* new k for n */
GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
n = ((n&0x007fffff)|0x00800000)>>(23-k);
if(j<0) n = -n;
p_h -= t;
z -= n;
}
t = z;
GEN_OCL_GET_FLOAT_WORD(is,t);
GEN_OCL_SET_FLOAT_WORD(t,is&0xfffff000);
u = t*lg2_h;
v = (p_l-(t-p_h))*lg2+t*lg2_l;
z = u+v;
w = v-(z-u);
t = z*z;
t1 = z - t*(P1+t*P2);
r = (z*t1)/(t1-two)-(w+z*w);
z = one-(r-z);
GEN_OCL_GET_FLOAT_WORD(j,z);
j += (n<<23);
if((j>>23)<=0) z = __gen_ocl_scalbnf(z,n); /* subnormal output */
else GEN_OCL_SET_FLOAT_WORD(z,j);
return sn*z;
}
#define BODY \
if (isnan(x) || isinf(x)) { \
*exp = 0; \
return x; \
} \
uint u = as_uint(x); \
uint a = u & 0x7FFFFFFFu; \
if (a == 0) { \
*exp = 0; \
return x; \
} \
if (a >= 0x800000) { \
*exp = (a >> 23) - 126; \
return as_float((u & (0x807FFFFFu)) | 0x3F000000); \
} \
int e = -126; \
while (a < 0x400000) { \
e --; \
a <<= 1; \
} \
a <<= 1; \
*exp = e; \
return as_float((a & (0x807FFFFFu)) | (u & 0x80000000u) | 0x3F000000);
float __gen_ocl_internal_frexp(float x, int *exp) { BODY; }
float __fast_scalbnf(float x, int n) {
/* copy from fdlibm */
float two25 = 3.355443200e+07, /* 0x4c000000 */
twom25 = 2.9802322388e-08, /* 0x33000000 */
huge = 1.0e+30, tiny = 1.0e-30;
int k, ix, t, tmp;
float retVal;
GEN_OCL_GET_FLOAT_WORD(ix, x);
k = (ix & 0x7f800000) >> 23; /* extract exponent */
t = k;
k = k + n;
tmp = (ix & 0x807fffff);
x = as_float(tmp | (k << 23));
retVal = (k > 0) ? x : 0.0f;
retVal = (k > 0xfe) ? INFINITY : retVal;
retVal = (k <= -25) ? 0.0f : retVal;
x = as_float(tmp | ((k + 25) << 23));
retVal = ((k > 0) && (k <= 25)) ? x * twom25 : retVal;
retVal = (t == 0) ? 0.0f : retVal;
return retVal;
}
OVERLOADABLE float hypot(float x, float y) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_hypot(x, y);
float a, b, an, bn, cn, retVal;
int e;
if (isfinite (x) && isfinite (y)){ /* Determine absolute values. */
x = __gen_ocl_fabs (x);
y = __gen_ocl_fabs (y);
/* Find the bigger and the smaller one. */
a = max(x,y);
b = min(x,y);
bool skip = false;
uint u = as_uint(a);
uint x = u;
if (x == 0) {
e = 0;
an = x;
skip = true;
}
if (x >= 0x800000) {
e = (x >> 23) - 126;
an = as_float((u & (0x807FFFFFu)) | 0x3F000000);
skip = true;
}
if (!skip) {
int msbOne = clz(x);
x <<= (msbOne - 8);
e = -117 - msbOne;
an = as_float((x & (0x807FFFFFu)) | 0x3F000000);
}
bn = __fast_scalbnf(b, -e);
/* Through the normalization, no unneeded overflow or underflow will occur
* here. */
cn = __gen_ocl_sqrt(mad(an, an, bn * bn));
retVal = __fast_scalbnf(cn, e);
} else {
retVal = NAN; /* x or y is NaN. Return NaN. */
retVal = (isinf(x) || isinf(y))
? INFINITY
: retVal; /* x or y is infinite. Return +Infinity. */
}
return retVal;
}
OVERLOADABLE float powr(float x, float y) {
unsigned int hx, sx, hy, sy;
if (__ocl_math_fastpath_flag)
return __gen_ocl_pow(x,y);
else {
if (isnan(x) || isnan(y)) return NAN;
GEN_OCL_GET_FLOAT_WORD(hx,x);
GEN_OCL_GET_FLOAT_WORD(hy,y);
sx = (hx & 0x80000000) >> 31;
sy = (hy & 0x80000000) >> 31;
if ((hx&0x7fffffff) < 0x00800000) { /* x < 2**-126 */
x = 0.0f;/* Gen does not support subnormal number now */
hx = hx &0x80000000;
}
if ((hy&0x7fffffff) < 0x00800000) { /* y < 2**-126 */
y = 0.0;/* Gen does not support subnormal number now */
hy = hy &0x80000000;
}
// (x < 0) ** y = NAN (y!=0)
if ((sx && (hx & 0x7fffffff))) return NAN;
// +/-0 ** +/-0 = NAN
if ( !(hx&0x7fffffff) && !(hy&0x7fffffff)) return NAN;
// +inf ** +/-0 = NAN
if ( ((hx & 0x7f800000) ==0x7f800000) && !(hy&0x7fffffff)) return NAN;
// others except nan/inf/0 ** 0 = 1.0
if (!(hy&0x7fffffff)) return 1.0f;
// +1 ** inf = NAN; +1 ** finite = 1;
if (hx == 0x3f800000) {
return isinf(y) ? NAN : 1.0f;
}
if ( !(hx & 0x7fffffff)) {
// +/-0 ** y<0 = +inf
// +/-0 ** y>0 = +0
return sy ? INFINITY : 0.0f;
}
return __gen_ocl_internal_pow(x,y);
}
}
OVERLOADABLE float pown(float x, int n) {
if (__ocl_math_fastpath_flag) {
if (x == 0.f && n == 0)
return 1.f;
if (x < 0.f && (n&1) )
return -powr(-x, n);
return powr(x, n);
} else {
int ix;
GEN_OCL_GET_FLOAT_WORD(ix, x);
float sign = ix < 0 ? -1.0f : 1.0f;
if (x == 0.0f) x = sign * 0.0f;
return __gen_ocl_internal_pown(x, n);
}
}
OVERLOADABLE float pow(float x, float y) {
if (!__ocl_math_fastpath_flag)
return __gen_ocl_internal_pow(x,y);
else {
int n;
if (x == 0.f && y == 0.f)
return 1.f;
if (x >= 0.f)
return powr(x, y);
n = y;
if ((float)n == y)//is exact integer
return pown(x, n);
return NAN;
}
}
OVERLOADABLE float rootn(float x, int n) {
float ax,re;
int sign = 0;
int hx;
if( n == 0 )return NAN;
GEN_OCL_GET_FLOAT_WORD(hx, x);
// Gen does not support denorm, flush to zero
if ((hx & 0x7fffffff) < 0x00800000) {
x = hx < 0 ? -0.0f : 0.0f;
}
//rootn ( x, n ) returns a NaN for x < 0 and n is even.
if( x < 0 && 0 == (n&1) )
return NAN;
if( x == 0.0 ){
switch( n & 0x80000001 ){
//rootn ( +-0, n ) is +0 for even n > 0.
case 0:
return 0.0f;
//rootn ( +-0, n ) is +-0 for odd n > 0.
case 1:
return x;
//rootn ( +-0, n ) is +inf for even n < 0.
case 0x80000000:
return INFINITY;
//rootn ( +-0, n ) is +-inf for odd n < 0.
case 0x80000001:
return __gen_ocl_internal_copysign(INFINITY, x);
}
}
ax = __gen_ocl_fabs(x);
if(x <0.0f && (n&1))
sign = 1;
if (__ocl_math_fastpath_flag)
re = __gen_ocl_pow(ax, 1.f/n);
else
re = __gen_ocl_internal_pow(ax,1.f/n);
if(sign)
re = -re;
return re;
}
OVERLOADABLE float fabs(float x) {
return __gen_ocl_internal_fabs(x);
}
OVERLOADABLE float trunc(float x) {
return __gen_ocl_internal_trunc(x);
}
OVERLOADABLE float round(float x) {
return __gen_ocl_internal_round(x);
}
OVERLOADABLE float floor(float x) {
return __gen_ocl_internal_floor(x);
}
OVERLOADABLE float ceil(float x) {
return __gen_ocl_internal_ceil(x);
}
OVERLOADABLE float log(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_log(x);
/* Use native instruction when it has enough precision */
if((x > 0x1.1p0) || (x <= 0))
return __gen_ocl_internal_fastpath_log(x);
return __gen_ocl_internal_log(x);
}
OVERLOADABLE float log2(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_log2(x);
/* Use native instruction when it has enough precision */
if((x > 0x1.1p0) || (x <= 0))
return __gen_ocl_internal_fastpath_log2(x);
return __gen_ocl_internal_log2(x);
}
OVERLOADABLE float log10(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_log10(x);
/* Use native instruction when it has enough precision */
if((x > 0x1.1p0) || (x <= 0))
return __gen_ocl_internal_fastpath_log10(x);
return __gen_ocl_internal_log10(x);
}
OVERLOADABLE float exp(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_exp(x);
/* Use native instruction when it has enough precision */
if (fabs(x) < 0x1.6p1)
return __gen_ocl_internal_fastpath_exp(x);
return __gen_ocl_internal_simple_exp(x);
}
OVERLOADABLE float exp2(float x) {
/* Use native instruction when it has enough precision, exp2 always */
return native_exp2(x);
}
OVERLOADABLE float exp10(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_exp10(x);
return __gen_ocl_internal_exp10(x);
}
OVERLOADABLE float expm1(float x) {
if (__ocl_math_fastpath_flag)
return __gen_ocl_internal_fastpath_expm1(x);
return __gen_ocl_internal_expm1(x);
}
OVERLOADABLE float fmin(float a, float b) {
return __gen_ocl_internal_fmin(a, b);
}
OVERLOADABLE float fmax(float a, float b) {
return __gen_ocl_internal_fmax(a, b);
}
OVERLOADABLE float fma(float a, float b, float c) {
return mad(a, b, c);
}
OVERLOADABLE float fdim(float x, float y) {
return __gen_ocl_internal_fdim(x, y);
}
OVERLOADABLE float maxmag(float x, float y) {
return __gen_ocl_internal_maxmag(x, y);
}
OVERLOADABLE float minmag(float x, float y) {
return __gen_ocl_internal_minmag(x, y);
}
OVERLOADABLE float nextafter(float x, float y) {
int hx, hy, ix, iy;
hx = as_int(x);
hy = as_int(y);
ix = hx & 0x7fffffff;
iy = hy & 0x7fffffff;
if(ix == 0)
ix = hx & 0x7fffff;
if(iy == 0)
iy = hy & 0x7fffff;
if(ix>0x7f800000 || iy>0x7f800000)
return x+y;
if(hx == hy)
return y;
if(ix == 0) {
if(iy == 0)
return y;
else
return as_float((hy&0x80000000) | 1);
}
if(hx >= 0) {
if(hx > hy) {
hx -= 1;
} else {
hx += 1;
}
} else {
if(hy >= 0 || hx > hy){
hx -= 1;
} else {
hx += 1;
}
}
return as_float(hx);
}
/* So far, the HW do not support half float math function.
We just do the conversion and call the float version here. */
OVERLOADABLE half cospi(half x) {
float _x = (float)x;
return (half)cospi(_x);
}
OVERLOADABLE half cosh(half x) {
float _x = (float)x;
return (half)cosh(_x);
}
OVERLOADABLE half acos(half x) {
float _x = (float)x;
return (half)acos(_x);
}
OVERLOADABLE float half_cos(float x) {
return (float)cos(x);
}
OVERLOADABLE float half_divide(float x, float y) {
return (float)native_divide(x, y);
}
OVERLOADABLE float half_exp(float x) {
return (float)native_exp(x);
}
OVERLOADABLE float half_exp2(float x){
return (float)native_exp2(x);
}
OVERLOADABLE float half_exp10(float x){
return (float)native_exp10(x);
}
OVERLOADABLE float half_log(float x){
return (float)native_log(x);
}
OVERLOADABLE float half_log2(float x){
return (float)native_log2(x);
}
OVERLOADABLE float half_log10(float x){
return (float)native_log10(x);
}
OVERLOADABLE float half_powr(float x, float y){
return (float)powr(x, y);
}
OVERLOADABLE float half_recip(float x){
return (float)native_recip(x);
}
OVERLOADABLE float half_rsqrt(float x){
return (float)native_rsqrt(x);
}
OVERLOADABLE float half_sin(float x){
return (float)sin(x);
}
OVERLOADABLE float half_sqrt(float x){
return (float)native_sqrt(x);
}
OVERLOADABLE float half_tan(float x){
return (float)tan(x);
}
OVERLOADABLE half acospi(half x) {
float _x = (float)x;
return (half)acospi(_x);
}
OVERLOADABLE half acosh(half x) {
float _x = (float)x;
return (half)acosh(_x);
}
OVERLOADABLE half sinpi(half x) {
float _x = (float)x;
return (half)sinpi(_x);
}
OVERLOADABLE half sinh(half x) {
float _x = (float)x;
return (half)sinh(_x);
}
OVERLOADABLE half asin(half x) {
float _x = (float)x;
return (half)asin(_x);
}
OVERLOADABLE half asinpi(half x) {
float _x = (float)x;
return (half)asinpi(_x);
}
OVERLOADABLE half asinh(half x) {
float _x = (float)x;
return (half)asinh(_x);
}
OVERLOADABLE half tanpi(half x) {
float _x = (float)x;
return (half)tanpi(_x);
}
OVERLOADABLE half tanh(half x) {
float _x = (float)x;
return (half)tanh(_x);
}
OVERLOADABLE half atan(half x) {
float _x = (float)x;
return (half)atan(_x);
}
OVERLOADABLE half atan2(half y, half x) {
float _x = (float)x;
float _y = (float)y;
return (half)atan2(_x, _y);
}
OVERLOADABLE half atan2pi(half y, half x) {
float _x = (float)x;
float _y = (float)y;
return (half)atan2pi(_x, _y);
}
OVERLOADABLE half atanpi(half x) {
float _x = (float)x;
return (half)atanpi(_x);
}
OVERLOADABLE half atanh(half x) {
float _x = (float)x;
return (half)atanh(_x);
}
OVERLOADABLE half cbrt(half x) {
float _x = (float)x;
return (half)cbrt(_x);
}
OVERLOADABLE half rint(half x) {
float _x = (float)x;
return (half)rint(_x);
}
OVERLOADABLE half copysign(half x, half y) {
float _x = (float)x;
float _y = (float)y;
return (half)copysign(_x, _y);
}
OVERLOADABLE half erf(half x) {
float _x = (float)x;
return (half)erf(_x);
}
OVERLOADABLE half erfc(half x) {
float _x = (float)x;
return (half)erfc(_x);
}
OVERLOADABLE half fmod(half x, half y) {
float _x = (float)x;
float _y = (float)y;
return (half)fmod(_x, _y);
}
OVERLOADABLE half remainder(half x, half p) {
float _x = (float)x;
float _p = (float)p;
return (half)remainder(_x, _p);
}
OVERLOADABLE half ldexp(half x, int n) {
float _x = (float)x;
return (half)ldexp(_x, n);
}
OVERLOADABLE half powr(half x, half y) {
float _x = (float)x;
float _y = (float)y;
return (half)powr(_x, _y);
}
OVERLOADABLE half pow(half x, half y) {
float _x = (float)x;
float _y = (float)y;
return (half)pow(_x, _y);
}
//no pow, we use powr instead
OVERLOADABLE half fabs(half x) {
float _x = (float)x;
return (half)fabs(_x);
}
OVERLOADABLE half trunc(half x) {
float _x = (float)x;
return (half)trunc(_x);
}
OVERLOADABLE half round(half x) {
float _x = (float)x;
return (half)round(_x);
}
OVERLOADABLE half floor(half x) {
float _x = (float)x;
return (half)floor(_x);
}
OVERLOADABLE half ceil(half x) {
float _x = (float)x;
return (half)ceil(_x);
}
OVERLOADABLE half log(half x) {
float _x = (float)x;
return (half)log(_x);
}
OVERLOADABLE half log2(half x) {
float _x = (float)x;
return (half)log2(_x);
}
OVERLOADABLE half log10(half x) {
float _x = (float)x;
return (half)log10(_x);
}
OVERLOADABLE half exp(half x) {
float _x = (float)x;
return (half)exp(_x);
}
OVERLOADABLE half exp10(half x) {
float _x = (float)x;
return (half)exp10(_x);
}
OVERLOADABLE half expm1(half x) {
float _x = (float)x;
return (half)expm1(_x);
}
OVERLOADABLE half fmin(half a, half b) {
return __gen_ocl_internal_fmin(a, b);
}
OVERLOADABLE half fmax(half a, half b) {
return __gen_ocl_internal_fmax(a, b);
}
OVERLOADABLE half fma(half a, half b, half c) {
float _a = (float)a;
float _b = (float)b;
float _c = (float)c;
return (half)fma(_a, _b, _c);
}
OVERLOADABLE half fdim(half x, half y) {
float _x = (float)x;
float _y = (float)y;
return (half)fdim(_x, _y);
}
OVERLOADABLE half maxmag(half x, half y) {
float _x = (float)x;
float _y = (float)y;
return (half)maxmag(_x, _y);
}
OVERLOADABLE half minmag(half x, half y) {
float _x = (float)x;
float _y = (float)y;
return (half)minmag(_x, _y);
}
OVERLOADABLE half exp2(half x) {
float _x = (float)x;
return (half)exp2(_x);
}
OVERLOADABLE half mad(half a, half b, half c) {
return __gen_ocl_mad(a,b,c);
}
OVERLOADABLE half sin(half x) {
float _x = (float)x;
return (half)sin(_x);
}
OVERLOADABLE half cos(half x) {
float _x = (float)x;
return (half)cos(_x);
}
OVERLOADABLE half tan(half x) {
float _x = (float)x;
return (half)tan(_x);
}
OVERLOADABLE half tgamma(half x) {
float _x = (float)x;
return (half)tgamma(_x);
}
OVERLOADABLE half lgamma(half x) {
float _x = (float)x;
return (half)lgamma(_x);
}
OVERLOADABLE half log1p(half x) {
float _x = (float)x;
return (half)log1p(_x);
}
OVERLOADABLE half logb(half x) {
float _x = (float)x;
return (half)logb(_x);
}
OVERLOADABLE int ilogb(half x) {
float _x = (float)x;
return ilogb(_x);
}
OVERLOADABLE half nan(ushort code) {
return (half)NAN;
}
OVERLOADABLE half sqrt(half x) {
float _x = (float)x;
return (half)sqrt(_x);
}
OVERLOADABLE half rsqrt(half x) {
float _x = (float)x;
return (half)rsqrt(_x);
}
OVERLOADABLE half nextafter(half x, half y) {
float _x = (float)x;
float _y = (float)y;
return (half)nextafter(_x, _y);
}
OVERLOADABLE half hypot(half x, half y) {
float _x = (float)x;
float _y = (float)y;
return (half)hypot(_x, _y);
}
OVERLOADABLE half pown(half x, int n) {
float _x = (float)x;
return (half)pown(_x, n);
}
OVERLOADABLE half rootn(half x, int n) {
float _x = (float)x;
return (half)rootn(_x, n);
}
INLINE int __HI(double x){
long x64 = as_long(x);
int high = convert_int((x64 >> 32) & 0xFFFFFFFF);
return high;
};
INLINE int __LO(double x){
long x64 = as_long(x);
int low = convert_int(x64 & 0xFFFFFFFFUL);
return low;
};
INLINE void __setHigh(double *x, int val){
long x64 = as_long(*x);
long high = x64 & 0x00000000FFFFFFFF;
high |= ((long)val << 32);
*x = as_double(high);
};
INLINE void __setLow(double *x, unsigned int val){
long x64 = as_long(*x);
long low = x64 & 0xFFFFFFFF00000000;
low |= (long)val;
*x = as_double(low);
};
OVERLOADABLE double acos(double x)
{
double one= 1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
pi = 3.14159265358979311600e+00, /* 0x400921FB, 0x54442D18 */
pio2_hi = 1.57079632679489655800e+00, /* 0x3FF921FB, 0x54442D18 */
pio2_lo = 6.12323399573676603587e-17, /* 0x3C91A626, 0x33145C07 */
pS0 = 1.66666666666666657415e-01, /* 0x3FC55555, 0x55555555 */
pS1 = -3.25565818622400915405e-01, /* 0xBFD4D612, 0x03EB6F7D */
pS2 = 2.01212532134862925881e-01, /* 0x3FC9C155, 0x0E884455 */
pS3 = -4.00555345006794114027e-02, /* 0xBFA48228, 0xB5688F3B */
pS4 = 7.91534994289814532176e-04, /* 0x3F49EFE0, 0x7501B288 */
pS5 = 3.47933107596021167570e-05, /* 0x3F023DE1, 0x0DFDF709 */
qS1 = -2.40339491173441421878e+00, /* 0xC0033A27, 0x1C8A2D4B */
qS2 = 2.02094576023350569471e+00, /* 0x40002AE5, 0x9C598AC8 */
qS3 = -6.88283971605453293030e-01, /* 0xBFE6066C, 0x1B8D0159 */
qS4 = 7.70381505559019352791e-02; /* 0x3FB3B8C5, 0xB12E9282 */
double z,p,q,r,w,s,c,df;
int hx,ix;
hx = __HI(x);
ix = hx&0x7fffffff;
if(ix>=0x3ff00000) { /* |x| >= 1 */
if(((ix-0x3ff00000)|__LO(x))==0) { /* |x|==1 */
if(hx>0) return 0.0; /* acos(1) = 0 */
else return pi+2.0*pio2_lo; /* acos(-1)= pi */
}
return (x-x)/(x-x); /* acos(|x|>1) is NaN */
}
if(ix<0x3fe00000) { /* |x| < 0.5 */
if(ix<=0x3c600000) return pio2_hi+pio2_lo;/*if|x|<2**-57*/
z = x*x;
p = z*(pS0+z*(pS1+z*(pS2+z*(pS3+z*(pS4+z*pS5)))));
q = one+z*(qS1+z*(qS2+z*(qS3+z*qS4)));
r = p/q;
return pio2_hi - (x - (pio2_lo-x*r));
} else if (hx<0) { /* x < -0.5 */
z = (one+x)*0.5;
p = z*(pS0+z*(pS1+z*(pS2+z*(pS3+z*(pS4+z*pS5)))));
q = one+z*(qS1+z*(qS2+z*(qS3+z*qS4)));
s = sqrt(z);
r = p/q;
w = r*s-pio2_lo;
return pi - 2.0*(s+w);
} else { /* x > 0.5 */
z = (one-x)*0.5;
s = sqrt(z);
df = s;
__setLow(&df, 0);
c = (z-df*df)/(s+df);
p = z*(pS0+z*(pS1+z*(pS2+z*(pS3+z*(pS4+z*pS5)))));
q = one+z*(qS1+z*(qS2+z*(qS3+z*qS4)));
r = p/q;
w = r*s+c;
return 2.0*(df+w);
}
}
OVERLOADABLE double acospi(double x)
{
return acos(x)/M_PI;
}
OVERLOADABLE double acosh(double x)
{
double one = 1.0,
ln2 = 6.93147180559945286227e-01; /* 0x3FE62E42, 0xFEFA39EF */
double t;
int hx;
hx = __HI(x);
if(hx<0x3ff00000) { /* x < 1 */
return (x-x)/(x-x);
} else if(hx >=0x41b00000) { /* x > 2**28 */
if(hx >=0x7ff00000) { /* x is inf of NaN */
return x+x;
} else
return log(x)+ln2; /* acosh(huge)=log(2x) */
} else if(((hx-0x3ff00000)|__LO(x))==0) {
return 0.0; /* acosh(1) = 0 */
} else if (hx > 0x40000000) { /* 2**28 > x > 2 */
t=x*x;
return log(2.0*x-one/(x+sqrt(t-one)));
} else { /* 1= 0x3ff00000) { /* |x|>= 1 */
if(((ix-0x3ff00000)|__LO(x))==0)
/* asin(1)=+-pi/2 with inexact */
return x*pio2_hi+x*pio2_lo;
return (x-x)/(x-x); /* asin(|x|>1) is NaN */
} else if (ix<0x3fe00000) { /* |x|<0.5 */
if(ix<0x3e400000) { /* if |x| < 2**-27 */
if(huge+x>one) return x;/* return x with inexact if x!=0*/
} else
t = x*x;
p = t*(pS0+t*(pS1+t*(pS2+t*(pS3+t*(pS4+t*pS5)))));
q = one+t*(qS1+t*(qS2+t*(qS3+t*qS4)));
w = p/q;
return x+x*w;
}
/* 1> |x|>= 0.5 */
w = one-fabs(x);
t = w*0.5;
p = t*(pS0+t*(pS1+t*(pS2+t*(pS3+t*(pS4+t*pS5)))));
q = one+t*(qS1+t*(qS2+t*(qS3+t*qS4)));
s = sqrt(t);
if(ix>=0x3FEF3333) { /* if |x| > 0.975 */
w = p/q;
t = pio2_hi-(2.0*(s+s*w)-pio2_lo);
} else {
w = s;
__setLow(&w, 0);
c = (t-w*w)/(s+w);
r = p/q;
p = 2.0*s*r-(pio2_lo-2.0*c);
q = pio4_hi-2.0*w;
t = pio4_hi-(p-q);
}
if(hx>0) return t; else return -t;
}
OVERLOADABLE double asinpi(double x)
{
return asin(x)/M_PI;
}
OVERLOADABLE double asinh(double x)
{
double one = 1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
ln2 = 6.93147180559945286227e-01, /* 0x3FE62E42, 0xFEFA39EF */
huge= 1.00000000000000000000e+300;
double t,w;
int hx,ix;
hx = __HI(x);
ix = hx&0x7fffffff;
if(ix>=0x7ff00000) return x+x; /* x is inf or NaN */
if(ix< 0x3e300000) { /* |x|<2**-28 */
if(huge+x>one) return x; /* return x inexact except 0 */
}
if(ix>0x41b00000) { /* |x| > 2**28 */
w = log(fabs(x))+ln2;
} else if (ix>0x40000000) { /* 2**28 > |x| > 2.0 */
t = fabs(x);
w = log(2.0*t+one/(sqrt(x*x+one)+t));
} else { /* 2.0 > |x| > 2**-28 */
t = x*x;
w =log1p(fabs(x)+t/(one+sqrt(one+t)));
}
if(hx>0) return w; else return -w;
}
OVERLOADABLE double atan(double x)
{
double atanhi[] = {
4.63647609000806093515e-01, /* atan(0.5)hi 0x3FDDAC67, 0x0561BB4F */
7.85398163397448278999e-01, /* atan(1.0)hi 0x3FE921FB, 0x54442D18 */
9.82793723247329054082e-01, /* atan(1.5)hi 0x3FEF730B, 0xD281F69B */
1.57079632679489655800e+00, /* atan(inf)hi 0x3FF921FB, 0x54442D18 */
};
double atanlo[] = {
2.26987774529616870924e-17, /* atan(0.5)lo 0x3C7A2B7F, 0x222F65E2 */
3.06161699786838301793e-17, /* atan(1.0)lo 0x3C81A626, 0x33145C07 */
1.39033110312309984516e-17, /* atan(1.5)lo 0x3C700788, 0x7AF0CBBD */
6.12323399573676603587e-17, /* atan(inf)lo 0x3C91A626, 0x33145C07 */
};
double aT[] = {
3.33333333333329318027e-01, /* 0x3FD55555, 0x5555550D */
-1.99999999998764832476e-01, /* 0xBFC99999, 0x9998EBC4 */
1.42857142725034663711e-01, /* 0x3FC24924, 0x920083FF */
-1.11111104054623557880e-01, /* 0xBFBC71C6, 0xFE231671 */
9.09088713343650656196e-02, /* 0x3FB745CD, 0xC54C206E */
-7.69187620504482999495e-02, /* 0xBFB3B0F2, 0xAF749A6D */
6.66107313738753120669e-02, /* 0x3FB10D66, 0xA0D03D51 */
-5.83357013379057348645e-02, /* 0xBFADDE2D, 0x52DEFD9A */
4.97687799461593236017e-02, /* 0x3FA97B4B, 0x24760DEB */
-3.65315727442169155270e-02, /* 0xBFA2B444, 0x2C6A6C2F */
1.62858201153657823623e-02, /* 0x3F90AD3A, 0xE322DA11 */
};
double one = 1.0,
huge = 1.0e300;
double w,s1,s2,z;
int ix,hx,id;
hx = __HI(x);
ix = hx&0x7fffffff;
if(ix>=0x44100000) { /* if |x| >= 2^66 */
if(ix>0x7ff00000 ||(ix==0x7ff00000 && (__LO(x)!=0)))
return x+x; /* NaN */
if(hx>0) return atanhi[3]+atanlo[3];
else return -atanhi[3]-atanlo[3];
} if (ix < 0x3fdc0000) { /* |x| < 0.4375 */
if (ix < 0x3e200000) { /* |x| < 2^-29 */
if(huge+x>one) return x; /* raise inexact */
}
id = -1;
} else {
x = fabs(x);
if (ix < 0x3ff30000) { /* |x| < 1.1875 */
if (ix < 0x3fe60000) { /* 7/16 <=|x|<11/16 */
id = 0; x = (2.0*x-one)/(2.0+x);
} else { /* 11/16<=|x|< 19/16 */
id = 1; x = (x-one)/(x+one);
}
} else {
if (ix < 0x40038000) { /* |x| < 2.4375 */
id = 2; x = (x-1.5)/(one+1.5*x);
} else { /* 2.4375 <= |x| < 2^66 */
id = 3; x = -1.0/x;
}
}}
/* end of argument reduction */
z = x*x;
w = z*z;
/* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*(aT[6]+w*(aT[8]+w*aT[10])))));
s2 = w*(aT[1]+w*(aT[3]+w*(aT[5]+w*(aT[7]+w*aT[9]))));
if (id<0) return x - x*(s1+s2);
else {
z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x);
return (hx<0)? -z:z;
}
}
OVERLOADABLE double atan2(double y, double x)
{
double tiny = 1.0e-300,
zero = 0.0,
pi_o_4 = 7.8539816339744827900E-01, /* 0x3FE921FB, 0x54442D18 */
pi_o_2 = 1.5707963267948965580E+00, /* 0x3FF921FB, 0x54442D18 */
pi = 3.1415926535897931160E+00, /* 0x400921FB, 0x54442D18 */
pi_lo = 1.2246467991473531772E-16; /* 0x3CA1A626, 0x33145C07 */
double z;
int k,m,hx,hy,ix,iy;
unsigned lx,ly;
hx = __HI(x); ix = hx&0x7fffffff;
lx = __LO(x);
hy = __HI(y); iy = hy&0x7fffffff;
ly = __LO(y);
if(((ix|((lx|-lx)>>31))>0x7ff00000)||
((iy|((ly|-ly)>>31))>0x7ff00000)) /* x or y is NaN */
return x+y;
m = ((hy>>31)&1)|((hx>>30)&2); /* 2*sign(x)+sign(y) */
/* when y = 0 */
if((iy|ly)==0) {
switch(m) {
case 0:
case 1: return y; /* atan(+-0,+anything)=+-0 */
case 2: return pi+tiny;/* atan(+0,-anything) = pi */
case 3: return -pi-tiny;/* atan(-0,-anything) =-pi */
}
}
/* when x = 0 */
if((ix|lx)==0) return (hy<0)? -pi_o_2-tiny: pi_o_2+tiny;
/* when x is INF */
if(ix==0x7ff00000)
{
if(iy==0x7ff00000)
{
switch(m)
{
case 0: return pi_o_4+tiny;/* atan(+INF,+INF) */
case 1: return -pi_o_4-tiny;/* atan(-INF,+INF) */
case 2: return 3.0*pi_o_4+tiny;/*atan(+INF,-INF)*/
case 3: return -3.0*pi_o_4-tiny;/*atan(-INF,-INF)*/
}
}
else
{
switch(m)
{
case 0: return zero ; /* atan(+...,+INF) */
case 1: return -zero; /* atan(-...,+INF) */
case 2: return pi+tiny ; /* atan(+...,-INF) */
case 3: return -pi -tiny ; /* atan(-...,-INF) */
}
}
}
/* when y is INF */
if(iy==0x7ff00000) return (hy<0)? -pi_o_2-tiny: pi_o_2+tiny;
/* compute y/x */
k = (iy-ix)>>20;
if(k > 60) z=pi_o_2+0.5*pi_lo; /* |y/x| > 2**60 */
else if(hx<0&&k<-60) z=0.0; /* |y|/x < -2**60 */
else z=atan(fabs(y/x)); /* safe to do y/x */
switch (m) {
case 0: return z ; /* atan(+,+) */
case 1: __setHigh(&z, __HI(z) ^ 0x80000000);
return z ; /* atan(-,+) */
case 2: return pi-(z-pi_lo);/* atan(+,-) */
default: /* case 3 */
return (z-pi_lo)-pi;/* atan(-,-) */
}
}
OVERLOADABLE double atanpi(double x)
{
return atan(x)/M_PI;
}
OVERLOADABLE double atan2pi(double x, double y)
{
return atan2(x, y)/M_PI;
}
OVERLOADABLE double atanh(double x)
{
double one = 1.0, huge = 1e300;
double t, zero = 0;
int hx,ix;
unsigned lx;
hx = __HI(x); /* high word */
lx = __LO(x); /* low word */
ix = hx&0x7fffffff;
if ((ix|((lx|(-lx))>>31))>0x3ff00000) /* |x|>1 */
return (x-x)/(x-x);
if(ix==0x3ff00000)
return x/zero;
if(ix<0x3e300000&&(huge+x)>zero) return x; /* x<2**-28 */
__setHigh(&x, ix); /* x <- |x| */
if(ix<0x3fe00000) { /* x < 0.5 */
t = x+x;
t = 0.5*log1p(t+t*x/(one-x));
} else
t = 0.5*log1p((x+x)/(one-x));
if(hx>=0) return t; else return -t;
return 0.0;
}
OVERLOADABLE double exp(double x)
{
double one = 1.0,
halF[2] = {0.5,-0.5,},
huge = 1.0e+300,
twom1000= 9.33263618503218878990e-302, /* 2**-1000=0x01700000,0*/
o_threshold= 7.09782712893383973096e+02, /* 0x40862E42, 0xFEFA39EF */
u_threshold= -7.45133219101941108420e+02, /* 0xc0874910, 0xD52D3051 */
ln2HI[2] ={ 6.93147180369123816490e-01, /* 0x3fe62e42, 0xfee00000 */
-6.93147180369123816490e-01,},/* 0xbfe62e42, 0xfee00000 */
ln2LO[2] ={ 1.90821492927058770002e-10, /* 0x3dea39ef, 0x35793c76 */
-1.90821492927058770002e-10,},/* 0xbdea39ef, 0x35793c76 */
invln2 = 1.44269504088896338700e+00, /* 0x3ff71547, 0x652b82fe */
P1 = 1.66666666666666019037e-01, /* 0x3FC55555, 0x5555553E */
P2 = -2.77777777770155933842e-03, /* 0xBF66C16C, 0x16BEBD93 */
P3 = 6.61375632143793436117e-05, /* 0x3F11566A, 0xAF25DE2C */
P4 = -1.65339022054652515390e-06, /* 0xBEBBBD41, 0xC5D26BF1 */
P5 = 4.13813679705723846039e-08; /* 0x3E663769, 0x72BEA4D0 */
double y,hi,lo,c,t;
int k,xsb;
unsigned hx;
hx = __HI(x); /* high word of x */
xsb = (hx>>31)&1; /* sign bit of x */
hx &= 0x7fffffff; /* high word of |x| */
/* filter out non-finite argument */
if(hx >= 0x40862E42) { /* if |x|>=709.78... */
if(hx>=0x7ff00000) {
if(((hx&0xfffff)|__LO(x))!=0)
return x+x; /* NaN */
else return (xsb==0)? x:0.0; /* exp(+-inf)={inf,0} */
}
if(x > o_threshold) return huge*huge; /* overflow */
if(x < u_threshold) return twom1000*twom1000; /* underflow */
}
/* argument reduction */
if(hx > 0x3fd62e42) { /* if |x| > 0.5 ln2 */
if(hx < 0x3FF0A2B2) { /* and |x| < 1.5 ln2 */
hi = x-ln2HI[xsb]; lo=ln2LO[xsb]; k = 1-xsb-xsb;
} else {
k = (int)(invln2*x+halF[xsb]);
t = k;
hi = x - t*ln2HI[0]; /* t*ln2HI is exact here */
lo = t*ln2LO[0];
}
x = hi - lo;
}
else if(hx < 0x3e300000) { /* when |x|<2**-28 */
if(huge+x>one) return one+x;/* trigger inexact */
}
else k = 0;
/* x is now in primary range */
t = x*x;
c = x - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
if(k==0) return one-((x*c)/(c-2.0)-x);
else y = one-((lo-(x*c)/(2.0-c))-hi);
if(k >= -1021) {
__setHigh(&y, __HI(y) + (k<<20)); /* add k to y's exponent */
return y;
} else {
__setHigh(&y, __HI(y) + ((k+1000)<<20));/* add k to y's exponent */
return y*twom1000;
}
}
OVERLOADABLE double expm1(double x)
{
double one = 1.0,
huge = 1.0e+300,
tiny = 1.0e-300,
o_threshold = 7.09782712893383973096e+02,/* 0x40862E42, 0xFEFA39EF */
ln2_hi = 6.93147180369123816490e-01,/* 0x3fe62e42, 0xfee00000 */
ln2_lo = 1.90821492927058770002e-10,/* 0x3dea39ef, 0x35793c76 */
invln2 = 1.44269504088896338700e+00,/* 0x3ff71547, 0x652b82fe */
/* scaled coefficients related to expm1 */
Q1 = -3.33333333333331316428e-02, /* BFA11111 111110F4 */
Q2 = 1.58730158725481460165e-03, /* 3F5A01A0 19FE5585 */
Q3 = -7.93650757867487942473e-05, /* BF14CE19 9EAADBB7 */
Q4 = 4.00821782732936239552e-06, /* 3ED0CFCA 86E65239 */
Q5 = -2.01099218183624371326e-07; /* BE8AFDB7 6E09C32D */
double y,hi,lo,c,t,e,hxs,hfx,r1;
int k,xsb;
unsigned hx;
hx = __HI(x); /* high word of x */
xsb = hx&0x80000000; /* sign bit of x */
if(xsb==0) y=x; else y= -x; /* y = |x| */
hx &= 0x7fffffff; /* high word of |x| */
/* filter out huge and non-finite argument */
if(hx >= 0x4043687A) { /* if |x|>=56*ln2 */
if(hx >= 0x40862E42) { /* if |x|>=709.78... */
if(hx>=0x7ff00000) {
if(((hx&0xfffff)|__LO(x))!=0)
return x+x; /* NaN */
else return (xsb==0)? x:-1.0;/* exp(+-inf)={inf,-1} */
}
if(x > o_threshold) return huge*huge; /* overflow */
}
if(xsb!=0) { /* x < -56*ln2, return -1.0 with inexact */
if(x+tiny<0.0) /* raise inexact */
return tiny-one; /* return -1 */
}
}
/* argument reduction */
if(hx > 0x3fd62e42) { /* if |x| > 0.5 ln2 */
if(hx < 0x3FF0A2B2) { /* and |x| < 1.5 ln2 */
if(xsb==0)
{hi = x - ln2_hi; lo = ln2_lo; k = 1;}
else
{hi = x + ln2_hi; lo = -ln2_lo; k = -1;}
} else {
k = invln2*x+((xsb==0)?0.5:-0.5);
t = k;
hi = x - t*ln2_hi; /* t*ln2_hi is exact here */
lo = t*ln2_lo;
}
x = hi - lo;
c = (hi-x)-lo;
}
else if(hx < 0x3c900000) { /* when |x|<2**-54, return x */
t = huge+x; /* return x with inexact flags when x!=0 */
return x - (t-(huge+x));
}
else k = 0;
/* x is now in primary range */
hfx = 0.5*x;
hxs = x*hfx;
r1 = one+hxs*(Q1+hxs*(Q2+hxs*(Q3+hxs*(Q4+hxs*Q5))));
t = 3.0-r1*hfx;
e = hxs*((r1-t)/(6.0 - x*t));
if(k==0) return x - (x*e-hxs); /* c is 0 */
else {
e = (x*(e-c)-c);
e -= hxs;
if(k== -1) return 0.5*(x-e)-0.5;
if(k==1)
{
if(x < -0.25)
return -2.0*(e-(x+0.5));
else
return one+2.0*(x-e);
}
if (k <= -2 || k>56) { /* suffice to return exp(x)-1 */
y = one-(e-x);
__setHigh(&y, __HI(y) + (k<<20));; /* add k to y's exponent */
return y-one;
}
t = one;
if(k<20) {
__setHigh(&t, 0x3ff00000 - (0x200000>>k)); /* t=1-2^-k */
y = t-(e-x);
__setHigh(&y, __HI(y) + (k<<20)); /* add k to y's exponent */
} else {
__setHigh(&t, ((0x3ff-k)<<20)); /* 2^-k */
y = x-(e+t);
y += one;
__setHigh(&y, __HI(y) + (k<<20)); /* add k to y's exponent */
}
}
return y;
}
OVERLOADABLE double exp2(double x)
{
return pow(2, x);
}
OVERLOADABLE double exp10(double x)
{
return pow(10, x);
}
OVERLOADABLE double erf(double x)
{
double erx = 8.45062911510467529297e-01, /* 0x3FEB0AC1, 0x60000000 */
efx = 1.28379167095512586316e-01, /* 0x3FC06EBA, 0x8214DB69 */
efx8= 1.02703333676410069053e+00, /* 0x3FF06EBA, 0x8214DB69 */
pp0 = 1.28379167095512558561e-01, /* 0x3FC06EBA, 0x8214DB68 */
pp1 = -3.25042107247001499370e-01, /* 0xBFD4CD7D, 0x691CB913 */
pp2 = -2.84817495755985104766e-02, /* 0xBF9D2A51, 0xDBD7194F */
pp3 = -5.77027029648944159157e-03, /* 0xBF77A291, 0x236668E4 */
pp4 = -2.37630166566501626084e-05, /* 0xBEF8EAD6, 0x120016AC */
qq1 = 3.97917223959155352819e-01, /* 0x3FD97779, 0xCDDADC09 */
qq2 = 6.50222499887672944485e-02, /* 0x3FB0A54C, 0x5536CEBA */
qq3 = 5.08130628187576562776e-03, /* 0x3F74D022, 0xC4D36B0F */
qq4 = 1.32494738004321644526e-04, /* 0x3F215DC9, 0x221C1A10 */
qq5 = -3.96022827877536812320e-06, /* 0xBED09C43, 0x42A26120 */
pa0 = -2.36211856075265944077e-03, /* 0xBF6359B8, 0xBEF77538 */
pa1 = 4.14856118683748331666e-01, /* 0x3FDA8D00, 0xAD92B34D */
pa2 = -3.72207876035701323847e-01, /* 0xBFD7D240, 0xFBB8C3F1 */
pa3 = 3.18346619901161753674e-01, /* 0x3FD45FCA, 0x805120E4 */
pa4 = -1.10894694282396677476e-01, /* 0xBFBC6398, 0x3D3E28EC */
pa5 = 3.54783043256182359371e-02, /* 0x3FA22A36, 0x599795EB */
pa6 = -2.16637559486879084300e-03, /* 0xBF61BF38, 0x0A96073F */
qa1 = 1.06420880400844228286e-01, /* 0x3FBB3E66, 0x18EEE323 */
qa2 = 5.40397917702171048937e-01, /* 0x3FE14AF0, 0x92EB6F33 */
qa3 = 7.18286544141962662868e-02, /* 0x3FB2635C, 0xD99FE9A7 */
qa4 = 1.26171219808761642112e-01, /* 0x3FC02660, 0xE763351F */
qa5 = 1.36370839120290507362e-02, /* 0x3F8BEDC2, 0x6B51DD1C */
qa6 = 1.19844998467991074170e-02, /* 0x3F888B54, 0x5735151D */
ra0 = -9.86494403484714822705e-03, /* 0xBF843412, 0x600D6435 */
ra1 = -6.93858572707181764372e-01, /* 0xBFE63416, 0xE4BA7360 */
ra2 = -1.05586262253232909814e+01, /* 0xC0251E04, 0x41B0E726 */
ra3 = -6.23753324503260060396e+01, /* 0xC04F300A, 0xE4CBA38D */
ra4 = -1.62396669462573470355e+02, /* 0xC0644CB1, 0x84282266 */
ra5 = -1.84605092906711035994e+02, /* 0xC067135C, 0xEBCCABB2 */
ra6 = -8.12874355063065934246e+01, /* 0xC0545265, 0x57E4D2F2 */
ra7 = -9.81432934416914548592e+00, /* 0xC023A0EF, 0xC69AC25C */
sa1 = 1.96512716674392571292e+01, /* 0x4033A6B9, 0xBD707687 */
sa2 = 1.37657754143519042600e+02, /* 0x4061350C, 0x526AE721 */
sa3 = 4.34565877475229228821e+02, /* 0x407B290D, 0xD58A1A71 */
sa4 = 6.45387271733267880336e+02, /* 0x40842B19, 0x21EC2868 */
sa5 = 4.29008140027567833386e+02, /* 0x407AD021, 0x57700314 */
sa6 = 1.08635005541779435134e+02, /* 0x405B28A3, 0xEE48AE2C */
sa7 = 6.57024977031928170135e+00, /* 0x401A47EF, 0x8E484A93 */
sa8 = -6.04244152148580987438e-02, /* 0xBFAEEFF2, 0xEE749A62 */
rb0 = -9.86494292470009928597e-03, /* 0xBF843412, 0x39E86F4A */
rb1 = -7.99283237680523006574e-01, /* 0xBFE993BA, 0x70C285DE */
rb2 = -1.77579549177547519889e+01, /* 0xC031C209, 0x555F995A */
rb3 = -1.60636384855821916062e+02, /* 0xC064145D, 0x43C5ED98 */
rb4 = -6.37566443368389627722e+02, /* 0xC083EC88, 0x1375F228 */
rb5 = -1.02509513161107724954e+03, /* 0xC0900461, 0x6A2E5992 */
rb6 = -4.83519191608651397019e+02, /* 0xC07E384E, 0x9BDC383F */
sb1 = 3.03380607434824582924e+01, /* 0x403E568B, 0x261D5190 */
sb2 = 3.25792512996573918826e+02; /* 0x40745CAE, 0x221B9F0A */
double sb3 = 1.53672958608443695994e+03, /* 0x409802EB, 0x189D5118 */
sb4 = 3.19985821950859553908e+03, /* 0x40A8FFB7, 0x688C246A */
sb5 = 2.55305040643316442583e+03, /* 0x40A3F219, 0xCEDF3BE6 */
sb6 = 4.74528541206955367215e+02, /* 0x407DA874, 0xE79FE763 */
sb7 = -2.24409524465858183362e+01; /* 0xC03670E2, 0x42712D62 */
double tiny = 1e-300;
double halfD = 5.00000000000000000000e-01;/* 0x3FE00000, 0x00000000 */
double one = 1.00000000000000000000e+00; /* 0x3FF00000, 0x00000000 */
double two = 2.00000000000000000000e+00; /* 0x40000000, 0x00000000 */
int hx,ix,i;
double R,S,P,Q,s,y,z,r;
hx = __HI(x);
ix = hx&0x7fffffff;
if(ix>=0x7ff00000) { /* erf(nan)=nan */
i = ((unsigned)hx>>31)<<1;
return (double)(1-i)+one/x; /* erf(+-inf)=+-1 */
}
if(ix < 0x3feb0000) { /* |x|<0.84375 */
if(ix < 0x3e300000) { /* |x|<2**-28 */
if (ix < 0x00800000)
return 0.125*(8.0*x+efx8*x); /*avoid underflow */
return x + efx*x;
}
z = x*x;
r = pp0+z*(pp1+z*(pp2+z*(pp3+z*pp4)));
s = one+z*(qq1+z*(qq2+z*(qq3+z*(qq4+z*qq5))));
y = r/s;
return x + x*y;
}
if(ix < 0x3ff40000) { /* 0.84375 <= |x| < 1.25 */
s = fabs(x)-one;
P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6)))));
Q = one+s*(qa1+s*(qa2+s*(qa3+s*(qa4+s*(qa5+s*qa6)))));
if(hx>=0) return erx + P/Q; else return -erx - P/Q;
}
if (ix >= 0x40180000) { /* inf>|x|>=6 */
if(hx>=0) return one-tiny; else return tiny-one;
}
x = fabs(x);
s = one/(x*x);
if(ix< 0x4006DB6E) { /* |x| < 1/0.35 */
R=ra0+s*(ra1+s*(ra2+s*(ra3+s*(ra4+s*(
ra5+s*(ra6+s*ra7))))));
S=one+s*(sa1+s*(sa2+s*(sa3+s*(sa4+s*(
sa5+s*(sa6+s*(sa7+s*sa8)))))));
} else { /* |x| >= 1/0.35 */
R=rb0+s*(rb1+s*(rb2+s*(rb3+s*(rb4+s*(
rb5+s*rb6)))));
S=one+s*(sb1+s*(sb2+s*(sb3+s*(sb4+s*(
sb5+s*(sb6+s*sb7))))));
}
z = x;
__setLow(&z, 0);
r = exp(-z*z-0.5625)*exp((z-x)*(z+x)+R/S);
if(hx>=0) return one-r/x; else return r/x-one;
}
OVERLOADABLE double erfc(double x)
{
double erx = 8.45062911510467529297e-01, /* 0x3FEB0AC1, 0x60000000 */
efx = 1.28379167095512586316e-01, /* 0x3FC06EBA, 0x8214DB69 */
efx8= 1.02703333676410069053e+00, /* 0x3FF06EBA, 0x8214DB69 */
pp0 = 1.28379167095512558561e-01, /* 0x3FC06EBA, 0x8214DB68 */
pp1 = -3.25042107247001499370e-01, /* 0xBFD4CD7D, 0x691CB913 */
pp2 = -2.84817495755985104766e-02, /* 0xBF9D2A51, 0xDBD7194F */
pp3 = -5.77027029648944159157e-03, /* 0xBF77A291, 0x236668E4 */
pp4 = -2.37630166566501626084e-05, /* 0xBEF8EAD6, 0x120016AC */
qq1 = 3.97917223959155352819e-01, /* 0x3FD97779, 0xCDDADC09 */
qq2 = 6.50222499887672944485e-02, /* 0x3FB0A54C, 0x5536CEBA */
qq3 = 5.08130628187576562776e-03, /* 0x3F74D022, 0xC4D36B0F */
qq4 = 1.32494738004321644526e-04, /* 0x3F215DC9, 0x221C1A10 */
qq5 = -3.96022827877536812320e-06, /* 0xBED09C43, 0x42A26120 */
pa0 = -2.36211856075265944077e-03, /* 0xBF6359B8, 0xBEF77538 */
pa1 = 4.14856118683748331666e-01, /* 0x3FDA8D00, 0xAD92B34D */
pa2 = -3.72207876035701323847e-01, /* 0xBFD7D240, 0xFBB8C3F1 */
pa3 = 3.18346619901161753674e-01, /* 0x3FD45FCA, 0x805120E4 */
pa4 = -1.10894694282396677476e-01, /* 0xBFBC6398, 0x3D3E28EC */
pa5 = 3.54783043256182359371e-02, /* 0x3FA22A36, 0x599795EB */
pa6 = -2.16637559486879084300e-03, /* 0xBF61BF38, 0x0A96073F */
qa1 = 1.06420880400844228286e-01, /* 0x3FBB3E66, 0x18EEE323 */
qa2 = 5.40397917702171048937e-01, /* 0x3FE14AF0, 0x92EB6F33 */
qa3 = 7.18286544141962662868e-02, /* 0x3FB2635C, 0xD99FE9A7 */
qa4 = 1.26171219808761642112e-01, /* 0x3FC02660, 0xE763351F */
qa5 = 1.36370839120290507362e-02, /* 0x3F8BEDC2, 0x6B51DD1C */
qa6 = 1.19844998467991074170e-02, /* 0x3F888B54, 0x5735151D */
ra0 = -9.86494403484714822705e-03, /* 0xBF843412, 0x600D6435 */
ra1 = -6.93858572707181764372e-01, /* 0xBFE63416, 0xE4BA7360 */
ra2 = -1.05586262253232909814e+01, /* 0xC0251E04, 0x41B0E726 */
ra3 = -6.23753324503260060396e+01, /* 0xC04F300A, 0xE4CBA38D */
ra4 = -1.62396669462573470355e+02, /* 0xC0644CB1, 0x84282266 */
ra5 = -1.84605092906711035994e+02, /* 0xC067135C, 0xEBCCABB2 */
ra6 = -8.12874355063065934246e+01, /* 0xC0545265, 0x57E4D2F2 */
ra7 = -9.81432934416914548592e+00, /* 0xC023A0EF, 0xC69AC25C */
sa1 = 1.96512716674392571292e+01, /* 0x4033A6B9, 0xBD707687 */
sa2 = 1.37657754143519042600e+02, /* 0x4061350C, 0x526AE721 */
sa3 = 4.34565877475229228821e+02, /* 0x407B290D, 0xD58A1A71 */
sa4 = 6.45387271733267880336e+02, /* 0x40842B19, 0x21EC2868 */
sa5 = 4.29008140027567833386e+02, /* 0x407AD021, 0x57700314 */
sa6 = 1.08635005541779435134e+02, /* 0x405B28A3, 0xEE48AE2C */
sa7 = 6.57024977031928170135e+00, /* 0x401A47EF, 0x8E484A93 */
sa8 = -6.04244152148580987438e-02, /* 0xBFAEEFF2, 0xEE749A62 */
rb0 = -9.86494292470009928597e-03, /* 0xBF843412, 0x39E86F4A */
rb1 = -7.99283237680523006574e-01, /* 0xBFE993BA, 0x70C285DE */
rb2 = -1.77579549177547519889e+01, /* 0xC031C209, 0x555F995A */
rb3 = -1.60636384855821916062e+02, /* 0xC064145D, 0x43C5ED98 */
rb4 = -6.37566443368389627722e+02, /* 0xC083EC88, 0x1375F228 */
rb5 = -1.02509513161107724954e+03, /* 0xC0900461, 0x6A2E5992 */
rb6 = -4.83519191608651397019e+02, /* 0xC07E384E, 0x9BDC383F */
sb1 = 3.03380607434824582924e+01, /* 0x403E568B, 0x261D5190 */
sb2 = 3.25792512996573918826e+02; /* 0x40745CAE, 0x221B9F0A */
double sb3 = 1.53672958608443695994e+03, /* 0x409802EB, 0x189D5118 */
sb4 = 3.19985821950859553908e+03, /* 0x40A8FFB7, 0x688C246A */
sb5 = 2.55305040643316442583e+03, /* 0x40A3F219, 0xCEDF3BE6 */
sb6 = 4.74528541206955367215e+02, /* 0x407DA874, 0xE79FE763 */
sb7 = -2.24409524465858183362e+01; /* 0xC03670E2, 0x42712D62 */
double tiny = 1e-300;
double halfD = 5.00000000000000000000e-01;/* 0x3FE00000, 0x00000000 */
double one = 1.00000000000000000000e+00; /* 0x3FF00000, 0x00000000 */
double two = 2.00000000000000000000e+00; /* 0x40000000, 0x00000000 */
int hx,ix;
double R,S,P,Q,s,y,z,r;
hx = __HI(x);
ix = hx&0x7fffffff;
if(ix>=0x7ff00000) { /* erfc(nan)=nan */
/* erfc(+-inf)=0,2 */
return (double)(((unsigned)hx>>31)<<1)+one/x;
}
if(ix < 0x3feb0000) { /* |x|<0.84375 */
if(ix < 0x3c700000) /* |x|<2**-56 */
return one-x;
z = x*x;
r = pp0+z*(pp1+z*(pp2+z*(pp3+z*pp4)));
s = one+z*(qq1+z*(qq2+z*(qq3+z*(qq4+z*qq5))));
y = r/s;
if(hx < 0x3fd00000) { /* x<1/4 */
return one-(x+x*y);
} else {
r = x*y;
r += (x-halfD);
return halfD - r ;
}
}
if(ix < 0x3ff40000) { /* 0.84375 <= |x| < 1.25 */
s = fabs(x)-one;
P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6)))));
Q = one+s*(qa1+s*(qa2+s*(qa3+s*(qa4+s*(qa5+s*qa6)))));
if(hx>=0) {
z = one-erx; return z - P/Q;
} else {
z = erx+P/Q; return one+z;
}
}
if (ix < 0x403c0000) { /* |x|<28 */
x = fabs(x);
s = one/(x*x);
if(ix< 0x4006DB6D) { /* |x| < 1/.35 ~ 2.857143*/
R=ra0+s*(ra1+s*(ra2+s*(ra3+s*(ra4+s*(
ra5+s*(ra6+s*ra7))))));
S=one+s*(sa1+s*(sa2+s*(sa3+s*(sa4+s*(
sa5+s*(sa6+s*(sa7+s*sa8)))))));
} else { /* |x| >= 1/.35 ~ 2.857143 */
if(hx<0&&ix>=0x40180000) return two-tiny;/* x < -6 */
R=rb0+s*(rb1+s*(rb2+s*(rb3+s*(rb4+s*(
rb5+s*rb6)))));
S=one+s*(sb1+s*(sb2+s*(sb3+s*(sb4+s*(
sb5+s*(sb6+s*sb7))))));
}
z = x;
__setLow(&z, 0);
r = exp(-z*z-0.5625)*
exp((z-x)*(z+x)+R/S);
if(hx>0) return r/x; else return two-r/x;
} else {
if(hx>0) return tiny*tiny; else return two-tiny;
}
}
OVERLOADABLE double cbrt(double x)
{
double B1 = 715094163, /* B1 = (682-0.03306235651)*2**20 */
B2 = 696219795; /* B2 = (664-0.03306235651)*2**20 */
double C = 5.42857142857142815906e-01, /* 19/35 = 0x3FE15F15, 0xF15F15F1 */
D = -7.05306122448979611050e-01, /* -864/1225 = 0xBFE691DE, 0x2532C834 */
E = 1.41428571428571436819e+00, /* 99/70 = 0x3FF6A0EA, 0x0EA0EA0F */
F = 1.60714285714285720630e+00, /* 45/28 = 0x3FF9B6DB, 0x6DB6DB6E */
G = 3.57142857142857150787e-01; /* 5/14 = 0x3FD6DB6D, 0xB6DB6DB7 */
int hx;
double r,s,t=0.0,w;
unsigned sign;
hx = __HI(x); /* high word of x */
sign=hx&0x80000000; /* sign= sign(x) */
hx ^=sign;
if(hx>=0x7ff00000) return(x+x); /* cbrt(NaN,INF) is itself */
if((hx|__LO(x))==0)
return(x); /* cbrt(0) is itself */
__setHigh(&x, hx); /* x <- |x| */
/* rough cbrt to 5 bits */
if(hx<0x00100000) /* subnormal number */
{
__setHigh(&t, 0x43500000); /* set t= 2**54 */
t*=x;
__setHigh(&t, __HI(t)/3+B2);
}
else
__setHigh(&t, hx/3+B1);
/* new cbrt to 23 bits, may be implemented in single precision */
r=t*t/x;
s=C+r*t;
t*=G+F/(s+E+D/s);
/* chopped to 20 bits and make it larger than cbrt(x) */
__setLow(&t, 0); __setHigh(&t, __HI(t)+0x00000001);
/* one step newton iteration to 53 bits with error less than 0.667 ulps */
s=t*t; /* t*t is exact */
r=x/s;
w=t+t;
r=(r-t)/(w+r); /* r-s is exact */
t=t+t*r;
/* retore the sign bit */
__setHigh(&t, __HI(t)|sign);
return(t);
}
OVERLOADABLE double ceil(double x)
{
double ret;
ulong lval = as_ulong(x);
int exp = ((lval >> 52) & 0x7FF) - 1023;
int sign = (lval >> 63) & 0x1;
long i = (1L << (52 - exp));
long mask = 0x10000000000000 - i;
unsigned long uv = 0xFFF0000000000000 + mask;
ulong vv = lval & uv;
double dp = as_double(vv);
ret = ((vv != lval) && !sign) ? dp +1.0:dp;
ret = ((exp < 0) && sign) ? 0:ret;
double tmp = (lval & DF_ABS_MASK) ? 1.0:0.0;
ret = ((exp < 0) && !sign) ? tmp:ret;
ret = (exp >= 52) ? x:ret;
return ret;
}
OVERLOADABLE double copysign(double x, double y)
{
ulong uy = as_ulong(y);
ulong sign = uy & DF_SIGN_MASK;
ulong ux = as_ulong(x);
ux = (ux & DF_ABS_MASK) | sign;
return as_double(ux);
}
double __scalbn (double x, int n)
{
double two54 = 1.80143985094819840000e+16, /* 0x43500000, 0x00000000 */
twom54 = 5.55111512312578270212e-17, /* 0x3C900000, 0x00000000 */
huge = 1.0e+300,
tiny = 1.0e-300;
int k,hx,lx;
hx = __HI(x);
lx = __LO(x);
k = (hx&0x7ff00000)>>20; /* extract exponent */
if (k==0) { /* 0 or subnormal x */
if ((lx|(hx&0x7fffffff))==0) return x; /* +-0 */
x *= two54;
hx = __HI(x);
k = ((hx&0x7ff00000)>>20) - 54;
if (n< -50000) return tiny*x; /*underflow*/
}
if (k==0x7ff) return x+x; /* NaN or Inf */
k = k+n;
if (k > 0x7fe) return huge*copysign(huge,x); /* overflow */
if (k > 0) /* normal result */
{__setHigh(&x, (hx&0x800fffff)|(k<<20)); return x;}
if (k <= -54)
{
if (n > 50000) /* in case integer overflow in n+k */
return huge*copysign(huge,x); /*overflow*/
else return tiny*copysign(tiny,x); /*underflow*/
}
k += 54; /* subnormal result */
__setHigh(&x, (hx&0x800fffff)|(k<<20));
return x*twom54;
}
int __kernel_rem_pio2(double *x, double *y, int e0, int nx, int prec, const int *ipio2)
{
double zero = 0.0,
one = 1.0,
two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
twon24 = 5.96046447753906250000e-08; /* 0x3E700000, 0x00000000 */
double PIo2[] = {
1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
};
const int init_jk[] = {2,3,4,6}; /* initial value for jk */
int jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
double z,fw,f[20],fq[20],q[20];
/* initialize jk*/
jk = init_jk[prec];
jp = jk;
/* determine jx,jv,q0, note that 3>q0 */
jx = nx-1;
jv = (e0-3)/24; if(jv<0) jv=0;
q0 = e0-24*(jv+1);
/* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
j = jv-jx; m = jx+jk;
for(i=0;i<=m;i++,j++) f[i] = (j<0)? zero : (double) ipio2[j];
/* compute q[0],q[1],...q[jk] */
for (i=0;i<=jk;i++) {
for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
}
jz = jk;
recompute:
/* distill q[] into iq[] reversingly */
for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
fw = (double)((int)(twon24* z));
iq[i] = (int)(z-two24*fw);
z = q[j-1]+fw;
}
/* compute n */
z = __scalbn(z,q0); /* actual value of z */
z -= 8.0*floor(z*0.125); /* trim off integer >= 8 */
n = (int) z;
z -= (double)n;
ih = 0;
if(q0>0) { /* need iq[jz-1] to determine n */
i = (iq[jz-1]>>(24-q0)); n += i;
iq[jz-1] -= i<<(24-q0);
ih = iq[jz-1]>>(23-q0);
}
else if(q0==0) ih = iq[jz-1]>>23;
else if(z>=0.5) ih=2;
if(ih>0) { /* q > 0.5 */
n += 1; carry = 0;
for(i=0;i0) { /* rare case: chance is 1 in 12 */
switch(q0) {
case 1:
iq[jz-1] &= 0x7fffff; break;
case 2:
iq[jz-1] &= 0x3fffff; break;
}
}
if(ih==2) {
z = one - z;
if(carry!=0) z -= __scalbn(one,q0);
}
}
/* check if recomputation is needed */
if(z==zero) {
j = 0;
for (i=jz-1;i>=jk;i--) j |= iq[i];
if(j==0) { /* need recomputation */
for(k=1;iq[jk-k]==0;k++); /* k = no. of terms needed */
for(i=jz+1;i<=jz+k;i++) { /* add q[jz+1] to q[jz+k] */
f[jx+i] = (double) ipio2[jv+i];
for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
q[i] = fw;
}
jz += k;
goto recompute;
}
}
/* chop off zero terms */
if(z==0.0) {
jz -= 1; q0 -= 24;
while(iq[jz]==0) { jz--; q0-=24;}
} else { /* break z into 24-bit if necessary */
z = __scalbn(z,-q0);
if(z>=two24) {
fw = (double)((int)(twon24*z));
iq[jz] = (int)(z-two24*fw);
jz += 1; q0 += 24;
iq[jz] = (int) fw;
} else iq[jz] = (int) z ;
}
/* convert integer "bit" chunk to floating-point value */
fw = __scalbn(one,q0);
for(i=jz;i>=0;i--) {
q[i] = fw*(double)iq[i]; fw*=twon24;
}
/* compute PIo2[0,...,jp]*q[jz,...,0] */
for(i=jz;i>=0;i--) {
for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
fq[jz-i] = fw;
}
/* compress fq[] into y[] */
switch(prec) {
case 0:
fw = 0.0;
for (i=jz;i>=0;i--) fw += fq[i];
y[0] = (ih==0)? fw: -fw;
break;
case 1:
case 2:
fw = 0.0;
for (i=jz;i>=0;i--) fw += fq[i];
y[0] = (ih==0)? fw: -fw;
fw = fq[0]-fw;
for (i=1;i<=jz;i++) fw += fq[i];
y[1] = (ih==0)? fw: -fw;
break;
case 3: /* painful */
for (i=jz;i>0;i--) {
fw = fq[i-1]+fq[i];
fq[i] += fq[i-1]-fw;
fq[i-1] = fw;
}
for (i=jz;i>1;i--) {
fw = fq[i-1]+fq[i];
fq[i] += fq[i-1]-fw;
fq[i-1] = fw;
}
for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
if(ih==0) {
y[0] = fq[0]; y[1] = fq[1]; y[2] = fw;
} else {
y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
}
}
return n&7;
}
int __ieee754_rem_pio2(double x, double *y)
{
int two_over_pi[] = {
0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62,
0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A,
0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129,
0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41,
0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8,
0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF,
0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5,
0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08,
0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3,
0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880,
0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B,
};
int npio2_hw[] = {
0x3FF921FB, 0x400921FB, 0x4012D97C, 0x401921FB, 0x401F6A7A, 0x4022D97C,
0x4025FDBB, 0x402921FB, 0x402C463A, 0x402F6A7A, 0x4031475C, 0x4032D97C,
0x40346B9C, 0x4035FDBB, 0x40378FDB, 0x403921FB, 0x403AB41B, 0x403C463A,
0x403DD85A, 0x403F6A7A, 0x40407E4C, 0x4041475C, 0x4042106C, 0x4042D97C,
0x4043A28C, 0x40446B9C, 0x404534AC, 0x4045FDBB, 0x4046C6CB, 0x40478FDB,
0x404858EB, 0x404921FB,
};
double zero = 0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
halfD = 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
pio2_1 = 1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
pio2_1t = 6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */
pio2_2 = 6.07710050630396597660e-11, /* 0x3DD0B461, 0x1A600000 */
pio2_2t = 2.02226624879595063154e-21, /* 0x3BA3198A, 0x2E037073 */
pio2_3 = 2.02226624871116645580e-21, /* 0x3BA3198A, 0x2E000000 */
pio2_3t = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */
double z,w,t,r,fn;
double tx[3];
int e0,i,j,nx,n,ix,hx;
hx = __HI(x); /* high word of x */
ix = hx&0x7fffffff;
if(ix<=0x3fe921fb) /* |x| ~<= pi/4 , no need for reduction */
{y[0] = x; y[1] = 0; return 0;}
if(ix<0x4002d97c) { /* |x| < 3pi/4, special case with n=+-1 */
if(hx>0) {
z = x - pio2_1;
if(ix!=0x3ff921fb) { /* 33+53 bit pi is good enough */
y[0] = z - pio2_1t;
y[1] = (z-y[0])-pio2_1t;
} else { /* near pi/2, use 33+33+53 bit pi */
z -= pio2_2;
y[0] = z - pio2_2t;
y[1] = (z-y[0])-pio2_2t;
}
return 1;
} else { /* negative x */
z = x + pio2_1;
if(ix!=0x3ff921fb) { /* 33+53 bit pi is good enough */
y[0] = z + pio2_1t;
y[1] = (z-y[0])+pio2_1t;
} else { /* near pi/2, use 33+33+53 bit pi */
z += pio2_2;
y[0] = z + pio2_2t;
y[1] = (z-y[0])+pio2_2t;
}
return -1;
}
}
if(ix<=0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium size */
t = fabs(x);
n = (int) (t*invpio2+halfD);
fn = (double)n;
r = t-fn*pio2_1;
w = fn*pio2_1t; /* 1st round good to 85 bit */
if(n<32&&ix!=npio2_hw[n-1]) {
y[0] = r-w; /* quick check no cancellation */
} else {
j = ix>>20;
y[0] = r-w;
i = j-(((__HI(y[0]))>>20)&0x7ff);
if(i>16) { /* 2nd iteration needed, good to 118 */
t = r;
w = fn*pio2_2;
r = t-w;
w = fn*pio2_2t-((t-r)-w);
y[0] = r-w;
i = j-(((__HI(y[0]))>>20)&0x7ff);
if(i>49) { /* 3rd iteration need, 151 bits acc */
t = r; /* will cover all possible cases */
w = fn*pio2_3;
r = t-w;
w = fn*pio2_3t-((t-r)-w);
y[0] = r-w;
}
}
}
y[1] = (r-y[0])-w;
if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
else return n;
}
/*
* all other (large) arguments
*/
if(ix>=0x7ff00000) { /* x is inf or NaN */
y[0]=y[1]=x-x; return 0;
}
/* set z = scalbn(|x|,ilogb(x)-23) */
__setLow(&z, __LO(x));
e0 = (ix>>20)-1046; /* e0 = ilogb(z)-23; */
__setHigh(&z, ix - (e0<<20));
for(i=0;i<2;i++) {
tx[i] = (double)((int)(z));
z = (z-tx[i])*two24;
}
tx[2] = z;
nx = 3;
while(tx[nx-1]==zero) nx--; /* skip zero term */
n = __kernel_rem_pio2(tx,y,e0,nx,2,two_over_pi);
if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
return n;
}
double __kernel_cos(double x, double y)
{
double one = 1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
C1 = 4.16666666666666019037e-02, /* 0x3FA55555, 0x5555554C */
C2 = -1.38888888888741095749e-03, /* 0xBF56C16C, 0x16C15177 */
C3 = 2.48015872894767294178e-05, /* 0x3EFA01A0, 0x19CB1590 */
C4 = -2.75573143513906633035e-07, /* 0xBE927E4F, 0x809C52AD */
C5 = 2.08757232129817482790e-09, /* 0x3E21EE9E, 0xBDB4B1C4 */
C6 = -1.13596475577881948265e-11; /* 0xBDA8FAE9, 0xBE8838D4 */
double a,hz,z,r,qx;
int ix;
ix = __HI(x)&0x7fffffff; /* ix = |x|'s high word*/
if(ix<0x3e400000) { /* if x < 2**27 */
if(((int)x)==0) return one; /* generate inexact */
}
z = x*x;
r = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
if(ix < 0x3FD33333) /* if |x| < 0.3 */
return one - (0.5*z - (z*r - x*y));
else {
if(ix > 0x3fe90000) { /* x > 0.78125 */
qx = 0.28125;
} else {
__setHigh(&qx, ix-0x00200000); /* x/4 */
__setLow(&qx, 0);
}
hz = 0.5*z-qx;
a = one-qx;
return a - (hz - (z*r-x*y));
}
}
double __kernel_sin(double x, double y, int iy)
{
double halfD = 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
S1 = -1.66666666666666324348e-01, /* 0xBFC55555, 0x55555549 */
S2 = 8.33333333332248946124e-03, /* 0x3F811111, 0x1110F8A6 */
S3 = -1.98412698298579493134e-04, /* 0xBF2A01A0, 0x19C161D5 */
S4 = 2.75573137070700676789e-06, /* 0x3EC71DE3, 0x57B1FE7D */
S5 = -2.50507602534068634195e-08, /* 0xBE5AE5E6, 0x8A2B9CEB */
S6 = 1.58969099521155010221e-10; /* 0x3DE5D93A, 0x5ACFD57C */
double z,r,v;
int ix;
ix = __HI(x)&0x7fffffff; /* high word of x */
if(ix<0x3e400000) /* |x| < 2**-27 */
{if((int)x==0) return x;} /* generate inexact */
z = x*x;
v = z*x;
r = S2+z*(S3+z*(S4+z*(S5+z*S6)));
if(iy==0) return x+v*(S1+z*r);
else return x-((z*(halfD*y-v*r)-y)-v*S1);
}
OVERLOADABLE double cos(double x)
{
double y[2],z=0.0;
int n, ix;
/* High word of x. */
ix = __HI(x);
/* |x| ~< pi/4 */
ix &= 0x7fffffff;
if(ix <= 0x3fe921fb) return __kernel_cos(x,z);
/* cos(Inf or NaN) is NaN */
else if (ix>=0x7ff00000) return x-x;
/* argument reduction needed */
else {
n = __ieee754_rem_pio2(x,y);
switch(n&3) {
case 0: return __kernel_cos(y[0],y[1]);
case 1: return -__kernel_sin(y[0],y[1],1);
case 2: return -__kernel_cos(y[0],y[1]);
default:
return __kernel_sin(y[0],y[1],1);
}
}
}
OVERLOADABLE double cosh(double x)
{
double one = 1.0, dHalf=0.5, huge = 1.0e300;
double t,w;
int ix;
unsigned lx;
/* High word of |x|. */
ix = __HI(x);
ix &= 0x7fffffff;
/* x is INF or NaN */
if(ix>=0x7ff00000) return x*x;
/* |x| in [0,0.5*ln2], return 1+expm1(|x|)^2/(2*exp(|x|)) */
if(ix<0x3fd62e43) {
t = expm1(fabs(x));
w = one+t;
if (ix<0x3c800000) return w; /* cosh(tiny) = 1 */
return one+(t*t)/(w+w);
}
/* |x| in [0.5*ln2,22], return (exp(|x|)+1/exp(|x|)/2; */
if (ix < 0x40360000) {
t = exp(fabs(x));
return dHalf*t+dHalf/t;
}
/* |x| in [22, log(maxdouble)] return half*exp(|x|) */
if (ix < 0x40862E42) return dHalf*exp(fabs(x));
/* |x| in [log(maxdouble), overflowthresold] */
lx = *( (((*(unsigned*)&one)>>29)) + (unsigned*)&x);
if (ix<0x408633CE ||
((ix==0x408633ce)&&(lx<=(unsigned)0x8fb9f87d))) {
w = exp(dHalf*fabs(x));
t = dHalf*w;
return t*w;
}
/* |x| > overflowthresold, cosh(x) overflow */
return huge*huge;
}
OVERLOADABLE double cospi(double x)
{
double y,z, signValue = 1.0;
int ix;
ix = 0x7fffffff&__HI(x);
if(ix<0x3fd00000) return __kernel_cos(M_PI*x,0);
y = fabs(x);
if(ix >=0x7ff00000)return as_double(DF_POSITIVE_INF + 1);
z = floor(y);
if(z > 0)
{
ulong uz = as_ulong(z);
ulong expValue = ((uz & DF_EXP_MASK) >> DF_EXP_OFFSET) - DF_EXP_BIAS;
ulong manValue = ((uz & DF_MAN_MASK) | DF_IMPLICITE_ONE);
if(expValue > 52) return 1.0;
manValue = (manValue >> (52 - expValue));
if(manValue & 1)
signValue = -1.0;
}
if(z!=y)
{ /* inexact anyway */
y = y -z;
if(y > 0.5)
{
y = y - 0.5;
signValue *= -1.0;
return signValue * sin(y*M_PI);
}
}
else
{
return signValue;
}
//the precison of sin is better than cos
y = signValue * sin((0.5 - y)*M_PI);
return y;
}
OVERLOADABLE double fabs(double x)
{
long qw = as_ulong(x);
qw &= 0x7FFFFFFFFFFFFFFF;
return as_double(qw);
}
OVERLOADABLE double fdim(double x, double y)
{
if(isnan(x))
return x;
if(isnan(y))
return y;
return x > y ? (x - y) : +0.f;
}
OVERLOADABLE double maxmag(double x, double y)
{
if(fabs(x) > fabs(y)) return x;
if(fabs(y) > fabs(x)) return y;
return fmax(x, y);
}
OVERLOADABLE double minmag(double x, double y)
{
if(fabs(x) < fabs(y)) return x;
if(fabs(y) < fabs(x)) return y;
return fmin(x, y);
}
OVERLOADABLE double ldexp(double x, int n)
{
ulong ux = as_ulong(x);
if(((ux&DF_ABS_MASK) >= DF_POSITIVE_INF) ||x==0.0) return x;
x = __scalbn(x,n);
return x;
}
OVERLOADABLE double floor(double x)
{
ulong lval = as_ulong(x);
int exp = ((lval >> 52) & 0x7FF) - 1023;
int sign = (lval >> 63) & 0x1;
if(exp < 0)
{
if(sign)
{
if(lval & DF_ABS_MASK)
return -1L;
else
return 0.0;
}
else return 0.0;
}
else
{
if(exp >= 52)
return x;
long i = (1L << (52 - exp));
i = 0x10000000000000 - i;
unsigned long uv = 0xFFF0000000000000 + i;
ulong vv = lval & uv;
double dp = as_double(vv);
if(vv != lval)
dp -= sign;
return dp;
}
}
/*-
* Copyright (c) 2005-2011 David Schultz
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
struct dd {
double hi;
double lo;
};
static inline struct dd
dd_add(double a, double b)
{
struct dd ret;
double s;
ret.hi = a + b;
s = ret.hi - a;
ret.lo = (a - (ret.hi - s)) + (b - s);
return (ret);
}
static inline double
add_adjusted(double a, double b)
{
struct dd sum;
ulong hibits, lobits;
sum = dd_add(a, b);
if (sum.lo != 0) {
hibits = as_long(sum.hi);
if ((hibits & 1) == 0) {
/* hibits += (int)copysign(1.0, sum.hi * sum.lo) */
lobits = as_long(sum.lo);
hibits += 1 - ((hibits ^ lobits) >> 62);
sum.hi = as_double(hibits);
}
}
return (sum.hi);
}
static inline double
add_and_denormalize(double a, double b, int scale)
{
struct dd sum;
long hibits, lobits;
int bits_lost;
sum = dd_add(a, b);
if (sum.lo != 0) {
hibits = as_long(sum.hi);
bits_lost = -((int)(hibits >> 52) & 0x7ff) - scale + 1;
if ((bits_lost != 1) ^ (int)(hibits & 1)) {
/* hibits += (int)copysign(1.0, sum.hi * sum.lo) */
lobits = as_long(sum.lo);
hibits += 1 - (((hibits ^ lobits) >> 62) & 2);
sum.hi = as_double(hibits);
}
}
return (ldexp(sum.hi, scale));
}
static inline struct dd
dd_mul(double a, double b)
{
const double split = 0x1p27 + 1.0;
struct dd ret;
double ha, hb, la, lb, p, q;
p = a * split;
ha = a - p;
ha += p;
la = a - ha;
p = b * split;
hb = b - p;
hb += p;
lb = b - hb;
p = ha * hb;
q = ha * lb + la * hb;
ret.hi = p + q;
ret.lo = p - ret.hi + q + la * lb;
return (ret);
}
double __frexp(double x, int *exp)
{
double two54 = 1.80143985094819840000e+16; /* 0x43500000, 0x00000000 */
int hx, ix, lx;
hx = __HI(x);
ix = 0x7fffffff&hx;
lx = __LO(x);
*exp = 0;
if(ix>=0x7ff00000||((ix|lx)==0)) return x; /* 0,inf,nan */
if (ix<0x00100000) { /* subnormal */
x *= two54;
hx = __HI(x);
ix = hx&0x7fffffff;
*exp = -54;
}
*exp += (ix>>20)-1022;
hx = (hx&0x800fffff)|0x3fe00000;
__setHigh(&x, hx);
return x;
}
OVERLOADABLE double fma(double x, double y, double z)
{
double xs, ys, zs, adj;
struct dd xy, r;
int oround;
int ex, ey, ez;
int spread;
if (x == 0.0 || y == 0.0)
return (x * y + z);
if (z == 0.0)
return (x * y);
if (!isfinite(x) || !isfinite(y))
return (x * y + z);
if (!isfinite(z))
return (z);
xs = __frexp(x, &ex);
ys = __frexp(y, &ey);
zs = __frexp(z, &ez);
spread = ex + ey - ez;
if (spread < -53) {
return (z);
}
if (spread <= 53 * 2)
zs = ldexp(zs, -spread);
else
zs = copysign(2.225073858507201383090e-308, zs);
xy = dd_mul(xs, ys);
r = dd_add(xy.hi, zs);
spread = ex + ey;
if (r.hi == 0.0) {
return (xy.hi + zs + ldexp(xy.lo, spread));
}
adj = add_adjusted(r.lo, xy.lo);
double base = r.hi + adj;
if (spread + ilogb(r.hi) > -1023)
return (ldexp(base, spread));
else
return (add_and_denormalize(r.hi, adj, spread));
}
OVERLOADABLE double hypot(double x, double y)
{
double a=x,b=y,t1,t2,y1,y2,w;
int j,k,ha,hb;
ha = __HI(x)&0x7fffffff; /* high word of x */
hb = __HI(y)&0x7fffffff; /* high word of y */
if(hb > ha) {a=y;b=x;j=ha; ha=hb;hb=j;} else {a=x;b=y;}
__setHigh(&a, ha); /* a <- |a| */
__setHigh(&b, hb); /* b <- |b| */
if((ha-hb)>0x3c00000) {return a+b;} /* x/y > 2**60 */
k=0;
if(ha > 0x5f300000) { /* a>2**500 */
if(ha >= 0x7ff00000) { /* Inf or NaN */
w = a+b; /* for sNaN */
if(((ha&0xfffff)|__LO(a))==0) w = a;
if(((hb^0x7ff00000)|__LO(b))==0) w = b;
return w;
}
/* scale a and b by 2**-600 */
ha -= 0x25800000; hb -= 0x25800000; k += 600;
__setHigh(&a, ha);
__setHigh(&b, hb);
}
if(hb < 0x20b00000) { /* b < 2**-500 */
if(hb <= 0x000fffff) { /* subnormal b or 0 */
if((hb|(__LO(b)))==0) return a;
t1=0;
__setHigh(&t1, 0x7fd00000); /* t1=2^1022 */
b *= t1;
a *= t1;
k -= 1022;
} else { /* scale a and b by 2^600 */
ha += 0x25800000; /* a *= 2^600 */
hb += 0x25800000; /* b *= 2^600 */
k -= 600;
__setHigh(&a, ha);
__setHigh(&b, hb);
}
}
/* medium size a and b */
w = a-b;
if (w>b) {
t1 = 0;
__setHigh(&t1, ha);
t2 = a-t1;
w = sqrt(t1*t1-(b*(-b)-t2*(a+t1)));
} else {
a = a+a;
y1 = 0;
__setHigh(&y1, hb);
y2 = b - y1;
t1 = 0;
__setHigh(&t1, ha+0x00100000);
t2 = a - t1;
w = sqrt(t1*y1-(w*(-w)-(t1*y2+t2*b)));
}
if(k!=0) {
t1 = 1.0;
__setHigh(&t1, __HI(t1) + (k<<20));
return t1*w;
} else return w;
}
OVERLOADABLE double log(double x)
{
double ln2_hi = 6.93147180369123816490e-01, /* 3fe62e42 fee00000 */
ln2_lo = 1.90821492927058770002e-10, /* 3dea39ef 35793c76 */
two54 = 1.80143985094819840000e+16, /* 43500000 00000000 */
Lg1 = 6.666666666666735130e-01, /* 3FE55555 55555593 */
Lg2 = 3.999999999940941908e-01, /* 3FD99999 9997FA04 */
Lg3 = 2.857142874366239149e-01, /* 3FD24924 94229359 */
Lg4 = 2.222219843214978396e-01, /* 3FCC71C5 1D8E78AF */
Lg5 = 1.818357216161805012e-01, /* 3FC74664 96CB03DE */
Lg6 = 1.531383769920937332e-01, /* 3FC39A09 D078C69F */
Lg7 = 1.479819860511658591e-01; /* 3FC2F112 DF3E5244 */
double zero = 0;
double hfsq,f,s,z,R,w,t1,t2,dk;
int k,hx,i,j;
unsigned lx;
hx = __HI(x); /* high word of x */
lx = __LO(x); /* low word of x */
k=0;
if (hx < 0x00100000)
{ /* x < 2**-1022 */
if (((hx&0x7fffffff)|lx)==0)
return -two54/zero; /* log(+-0)=-inf */
if (hx<0)
return (x-x)/zero; /* log(-#) = NaN */
k -= 54; x *= two54; /* subnormal number, scale up x */
hx = __HI(x); /* high word of x */
}
if (hx >= 0x7ff00000) return x+x;
k += (hx>>20)-1023;
hx &= 0x000fffff;
i = (hx+0x95f64)&0x100000;
__setHigh(&x, (hx|(i^0x3ff00000))); /* normalize x or x/2 */
k += (i>>20);
f = x-1.0;
if((0x000fffff&(2+hx))<3) { /* |f| < 2**-20 */
if(f==zero)
{
if(k==0) return zero;
else
{
dk=(double)k;
return dk*ln2_hi+dk*ln2_lo;
}
}
R = f*f*(0.5-0.33333333333333333*f);
if(k==0)
return f-R;
else {dk=(double)k;
return dk*ln2_hi-((R-dk*ln2_lo)-f);}
}
s = f/(2.0+f);
dk = (double)k;
z = s*s;
i = hx-0x6147a;
w = z*z;
j = 0x6b851-hx;
t1= w*(Lg2+w*(Lg4+w*Lg6));
t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
i |= j;
R = t2+t1;
if(i>0) {
hfsq=0.5*f*f;
if(k==0)
return f-(hfsq-s*(hfsq+R));
else
return dk*ln2_hi-((hfsq-(s*(hfsq+R)+dk*ln2_lo))-f);
}
else
{
if(k==0)
return f-s*(f-R);
else
return dk*ln2_hi-((s*(f-R)-dk*ln2_lo)-f);
}
}
OVERLOADABLE double log2(double x)
{
double ln2 = 0.69314718055994530942,
zero = 0,
two54 = 1.80143985094819840000e+16, /* 43500000 00000000 */
Lg1 = 6.666666666666735130e-01, /* 3FE55555 55555593 */
Lg2 = 3.999999999940941908e-01, /* 3FD99999 9997FA04 */
Lg3 = 2.857142874366239149e-01, /* 3FD24924 94229359 */
Lg4 = 2.222219843214978396e-01, /* 3FCC71C5 1D8E78AF */
Lg5 = 1.818357216161805012e-01, /* 3FC74664 96CB03DE */
Lg6 = 1.531383769920937332e-01, /* 3FC39A09 D078C69F */
Lg7 = 1.479819860511658591e-01; /* 3FC2F112 DF3E5244 */
double hfsq,f,s,z,R,w,t1,t2,dk;
int k,hx,i,j;
uint lx;
hx = __HI(x);
lx = __LO(x);
k=0;
if (hx < 0x00100000)
{ /* x < 2**-1022 */
if (((hx&0x7fffffff)|lx)==0)
return -two54/(x-x); /* log(+-0)=-inf */
if (hx<0) return (x-x)/(x-x); /* log(-#) = NaN */
k -= 54; x *= two54; /* subnormal number, scale up x */
hx = __HI(x);
}
if (hx >= 0x7ff00000) return x+x;
k += (hx>>20)-1023;
hx &= 0x000fffff;
i = (hx+0x95f64)&0x100000;
__setHigh(&x,hx|(i^0x3ff00000)); /* normalize x or x/2 */
k += (i>>20);
dk = (double) k;
f = x-1.0;
if((0x000fffff&(2+hx))<3)
{ /* |f| < 2**-20 */
if(f==zero) return dk;
R = f*f*(0.5-0.33333333333333333*f);
return dk-(R-f)/ln2;
}
s = f/(2.0+f);
z = s*s;
i = hx-0x6147a;
w = z*z;
j = 0x6b851-hx;
t1= w*(Lg2+w*(Lg4+w*Lg6));
t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
i |= j;
R = t2+t1;
if(i>0)
{
hfsq=0.5*f*f;
return dk-((hfsq-(s*(hfsq+R)))-f)/ln2;
}
else
{
return dk-((s*(f-R))-f)/ln2;
}
}
OVERLOADABLE double log10(double x)
{
double zero = 0.0,
two54 = 1.80143985094819840000e+16, /* 0x43500000, 0x00000000 */
ivln10 = 4.34294481903251816668e-01, /* 0x3FDBCB7B, 0x1526E50E */
log10_2hi = 3.01029995663611771306e-01, /* 0x3FD34413, 0x509F6000 */
log10_2lo = 3.69423907715893078616e-13; /* 0x3D59FEF3, 0x11F12B36 */
double y,z;
int i,k,hx;
unsigned lx;
hx = __HI(x); /* high word of x */
lx = __LO(x); /* low word of x */
k=0;
if (hx < 0x00100000)
{ /* x < 2**-1022 */
if (((hx&0x7fffffff)|lx)==0)
return -two54/zero; /* log(+-0)=-inf */
if (hx<0)
return (x-x)/zero;/* log(-#) = NaN */
k -= 54; x *= two54; /* subnormal number, scale up x */
hx = __HI(x);/* high word of x */
}
if (hx >= 0x7ff00000) return x+x;
k += (hx>>20)-1023;
i = ((unsigned)k&0x80000000)>>31;
hx = (hx&0x000fffff)|((0x3ff-i)<<20);
y = (double)(k+i);
__setHigh(&x, hx);
z = y*log10_2lo + ivln10*log(x);
return z+y*log10_2hi;
}
OVERLOADABLE double log1p(double x)
{
double ln2_hi = 6.93147180369123816490e-01, /* 3fe62e42 fee00000 */
ln2_lo = 1.90821492927058770002e-10, /* 3dea39ef 35793c76 */
two54 = 1.80143985094819840000e+16, /* 43500000 00000000 */
Lp1 = 6.666666666666735130e-01, /* 3FE55555 55555593 */
Lp2 = 3.999999999940941908e-01, /* 3FD99999 9997FA04 */
Lp3 = 2.857142874366239149e-01, /* 3FD24924 94229359 */
Lp4 = 2.222219843214978396e-01, /* 3FCC71C5 1D8E78AF */
Lp5 = 1.818357216161805012e-01, /* 3FC74664 96CB03DE */
Lp6 = 1.531383769920937332e-01, /* 3FC39A09 D078C69F */
Lp7 = 1.479819860511658591e-01; /* 3FC2F112 DF3E5244 */
double hfsq,f,c,s,z,R,u,zero = 0;
int k,hx,hu,ax;
hx = __HI(x); /* high word of x */
ax = hx&0x7fffffff;
k = 1;
if (hx < 0x3FDA827A) { /* x < 0.41422 */
if(ax>=0x3ff00000) { /* x <= -1.0 */
if(x==-1.0) return -two54/zero; /* log1p(-1)=+inf */
else return (x-x)/(x-x); /* log1p(x<-1)=NaN */
}
if(ax<0x3e200000) { /* |x| < 2**-29 */
if(two54+x>zero /* raise inexact */
&&ax<0x3c900000) /* |x| < 2**-54 */
return x;
else
return x - x*x*0.5;
}
if(hx>0||hx<=((int)0xbfd2bec3)) {
k=0;f=x;hu=1;} /* -0.2929= 0x7ff00000) return x+x;
if(k!=0) {
if(hx<0x43400000) {
u = 1.0+x;
hu = __HI(u); /* high word of u */
k = (hu>>20)-1023;
c = (k>0)? 1.0-(u-x):x-(u-1.0);/* correction term */
c /= u;
} else {
u = x;
hu = __HI(u); /* high word of u */
k = (hu>>20)-1023;
c = 0;
}
hu &= 0x000fffff;
if(hu<0x6a09e) {
__setHigh(&u, hu|0x3ff00000); /* normalize u */
} else {
k += 1;
__setHigh(&u, hu|0x3fe00000); /* normalize u/2 */
hu = (0x00100000-hu)>>2;
}
f = u-1.0;
}
hfsq=0.5*f*f;
if(hu==0) { /* |f| < 2**-20 */
if(f==zero)
{
if(k==0) return zero;
else {c += k*ln2_lo; return k*ln2_hi+c;}
}
R = hfsq*(1.0-0.66666666666666666*f);
if(k==0) return f-R; else
return k*ln2_hi-((R-(k*ln2_lo+c))-f);
}
s = f/(2.0+f);
z = s*s;
R = z*(Lp1+z*(Lp2+z*(Lp3+z*(Lp4+z*(Lp5+z*(Lp6+z*Lp7))))));
if(k==0) return f-(hfsq-s*(hfsq+R)); else
return k*ln2_hi-((hfsq-(s*(hfsq+R)+(k*ln2_lo+c)))-f);
}
OVERLOADABLE double logb(double x)
{
int lx,ix;
ix = (__HI(x))&0x7fffffff; /* high |x| */
lx = __LO(x); /* low x */
if((ix|lx)==0) return -1.0/fabs(x);
if(ix>=0x7ff00000) return x*x;
if((ix>>=20)==0) /* IEEE 754 logb */
{
long qx = as_long(x);
qx = qx & DF_ABS_MASK;
int msbOne = clz(qx);
return (double)(-1022 - (53 -(64 -msbOne)));
}
else
return (double) (ix-1023);
}
OVERLOADABLE int ilogb(double x)
{
int hx,lx,ix;
hx = (__HI(x))&0x7fffffff; /* high word of x */
if(hx == 0x7ff00000 && __LO(x) == 0) return 0x7fffffff;
if(hx<0x00100000) {
lx = __LO(x);
if((hx|lx)==0)
return 0x80000000; /* ilogb(0) = 0x80000000 */
else /* subnormal x */
if(hx==0) {
for (ix = -1043; lx>0; lx<<=1) ix -=1;
} else {
for (ix = -1022,hx<<=11; hx>0; hx<<=1) ix -=1;
}
return ix;
}
else if (hx<0x7ff00000) return (hx>>20)-1023;
else return 0x80000000;
}
OVERLOADABLE double lgamma(double x)
{
double two52= 4.50359962737049600000e+15, /* 0x43300000, 0x00000000 */
halfD= 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
one = 1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
pi = 3.14159265358979311600e+00, /* 0x400921FB, 0x54442D18 */
a0 = 7.72156649015328655494e-02, /* 0x3FB3C467, 0xE37DB0C8 */
a1 = 3.22467033424113591611e-01, /* 0x3FD4A34C, 0xC4A60FAD */
a2 = 6.73523010531292681824e-02, /* 0x3FB13E00, 0x1A5562A7 */
a3 = 2.05808084325167332806e-02, /* 0x3F951322, 0xAC92547B */
a4 = 7.38555086081402883957e-03, /* 0x3F7E404F, 0xB68FEFE8 */
a5 = 2.89051383673415629091e-03, /* 0x3F67ADD8, 0xCCB7926B */
a6 = 1.19270763183362067845e-03, /* 0x3F538A94, 0x116F3F5D */
a7 = 5.10069792153511336608e-04, /* 0x3F40B6C6, 0x89B99C00 */
a8 = 2.20862790713908385557e-04, /* 0x3F2CF2EC, 0xED10E54D */
a9 = 1.08011567247583939954e-04, /* 0x3F1C5088, 0x987DFB07 */
a10 = 2.52144565451257326939e-05, /* 0x3EFA7074, 0x428CFA52 */
a11 = 4.48640949618915160150e-05, /* 0x3F07858E, 0x90A45837 */
tc = 1.46163214496836224576e+00, /* 0x3FF762D8, 0x6356BE3F */
tf = -1.21486290535849611461e-01, /* 0xBFBF19B9, 0xBCC38A42 */
/* tt = -(tail of tf) */
tt = -3.63867699703950536541e-18, /* 0xBC50C7CA, 0xA48A971F */
t0 = 4.83836122723810047042e-01, /* 0x3FDEF72B, 0xC8EE38A2 */
t1 = -1.47587722994593911752e-01, /* 0xBFC2E427, 0x8DC6C509 */
t2 = 6.46249402391333854778e-02, /* 0x3FB08B42, 0x94D5419B */
t3 = -3.27885410759859649565e-02, /* 0xBFA0C9A8, 0xDF35B713 */
t4 = 1.79706750811820387126e-02, /* 0x3F9266E7, 0x970AF9EC */
t5 = -1.03142241298341437450e-02, /* 0xBF851F9F, 0xBA91EC6A */
t6 = 6.10053870246291332635e-03, /* 0x3F78FCE0, 0xE370E344 */
t7 = -3.68452016781138256760e-03, /* 0xBF6E2EFF, 0xB3E914D7 */
t8 = 2.25964780900612472250e-03, /* 0x3F6282D3, 0x2E15C915 */
t9 = -1.40346469989232843813e-03, /* 0xBF56FE8E, 0xBF2D1AF1 */
t10 = 8.81081882437654011382e-04, /* 0x3F4CDF0C, 0xEF61A8E9 */
t11 = -5.38595305356740546715e-04, /* 0xBF41A610, 0x9C73E0EC */
t12 = 3.15632070903625950361e-04, /* 0x3F34AF6D, 0x6C0EBBF7 */
t13 = -3.12754168375120860518e-04, /* 0xBF347F24, 0xECC38C38 */
t14 = 3.35529192635519073543e-04, /* 0x3F35FD3E, 0xE8C2D3F4 */
u0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
u1 = 6.32827064025093366517e-01, /* 0x3FE4401E, 0x8B005DFF */
u2 = 1.45492250137234768737e+00, /* 0x3FF7475C, 0xD119BD6F */
u3 = 9.77717527963372745603e-01, /* 0x3FEF4976, 0x44EA8450 */
u4 = 2.28963728064692451092e-01, /* 0x3FCD4EAE, 0xF6010924 */
u5 = 1.33810918536787660377e-02, /* 0x3F8B678B, 0xBF2BAB09 */
v1 = 2.45597793713041134822e+00, /* 0x4003A5D7, 0xC2BD619C */
v2 = 2.12848976379893395361e+00, /* 0x40010725, 0xA42B18F5 */
v3 = 7.69285150456672783825e-01, /* 0x3FE89DFB, 0xE45050AF */
v4 = 1.04222645593369134254e-01, /* 0x3FBAAE55, 0xD6537C88 */
v5 = 3.21709242282423911810e-03, /* 0x3F6A5ABB, 0x57D0CF61 */
s0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
s1 = 2.14982415960608852501e-01, /* 0x3FCB848B, 0x36E20878 */
s2 = 3.25778796408930981787e-01, /* 0x3FD4D98F, 0x4F139F59 */
s3 = 1.46350472652464452805e-01, /* 0x3FC2BB9C, 0xBEE5F2F7 */
s4 = 2.66422703033638609560e-02, /* 0x3F9B481C, 0x7E939961 */
s5 = 1.84028451407337715652e-03, /* 0x3F5E26B6, 0x7368F239 */
s6 = 3.19475326584100867617e-05, /* 0x3F00BFEC, 0xDD17E945 */
r1 = 1.39200533467621045958e+00, /* 0x3FF645A7, 0x62C4AB74 */
r2 = 7.21935547567138069525e-01, /* 0x3FE71A18, 0x93D3DCDC */
r3 = 1.71933865632803078993e-01, /* 0x3FC601ED, 0xCCFBDF27 */
r4 = 1.86459191715652901344e-02, /* 0x3F9317EA, 0x742ED475 */
r5 = 7.77942496381893596434e-04, /* 0x3F497DDA, 0xCA41A95B */
r6 = 7.32668430744625636189e-06, /* 0x3EDEBAF7, 0xA5B38140 */
w0 = 4.18938533204672725052e-01, /* 0x3FDACFE3, 0x90C97D69 */
w1 = 8.33333333333329678849e-02, /* 0x3FB55555, 0x5555553B */
w2 = -2.77777777728775536470e-03, /* 0xBF66C16C, 0x16B02E5C */
w3 = 7.93650558643019558500e-04, /* 0x3F4A019F, 0x98CF38B6 */
w4 = -5.95187557450339963135e-04, /* 0xBF4380CB, 0x8C0FE741 */
w5 = 8.36339918996282139126e-04, /* 0x3F4B67BA, 0x4CDAD5D1 */
w6 = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */
double zero= 0.00000000000000000000e+00;
double t,y,z,nadj,p,p1,p2,p3,q,r,w;
int i,hx,lx,ix;
hx = __HI(x);
lx = __LO(x);
/* purge off +-inf, NaN, +-0, and negative arguments */
ix = hx&0x7fffffff;
if(ix>=0x7ff00000) return x*x;
if((ix|lx)==0) return one/zero;
if(ix<0x3b900000) { /* |x|<2**-70, return -log(|x|) */
if(hx<0) {
return -log(-x);
} else return -log(x);
}
if(hx<0) {
if(ix>=0x43300000) /* |x|>=2**52, must be -integer */
return one/zero;
t = sinpi(x);
if(t==zero) return one/zero; /* -integer */
nadj = log(pi/fabs(t*x));
x = -x;
}
/* purge off 1 and 2 */
if((((ix-0x3ff00000)|lx)==0)||(((ix-0x40000000)|lx)==0)) r = 0;
/* for x < 2.0 */
else if(ix<0x40000000) {
if(ix<=0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */
r = -log(x);
if(ix>=0x3FE76944) {y = one-x; i= 0;}
else if(ix>=0x3FCDA661) {y= x-(tc-one); i=1;}
else {y = x; i=2;}
} else {
r = zero;
if(ix>=0x3FFBB4C3) {y=2.0-x;i=0;} /* [1.7316,2] */
else if(ix>=0x3FF3B4C4) {y=x-tc;i=1;} /* [1.23,1.73] */
else {y=x-one;i=2;}
}
switch(i) {
case 0:
z = y*y;
p1 = a0+z*(a2+z*(a4+z*(a6+z*(a8+z*a10))));
p2 = z*(a1+z*(a3+z*(a5+z*(a7+z*(a9+z*a11)))));
p = y*p1+p2;
r += (p-0.5*y); break;
case 1:
z = y*y;
w = z*y;
p1 = t0+w*(t3+w*(t6+w*(t9 +w*t12))); /* parallel comp */
p2 = t1+w*(t4+w*(t7+w*(t10+w*t13)));
p3 = t2+w*(t5+w*(t8+w*(t11+w*t14)));
p = z*p1-(tt-w*(p2+y*p3));
r += (tf + p); break;
case 2:
p1 = y*(u0+y*(u1+y*(u2+y*(u3+y*(u4+y*u5)))));
p2 = one+y*(v1+y*(v2+y*(v3+y*(v4+y*v5))));
r += (-0.5*y + p1/p2);
}
}
else if(ix<0x40200000) { /* x < 8.0 */
i = (int)x;
t = zero;
y = x-(double)i;
p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6))))));
q = one+y*(r1+y*(r2+y*(r3+y*(r4+y*(r5+y*r6)))));
r = halfD*y+p/q;
z = one; /* lgamma(1+s) = log(s) + lgamma(s) */
switch(i) {
case 7: z *= (y+6.0); /* FALLTHRU */
case 6: z *= (y+5.0); /* FALLTHRU */
case 5: z *= (y+4.0); /* FALLTHRU */
case 4: z *= (y+3.0); /* FALLTHRU */
case 3: z *= (y+2.0); /* FALLTHRU */
r += log(z); break;
}
/* 8.0 <= x < 2**58 */
} else if (ix < 0x43900000) {
t = log(x);
z = one/x;
y = z*z;
w = w0+z*(w1+y*(w2+y*(w3+y*(w4+y*(w5+y*w6)))));
r = (x-halfD)*(t-one)+w;
} else
/* 2**58 <= x <= inf */
r = x*(log(x)-one);
if(hx<0) r = nadj - r;
return r;
}
CONST OVERLOADABLE double __gen_ocl_mad(double a, double b, double c) __asm("llvm.fma" ".f64");
OVERLOADABLE double mad(double a, double b, double c)
{
return __gen_ocl_mad(a, b, c);
}
OVERLOADABLE double nan(ulong code)
{
return as_double(DF_POSITIVE_INF + (code&DF_MAN_MASK));
}
OVERLOADABLE double nextafter(double x, double y)
{
long hx, hy, ix, iy;
hx = as_long(x);
hy = as_long(y);
ix = hx & DF_ABS_MASK;
iy = hy & DF_ABS_MASK;
if(ix>DF_POSITIVE_INF|| iy>DF_POSITIVE_INF)
return x+y;
if(hx == hy)
return y;
if(ix == 0) {
if(iy == 0)
return y;
else
return as_double((hy&DF_SIGN_MASK) | 1);
}
if(hx >= 0) {
if(hx > hy) {
hx -= 1;
} else {
hx += 1;
}
} else {
if(hy >= 0 || hx > hy){
hx -= 1;
} else {
hx += 1;
}
}
return as_double(hx);
}
OVERLOADABLE double fmax(double a, double b)
{
ulong ua = as_ulong(a);
ulong ub =as_ulong(b);
if((ua & DF_ABS_MASK) > DF_MAX_NORMAL) return b;
if((ub & DF_ABS_MASK) > DF_MAX_NORMAL) return a;
if(ua == DF_POSITIVE_INF) return a;
if(ub == DF_POSITIVE_INF) return b;
double c = a - b;
return (c >= 0) ? a:b;
}
OVERLOADABLE double fmin(double a, double b)
{
ulong ua = as_ulong(a);
ulong ub =as_ulong(b);
if((ua & DF_ABS_MASK) > DF_MAX_NORMAL) return b;
if((ub & DF_ABS_MASK) > DF_MAX_NORMAL) return a;
if(ua == DF_NEGTIVE_INF) return a;
if(ub == DF_NEGTIVE_INF) return b;
double c = a - b;
return (c <= 0) ? a:b;
}
OVERLOADABLE double fmod (double x, double y)
{
const double one = 1.0, Zero[] = {0.0, -0.0,};
int n,hx,hy,hz,ix,iy,sx,i;
uint lx,ly,lz;
hx = __HI(x);
lx = __LO(x);
hy = __HI(y);
ly = __LO(y);
sx = hx&0x80000000; /* sign of x */
hx ^=sx; /* |x| */
hy &= 0x7fffffff; /* |y| */
/* purge off exception values */
if((hy|ly)==0||(hx>=0x7ff00000)|| /* y=0,or x not finite */
((hy|((ly|-ly)>>31))>0x7ff00000)) /* or y is NaN */
return (x*y)/(x*y);
if(hx<=hy) {
if((hx>31]; /* |x|=|y| return x*0*/
}
/* determine ix = ilogb(x) */
if(hx<0x00100000) { /* subnormal x */
if(hx==0) {
for (ix = -1043, i=lx; i>0; i<<=1) ix -=1;
} else {
for (ix = -1022,i=(hx<<11); i>0; i<<=1) ix -=1;
}
} else ix = (hx>>20)-1023;
/* determine iy = ilogb(y) */
if(hy<0x00100000) { /* subnormal y */
if(hy==0) {
for (iy = -1043, i=ly; i>0; i<<=1) iy -=1;
} else {
for (iy = -1022,i=(hy<<11); i>0; i<<=1) iy -=1;
}
} else iy = (hy>>20)-1023;
/* set up {hx,lx}, {hy,ly} and align y to x */
if(ix >= -1022)
hx = 0x00100000|(0x000fffff&hx);
else { /* subnormal x, shift x to normal */
n = -1022-ix;
if(n<=31) {
hx = (hx<>(32-n));
lx <<= n;
} else {
hx = lx<<(n-32);
lx = 0;
}
}
if(iy >= -1022)
hy = 0x00100000|(0x000fffff&hy);
else { /* subnormal y, shift y to normal */
n = -1022-iy;
if(n<=31) {
hy = (hy<>(32-n));
ly <<= n;
} else {
hy = ly<<(n-32);
ly = 0;
}
}
/* fix point fmod */
n = ix - iy;
while(n--) {
hz=hx-hy;lz=lx-ly; if(lx>31); lx = lx+lx;}
else {
if((hz|lz)==0) /* return sign(x)*0 */
return Zero[(uint)sx>>31];
hx = hz+hz+(lz>>31); lx = lz+lz;
}
}
hz=hx-hy;lz=lx-ly; if(lx=0) {hx=hz;lx=lz;}
/* convert back to floating value and restore the sign */
if((hx|lx)==0) /* return sign(x)*0 */
return Zero[(uint)sx>>31];
while(hx<0x00100000) { /* normalize x */
hx = hx+hx+(lx>>31); lx = lx+lx;
iy -= 1;
}
if(iy>= -1022) { /* normalize output */
hx = ((hx-0x00100000)|((iy+1023)<<20));
__setHigh(&x,hx|sx);
__setLow(&x, lx);
} else { /* subnormal output */
n = -1022 - iy;
if(n<=20) {
lx = (lx>>n)|((uint)hx<<(32-n));
hx >>= n;
} else if (n<=31) {
lx = (hx<<(32-n))|(lx>>n); hx = sx;
} else {
lx = hx>>(n-32); hx = sx;
}
__setHigh(&x,hx|sx);
__setLow(&x, lx);
x *= one; /* create necessary signal */
}
return x; /* exact output */
}
OVERLOADABLE double __ocl_internal_pow(double x, double y)
{
double bp[] = {1.0, 1.5,},
dp_h[] = { 0.0, 5.84962487220764160156e-01,}, /* 0x3FE2B803, 0x40000000 */
dp_l[] = { 0.0, 1.35003920212974897128e-08,}, /* 0x3E4CFDEB, 0x43CFD006 */
zero = 0.0,
one = 1.0,
two = 2.0,
two53 = 9007199254740992.0, /* 0x43400000, 0x00000000 */
huge = 1.0e300,
tiny = 1.0e-300,
/* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
L1 = 5.99999999999994648725e-01, /* 0x3FE33333, 0x33333303 */
L2 = 4.28571428578550184252e-01, /* 0x3FDB6DB6, 0xDB6FABFF */
L3 = 3.33333329818377432918e-01, /* 0x3FD55555, 0x518F264D */
L4 = 2.72728123808534006489e-01, /* 0x3FD17460, 0xA91D4101 */
L5 = 2.30660745775561754067e-01, /* 0x3FCD864A, 0x93C9DB65 */
L6 = 2.06975017800338417784e-01, /* 0x3FCA7E28, 0x4A454EEF */
P1 = 1.66666666666666019037e-01, /* 0x3FC55555, 0x5555553E */
P2 = -2.77777777770155933842e-03, /* 0xBF66C16C, 0x16BEBD93 */
P3 = 6.61375632143793436117e-05, /* 0x3F11566A, 0xAF25DE2C */
P4 = -1.65339022054652515390e-06, /* 0xBEBBBD41, 0xC5D26BF1 */
P5 = 4.13813679705723846039e-08, /* 0x3E663769, 0x72BEA4D0 */
lg2 = 6.93147180559945286227e-01, /* 0x3FE62E42, 0xFEFA39EF */
lg2_h = 6.93147182464599609375e-01, /* 0x3FE62E43, 0x00000000 */
lg2_l = -1.90465429995776804525e-09, /* 0xBE205C61, 0x0CA86C39 */
ovt = 8.0085662595372944372e-0017, /* -(1024-log2(ovfl+.5ulp)) */
cp = 9.61796693925975554329e-01, /* 0x3FEEC709, 0xDC3A03FD =2/(3ln2) */
cp_h = 9.61796700954437255859e-01, /* 0x3FEEC709, 0xE0000000 =(float)cp */
cp_l = -7.02846165095275826516e-09, /* 0xBE3E2FE0, 0x145B01F5 =tail of cp_h*/
ivln2 = 1.44269504088896338700e+00, /* 0x3FF71547, 0x652B82FE =1/ln2 */
ivln2_h = 1.44269502162933349609e+00, /* 0x3FF71547, 0x60000000 =24b 1/ln2*/
ivln2_l = 1.92596299112661746887e-08; /* 0x3E54AE0B, 0xF85DDF44 =1/ln2 tail*/
double z,ax,z_h,z_l,p_h,p_l;
double y1,t1,t2,r,s,t,u,v,w;
int i0,i1,i,j,k,yisint,n;
int hx,hy,ix,iy;
unsigned lx,ly;
hx = __HI(x); lx = __LO(x);
hy = __HI(y); ly = __LO(y);
ix = hx&0x7fffffff; iy = hy&0x7fffffff;
i0 = ((*(int*)&one)>>29)^1; i1=1-i0;
/* determine if y is an odd int when x < 0
* yisint = 0 ... y is not an integer
* yisint = 1 ... y is an odd int
* yisint = 2 ... y is an even int
*/
yisint = 0;
if(hx<0) {
if(iy>=0x43400000) yisint = 2; /* even integer y */
else if(iy>=0x3ff00000) {
k = (iy>>20)-0x3ff; /* exponent */
if(k>20) {
j = ly>>(52-k);
if((j<<(52-k))==ly) yisint = 2-(j&1);
} else if(ly==0) {
j = iy>>(20-k);
if((j<<(20-k))==iy) yisint = 2-(j&1);
}
}
}
/* special value of y */
if(ly==0) {
if (iy==0x7ff00000) { /* y is +-inf */
if(((ix-0x3ff00000)|lx)==0)
return 1.0; /* inf**+-1 is NaN */
else if (ix >= 0x3ff00000)/* (|x|>1)**+-inf = inf,0 */
return (hy>=0)? y: zero;
else /* (|x|<1)**-,+inf = inf,0 */
return (hy<0)?-y: zero;
}
if(iy==0x3ff00000) { /* y is +-1 */
if(hy<0) return one/x; else return x;
}
if(hy==0x40000000) return x*x; /* y is 2 */
if(hy==0x3fe00000) { /* y is 0.5 */
if(hx>=0) /* x >= +0 */
return sqrt(x);
}
}
ax = fabs(x);
/* special value of x */
if(lx==0) {
if(ix==0x7ff00000||ix==0||ix==0x3ff00000){
z = ax; /*x is +-0,+-inf,+-1*/
if(hy<0) z = one/z; /* z = (1/|x|) */
if(hx<0) {
if(((ix-0x3ff00000)|yisint)==0) {
z = (z-z)/(z-z); /* (-1)**non-int is NaN */
} else if(yisint==1)
z = -z; /* (x<0)**odd = -(|x|**odd) */
}
return z;
}
}
n = (hx>>31)+1;
/* (x<0)**(non-int) is NaN */
if((n|yisint)==0) return (x-x)/(x-x);
s = one; /* s (sign of result -ve**odd) = -1 else = 1 */
if((n|(yisint-1))==0) s = -one;/* (-ve)**(odd int) */
/* |y| is huge */
if(iy>0x41e00000) { /* if |y| > 2**31 */
if(iy>0x43f00000){ /* if |y| > 2**64, must o/uflow */
if(ix<=0x3fefffff) return (hy<0)? huge*huge:tiny*tiny;
if(ix>=0x3ff00000) return (hy>0)? huge*huge:tiny*tiny;
}
/* over/underflow if x is not close to one */
if(ix<0x3fefffff) return (hy<0)? s*huge*huge:s*tiny*tiny;
if(ix>0x3ff00000) return (hy>0)? s*huge*huge:s*tiny*tiny;
/* now |1-x| is tiny <= 2**-20, suffice to compute
log(x) by x-x^2/2+x^3/3-x^4/4 */
t = ax-one; /* t has 20 trailing zeros */
w = (t*t)*(0.5-t*(0.3333333333333333333333-t*0.25));
u = ivln2_h*t; /* ivln2_h has 21 sig. bits */
v = t*ivln2_l-w*ivln2;
t1 = u+v;
__setLow(&t1, 0);
t2 = v-(t1-u);
} else {
double ss,s2,s_h,s_l,t_h,t_l;
n = 0;
/* take care subnormal number */
if(ix<0x00100000)
{ax *= two53; n -= 53; ix = __HI(ax); }
n += ((ix)>>20)-0x3ff;
j = ix&0x000fffff;
/* determine interval */
ix = j|0x3ff00000; /* normalize ix */
if(j<=0x3988E) k=0; /* |x|>1)|0x20000000)+0x00080000+(k<<18));
t_l = ax - (t_h-bp[k]);
s_l = v*((u-s_h*t_h)-s_h*t_l);
/* compute log(ax) */
s2 = ss*ss;
r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6)))));
r += s_l*(s_h+ss);
s2 = s_h*s_h;
t_h = 3.0+s2+r;
__setLow(&t_h, 0);
t_l = r-((t_h-3.0)-s2);
/* u+v = ss*(1+...) */
u = s_h*t_h;
v = s_l*t_h+t_l*ss;
/* 2/(3log2)*(ss+...) */
p_h = u+v;
__setLow(&p_h, 0);
p_l = v-(p_h-u);
z_h = cp_h*p_h; /* cp_h+cp_l = 2/(3*log2) */
z_l = cp_l*p_h+p_l*cp+dp_l[k];
/* log2(ax) = (ss+..)*2/(3*log2) = n + dp_h + z_h + z_l */
t = (double)n;
t1 = (((z_h+z_l)+dp_h[k])+t);
__setLow(&t1, 0);
t2 = z_l-(((t1-t)-dp_h[k])-z_h);
}
/* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
y1 = y;
__setLow(&y1, 0);
p_l = (y-y1)*t1+y*t2;
p_h = y1*t1;
z = p_l+p_h;
j = __HI(z);
i = __LO(z);
if (j>=0x40900000) { /* z >= 1024 */
if(((j-0x40900000)|i)!=0) /* if z > 1024 */
return s*huge*huge; /* overflow */
else {
if(p_l+ovt>z-p_h) return s*huge*huge; /* overflow */
}
} else if((j&0x7fffffff)>=0x4090cc00 ) { /* z <= -1075 */
if(((j-0xc090cc00)|i)!=0) /* z < -1075 */
return s*tiny*tiny; /* underflow */
else {
if(p_l<=z-p_h) return s*tiny*tiny; /* underflow */
}
}
/*
* compute 2**(p_h+p_l)
*/
i = j&0x7fffffff;
k = (i>>20)-0x3ff;
n = 0;
if(i>0x3fe00000) { /* if |z| > 0.5, set n = [z+0.5] */
n = j+(0x00100000>>(k+1));
k = ((n&0x7fffffff)>>20)-0x3ff; /* new k for n */
t = zero;
__setHigh(&t, (n&~(0x000fffff>>k)));
n = ((n&0x000fffff)|0x00100000)>>(20-k);
if(j<0) n = -n;
p_h -= t;
}
t = p_l+p_h;
__setLow(&t, 0);
u = t*lg2_h;
v = (p_l-(t-p_h))*lg2+t*lg2_l;
z = u+v;
w = v-(z-u);
t = z*z;
t1 = z - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
r = (z*t1)/(t1-two)-(w+z*w);
z = one-(r-z);
j = __HI(z);
j += (n<<20);
if((j>>20)<=0) z = __scalbn(z,n); /* subnormal output */
else __setHigh(&z, __HI(z)+(n<<20));
return s*z;
}
OVERLOADABLE double pow(double x, double y)
{
int hx,hy,ix,iy;
unsigned lx,ly;
hx = __HI(x); lx = __LO(x);
hy = __HI(y); ly = __LO(y);
ix = hx&0x7fffffff; iy = hy&0x7fffffff;
/* y==zero: x**0 = 1 */
if((iy|ly)==0) return 1.0;
/* +-NaN return x+y */
if(ix > 0x7ff00000 || ((ix==0x7ff00000)&&(lx!=0)))
return x+y;
if(iy > 0x7ff00000 || ((iy==0x7ff00000)&&(ly!=0)))
{
if(x == 1.0)
return x;
else
return x + y;
}
return __ocl_internal_pow(x, y);
}
OVERLOADABLE double pown(double x, int n)
{
int hx,hy,ix,iy;
unsigned lx,ly;
hx = __HI(x); lx = __LO(x);
ix = hx&0x7fffffff;
/* y==zero: x**0 = 1 */
if(n ==0) return 1.0;
/* +-NaN return x+y */
if(ix > 0x7ff00000 || ((ix==0x7ff00000)&&(lx!=0)))
return x+n;
return __ocl_internal_pow(x, n);
}
OVERLOADABLE double powr(double x, double y)
{
if( x < 0.0L )
return as_double(DF_POSITIVE_INF + 1);
if( isnan(x) || isnan(y) )
return x + y;
if( x == 1.0L )
{
if(fabs(y) == INFINITY )
return as_double(DF_POSITIVE_INF + 1);
return 1.0L;
}
if( y == 0.0L )
{
if( x == 0.0L || x == INFINITY )
return as_double(DF_POSITIVE_INF + 1);
return 1.0L;
}
if( x == 0.0L )
{
if( y < 0.0L )
return INFINITY;
return 0.0L;
}
return __ocl_internal_pow(x, y);
}
OVERLOADABLE double remainder(double x, double p)
{
int hx,hp;
unsigned sx,lx,lp;
double p_half, zero = 0.0;
hx = __HI(x); /* high word of x */
lx = __LO(x); /* low word of x */
hp = __HI(p); /* high word of p */
lp = __LO(p); /* low word of p */
sx = hx&0x80000000;
hp &= 0x7fffffff;
hx &= 0x7fffffff;
/* purge off exception values */
if((hp|lp)==0) return (x*p)/(x*p); /* p = 0 */
if((hx>=0x7ff00000)|| /* x not finite */
((hp>=0x7ff00000)&& /* p is NaN */
(((hp-0x7ff00000)|lp)!=0)))
return (x*p)/(x*p);
if (hp<=0x7fdfffff) x = fmod(x,p+p); /* now x < 2p */
if (((hx-hp)|(lx-lp))==0) return zero*x;
x = fabs(x);
p = fabs(p);
if (hp<0x00200000) {
if(x+x>p) {
x-=p;
if(x+x>=p) x -= p;
}
} else {
p_half = 0.5*p;
if(x>p_half) {
x-=p;
if(x>=p_half) x -= p;
}
}
__setHigh(&x, __HI(x) ^sx);
return x;
}
OVERLOADABLE double rint(double x)
{
long ret;
long lval = as_long(x);
int exp = ((lval & DF_EXP_MASK) >> DF_EXP_OFFSET) - DF_EXP_BIAS;
long sign = (lval & DF_SIGN_MASK)?1:0;
long ma = (lval &DF_MAN_MASK);
if((lval & DF_ABS_MASK) == 0)
return as_double(sign << 63);
if(exp < -1)
{
ret = ((sign << 63)) ;
return as_double(ret);
}
if(exp > 51) return x;
long i = (1L << (52 - exp));
i = 0x10000000000000 - i;
unsigned long uv = 0xFFF0000000000000 + i;
ulong vv = lval & uv;
double dp = as_double(vv);
if(exp == -1) dp = 0;
long fval = ma | DF_IMPLICITE_ONE;
long lastBit = (fval & (1L << (52 -exp)));
long roundBits = (fval & ( (1L << (52 -exp)) -1));
if(roundBits > (1L << (51 -exp)))
dp = (sign) ? dp-1.0:dp+1.0;
else if((roundBits == (1L << (51 -exp))) && lastBit)
dp = (sign) ? dp-1.0:dp+1.0;
return dp;
}
OVERLOADABLE double round(double x)
{
long ret;
long lval = as_long(x);
int exp = ((lval & DF_EXP_MASK) >> DF_EXP_OFFSET) - DF_EXP_BIAS;
long sign = (lval & DF_SIGN_MASK)?1:0;
long ma = (lval &DF_MAN_MASK);
if((lval & DF_ABS_MASK) == 0)
return as_double(sign << 63);
if(exp < -1)
{
ret = ((sign << 63)) ;
return as_double(ret);
}
if(exp > 51) return x;
long i = (1L << (52 - exp));
i = 0x10000000000000 - i;
unsigned long uv = 0xFFF0000000000000 + i;
ulong vv = lval & uv;
double dp = as_double(vv);
if(exp == -1) dp = 0;
long fval = ma | DF_IMPLICITE_ONE;
long roundBits = (fval & ( (1L << (52 -exp)) -1));
if(roundBits > (1L << (51 -exp)))
dp = (sign) ? dp-1.0:dp+1.0;
else if(roundBits == (1L << (51 -exp)))
dp = (sign) ? dp-1.0:dp+1.0;
return dp;
}
OVERLOADABLE double rootn(double x, int n)
{
double ax,re;
ulong ux = as_ulong(x);
ulong sign = ux&DF_SIGN_MASK;
if( n == 0 )return as_double(DF_POSITIVE_INF+1);
if((ux&DF_ABS_MASK)== 0)
{
int neg = (n & 0x80000000);
int odd = (n & 1);
if(neg && odd)
return as_double(sign|DF_POSITIVE_INF);
if(neg && !odd)
return as_double(DF_POSITIVE_INF);
if(!neg && !odd)
return 0.0;
if(!neg && odd)
return as_double(sign);
}
if( x < 0 && ! (n&1) )
return as_double(DF_POSITIVE_INF+1);
if((ux&DF_ABS_MASK) == DF_POSITIVE_INF)
{
if(n < 0)
return 0.0;
return x;
}
if(n == 1)
return x;
if(n == 2)
return sqrt(x);
if(n == 3)
return cbrt(x);
if(n == -1)
return 1.0/x;
if(n == -2)
return rsqrt(x);
if(n == -3)
return 1.0/cbrt(x);
if(x > 0.0f)
re = exp10(log10(x)/(double)n);
else
re = -exp10(log10(-x)/(double)n);
return re;
}
OVERLOADABLE double rsqrt(double x)
{
return 1.0/sqrt(x);
}
OVERLOADABLE double sin(double x)
{
double y[2],z=0.0;
int n, ix;
/* High word of x. */
ix = __HI(x);
/* |x| ~< pi/4 */
ix &= 0x7fffffff;
if(ix <= 0x3fe921fb) return __kernel_sin(x,z,0);
/* sin(Inf or NaN) is NaN */
else if (ix>=0x7ff00000) return x-x;
/* argument reduction needed */
else {
n = __ieee754_rem_pio2(x,y);
switch(n&3) {
case 0: return __kernel_sin(y[0],y[1],1);
case 1: return __kernel_cos(y[0],y[1]);
case 2: return -__kernel_sin(y[0],y[1],1);
default:
return -__kernel_cos(y[0],y[1]);
}
}
}
OVERLOADABLE double sinh(double x)
{
double one = 1.0, shuge = 1.0e307;
double t,w,h;
int ix,jx;
unsigned lx;
/* High word of |x|. */
jx = __HI(x);
ix = jx&0x7fffffff;
/* x is INF or NaN */
if(ix>=0x7ff00000) return x+x;
h = 0.5;
if (jx<0) h = -h;
/* |x| in [0,22], return sign(x)*0.5*(E+E/(E+1))) */
if (ix < 0x40360000) { /* |x|<22 */
if (ix<0x3e300000) /* |x|<2**-28 */
if(shuge+x>one) return x;/* sinh(tiny) = tiny with inexact */
t = expm1(fabs(x));
if(ix<0x3ff00000) return h*(2.0*t-t*t/(t+one));
return h*(t+t/(t+one));
}
/* |x| in [22, log(maxdouble)] return 0.5*exp(|x|) */
if (ix < 0x40862E42) return h*exp(fabs(x));
/* |x| in [log(maxdouble), overflowthresold] */
lx = *( (((*(unsigned*)&one)>>29)) + (unsigned*)&x);
if (ix<0x408633CE || ((ix==0x408633ce)&&(lx<=(unsigned)0x8fb9f87d))) {
w = exp(0.5*fabs(x));
t = h*w;
return t*w;
}
/* |x| > overflowthresold, sinh(x) overflow */
return x*shuge;
}
OVERLOADABLE double sinpi(double x)
{
double y,z, signVaule = 1.0;
int ix;
ix = 0x7fffffff&__HI(x);
if(ix<0x3fd00000) return __kernel_sin(M_PI*x,0, 0);
y = fabs(x);
if(ix >=0x7ff00000)return as_double(DF_POSITIVE_INF + 1);
z = floor(y);
if(z!=y)
{ /* inexact anyway */
y = y -z;
if(y > 0.5)
{
y = y - 0.5;
y = 0.5 - y;
}
if(z > 0)
{
ulong uz = as_ulong(z);
int expValue = ((uz & DF_EXP_MASK) >> DF_EXP_OFFSET) - DF_EXP_BIAS;
ulong manValue = ((uz & DF_MAN_MASK) | DF_IMPLICITE_ONE);
if(expValue > 52) return 1.0;
manValue = (manValue >> (52 - expValue));
if(manValue & 1)
signVaule = -1.0;
}
}
else
{
return copysign(0.0, x);
}
y = sin(y*M_PI);
return signVaule*copysign(y, x);
}
OVERLOADABLE double sqrt(double x)
{
double z;
int sign = (int)0x80000000;
unsigned r,t1,s1,ix1,q1;
int ix0,s0,q,m,t,i;
const double one = 1.0, tiny=1.0e-300;
ix0 = __HI(x); /* high word of x */
ix1 = __LO(x); /* low word of x */
/* take care of Inf and NaN */
if((ix0&0x7ff00000)==0x7ff00000) {
return x*x+x; /* sqrt(NaN)=NaN, sqrt(+inf)=+inf
sqrt(-inf)=sNaN */
}
/* take care of zero */
if(ix0<=0) {
if(((ix0&(~sign))|ix1)==0) return x;/* sqrt(+-0) = +-0 */
else if(ix0<0)
return (x-x)/(x-x); /* sqrt(-ve) = sNaN */
}
/* normalize x */
m = (ix0>>20);
if(m==0) { /* subnormal x */
while(ix0==0) {
m -= 21;
ix0 |= (ix1>>11); ix1 <<= 21;
}
for(i=0;(ix0&0x00100000)==0;i++) ix0<<=1;
m -= i-1;
ix0 |= (ix1>>(32-i));
ix1 <<= i;
}
m -= 1023; /* unbias exponent */
ix0 = (ix0&0x000fffff)|0x00100000;
if(m&1){ /* odd m, double x to make it even */
ix0 += ix0 + ((ix1&sign)>>31);
ix1 += ix1;
}
m >>= 1; /* m = [m/2] */
/* generate sqrt(x) bit by bit */
ix0 += ix0 + ((ix1&sign)>>31);
ix1 += ix1;
q = q1 = s0 = s1 = 0; /* [q,q1] = sqrt(x) */
r = 0x00200000; /* r = moving bit from right to left */
while(r!=0) {
t = s0+r;
if(t<=ix0) {
s0 = t+r;
ix0 -= t;
q += r;
}
ix0 += ix0 + ((ix1&sign)>>31);
ix1 += ix1;
r>>=1;
}
r = sign;
while(r!=0) {
t1 = s1+r;
t = s0;
if((t>31);
ix1 += ix1;
r>>=1;
}
/* use floating add to find out rounding direction */
if((ix0|ix1)!=0) {
z = one-tiny; /* trigger inexact flag */
if (z>=one) {
z = one+tiny;
if (q1==(unsigned)0xffffffff) { q1=0; q += 1;}
else if (z>one) {
if (q1==(unsigned)0xfffffffe) q+=1;
q1+=2;
} else
q1 += (q1&1);
}
}
ix0 = (q>>1)+0x3fe00000;
ix1 = q1>>1;
if ((q&1)==1) ix1 |= sign;
ix0 += (m <<20);
__setHigh(&z, ix0);
__setLow(&z, ix1);
return z;
}
double __kernel_tan(double x, double y, int iy)
{
const double xxx[] = {
3.33333333333334091986e-01, /* 3FD55555, 55555563 */
1.33333333333201242699e-01, /* 3FC11111, 1110FE7A */
5.39682539762260521377e-02, /* 3FABA1BA, 1BB341FE */
2.18694882948595424599e-02, /* 3F9664F4, 8406D637 */
8.86323982359930005737e-03, /* 3F8226E3, E96E8493 */
3.59207910759131235356e-03, /* 3F6D6D22, C9560328 */
1.45620945432529025516e-03, /* 3F57DBC8, FEE08315 */
5.88041240820264096874e-04, /* 3F4344D8, F2F26501 */
2.46463134818469906812e-04, /* 3F3026F7, 1A8D1068 */
7.81794442939557092300e-05, /* 3F147E88, A03792A6 */
7.14072491382608190305e-05, /* 3F12B80F, 32F0A7E9 */
-1.85586374855275456654e-05, /* BEF375CB, DB605373 */
2.59073051863633712884e-05, /* 3EFB2A70, 74BF7AD4 */
/* one */ 1.00000000000000000000e+00, /* 3FF00000, 00000000 */
/* pio4 */ 7.85398163397448278999e-01, /* 3FE921FB, 54442D18 */
/* pio4lo */ 3.06161699786838301793e-17 /* 3C81A626, 33145C07 */
};
#define one xxx[13]
#define pio4 xxx[14]
#define pio4lo xxx[15]
#define T xxx
double z, r, v, w, s;
int ix, hx;
hx = __HI(x); /* high word of x */
ix = hx & 0x7fffffff; /* high word of |x| */
if (ix < 0x3e300000) { /* x < 2**-28 */
if ((int) x == 0) { /* generate inexact */
if (((ix | __LO(x)) | (iy + 1)) == 0)
return one / fabs(x);
else {
if (iy == 1)
return x;
else { /* compute -1 / (x+y) carefully */
double a, t;
z = w = x + y;
__setLow(&z, 0);
v = y - (z - x);
t = a = -one / w;
__setLow(&t, 0);
s = one + t * z;
return t + a * (s + t * v);
}
}
}
}
if (ix >= 0x3FE59428) { /* |x| >= 0.6744 */
if (hx < 0) {
x = -x;
y = -y;
}
z = pio4 - x;
w = pio4lo - y;
x = z + w;
y = 0.0;
}
z = x * x;
w = z * z;
/*
* Break x^5*(T[1]+x^2*T[2]+...) into
* x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
* x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
*/
r = T[1] + w * (T[3] + w * (T[5] + w * (T[7] + w * (T[9] +
w * T[11]))));
v = z * (T[2] + w * (T[4] + w * (T[6] + w * (T[8] + w * (T[10] +
w * T[12])))));
s = z * x;
r = y + z * (s * (r + v) + y);
r += T[0] * s;
w = x + r;
if (ix >= 0x3FE59428) {
v = (double) iy;
return (double) (1 - ((hx >> 30) & 2)) *
(v - 2.0 * (x - (w * w / (w + v) - r)));
}
if (iy == 1)
return w;
else {
/*
* if allow error up to 2 ulp, simply return
* -1.0 / (x+r) here
*/
/* compute -1.0 / (x+r) accurately */
double a, t;
z = w;
__setLow(&z, 0);
v = r - (z - x); /* z+v = r+x */
t = a = -1.0 / w; /* a = -1.0/w */
__setLow(&t, 0);
s = 1.0 + t * z;
return t + a * (s + t * v);
}
#undef one
#undef pio4
#undef pio4lo
#undef T
}
OVERLOADABLE double tan(double x)
{
double y[2],z=0.0;
int n, ix;
/* High word of x. */
ix = __HI(x);
/* |x| ~< pi/4 */
ix &= 0x7fffffff;
if(ix <= 0x3fe921fb) return __kernel_tan(x,z,1);
/* tan(Inf or NaN) is NaN */
else if (ix>=0x7ff00000) return x-x; /* NaN */
/* argument reduction needed */
else {
n = __ieee754_rem_pio2(x,y);
return __kernel_tan(y[0],y[1],1-((n&1)<<1)); /* 1 -- n even
-1 -- n odd */
}
}
OVERLOADABLE double tanh(double x)
{
double one=1.0, two=2.0, tiny = 1.0e-300;
double t,z;
int jx,ix;
/* High word of |x|. */
jx = __HI(x);
ix = jx&0x7fffffff;
/* x is INF or NaN */
if(ix>=0x7ff00000) {
if (jx>=0) return one/x+one; /* tanh(+-inf)=+-1 */
else return one/x-one; /* tanh(NaN) = NaN */
}
/* |x| < 22 */
if (ix < 0x40360000) { /* |x|<22 */
if (ix<0x3c800000) /* |x|<2**-55 */
return x*(one+x); /* tanh(small) = small */
if (ix>=0x3ff00000) { /* |x|>=1 */
t = expm1(two*fabs(x));
z = one - two/(t+two);
} else {
t = expm1(-two*fabs(x));
z= -t/(t+two);
}
/* |x| > 22, return +-1 */
} else {
z = one - tiny; /* raised inexact flag */
}
return (jx>=0)? z: -z;
}
OVERLOADABLE double tanpi(double x)
{
double y,z, signValue, infsign;
ulong lx;
lx = as_ulong(x);
y = fabs(x);
signValue = (lx&DF_SIGN_MASK) ? -1.0:1.0;
infsign = signValue;
if((lx&DF_ABS_MASK) >=DF_POSITIVE_INF)return as_double(DF_POSITIVE_INF + 1);
z = floor(y);
if(z > 0)
{
ulong uz = as_ulong(z);
ulong expValue = ((uz & DF_EXP_MASK) >> DF_EXP_OFFSET) - DF_EXP_BIAS;
ulong manValue = ((uz & DF_MAN_MASK) | DF_IMPLICITE_ONE);
if(expValue > 52) copysign(0.0, x);
manValue = (manValue >> (52 - expValue));
if(manValue & 1)
infsign = signValue * -1.0;
}
if(z!=y)
{
y = y -z;
if(y > 0.5)
{
y = y -0.5;
if(y < 0.25)
return -1.0*signValue /tan(y*M_PI);
y = 0.5 - y;
return -1.0*signValue*tan(y*M_PI);
}
if((y < 0.5) && (y > 0.25))
{
y = 0.5 - y;
return signValue/tan(y*M_PI);
}
if(y == 0.5)
{
if(infsign == 1.0)
return as_double(DF_POSITIVE_INF);
else
return as_double(DF_NEGTIVE_INF);
}
}
else
{
return copysign(0.0, x);
}
y = tan(y*M_PI);
return copysign(y, x);
}
OVERLOADABLE double tgamma(double x)
{
double two52= 4.50359962737049600000e+15, /* 0x43300000, 0x00000000 */
halfD= 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
one = 1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
pi = 3.14159265358979311600e+00, /* 0x400921FB, 0x54442D18 */
a0 = 7.72156649015328655494e-02, /* 0x3FB3C467, 0xE37DB0C8 */
a1 = 3.22467033424113591611e-01, /* 0x3FD4A34C, 0xC4A60FAD */
a2 = 6.73523010531292681824e-02, /* 0x3FB13E00, 0x1A5562A7 */
a3 = 2.05808084325167332806e-02, /* 0x3F951322, 0xAC92547B */
a4 = 7.38555086081402883957e-03, /* 0x3F7E404F, 0xB68FEFE8 */
a5 = 2.89051383673415629091e-03, /* 0x3F67ADD8, 0xCCB7926B */
a6 = 1.19270763183362067845e-03, /* 0x3F538A94, 0x116F3F5D */
a7 = 5.10069792153511336608e-04, /* 0x3F40B6C6, 0x89B99C00 */
a8 = 2.20862790713908385557e-04, /* 0x3F2CF2EC, 0xED10E54D */
a9 = 1.08011567247583939954e-04, /* 0x3F1C5088, 0x987DFB07 */
a10 = 2.52144565451257326939e-05, /* 0x3EFA7074, 0x428CFA52 */
a11 = 4.48640949618915160150e-05, /* 0x3F07858E, 0x90A45837 */
tc = 1.46163214496836224576e+00, /* 0x3FF762D8, 0x6356BE3F */
tf = -1.21486290535849611461e-01, /* 0xBFBF19B9, 0xBCC38A42 */
/* tt = -(tail of tf) */
tt = -3.63867699703950536541e-18, /* 0xBC50C7CA, 0xA48A971F */
t0 = 4.83836122723810047042e-01, /* 0x3FDEF72B, 0xC8EE38A2 */
t1 = -1.47587722994593911752e-01, /* 0xBFC2E427, 0x8DC6C509 */
t2 = 6.46249402391333854778e-02, /* 0x3FB08B42, 0x94D5419B */
t3 = -3.27885410759859649565e-02, /* 0xBFA0C9A8, 0xDF35B713 */
t4 = 1.79706750811820387126e-02, /* 0x3F9266E7, 0x970AF9EC */
t5 = -1.03142241298341437450e-02, /* 0xBF851F9F, 0xBA91EC6A */
t6 = 6.10053870246291332635e-03, /* 0x3F78FCE0, 0xE370E344 */
t7 = -3.68452016781138256760e-03, /* 0xBF6E2EFF, 0xB3E914D7 */
t8 = 2.25964780900612472250e-03, /* 0x3F6282D3, 0x2E15C915 */
t9 = -1.40346469989232843813e-03, /* 0xBF56FE8E, 0xBF2D1AF1 */
t10 = 8.81081882437654011382e-04, /* 0x3F4CDF0C, 0xEF61A8E9 */
t11 = -5.38595305356740546715e-04, /* 0xBF41A610, 0x9C73E0EC */
t12 = 3.15632070903625950361e-04, /* 0x3F34AF6D, 0x6C0EBBF7 */
t13 = -3.12754168375120860518e-04, /* 0xBF347F24, 0xECC38C38 */
t14 = 3.35529192635519073543e-04, /* 0x3F35FD3E, 0xE8C2D3F4 */
u0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
u1 = 6.32827064025093366517e-01, /* 0x3FE4401E, 0x8B005DFF */
u2 = 1.45492250137234768737e+00, /* 0x3FF7475C, 0xD119BD6F */
u3 = 9.77717527963372745603e-01, /* 0x3FEF4976, 0x44EA8450 */
u4 = 2.28963728064692451092e-01, /* 0x3FCD4EAE, 0xF6010924 */
u5 = 1.33810918536787660377e-02, /* 0x3F8B678B, 0xBF2BAB09 */
v1 = 2.45597793713041134822e+00, /* 0x4003A5D7, 0xC2BD619C */
v2 = 2.12848976379893395361e+00, /* 0x40010725, 0xA42B18F5 */
v3 = 7.69285150456672783825e-01, /* 0x3FE89DFB, 0xE45050AF */
v4 = 1.04222645593369134254e-01, /* 0x3FBAAE55, 0xD6537C88 */
v5 = 3.21709242282423911810e-03, /* 0x3F6A5ABB, 0x57D0CF61 */
s0 = -7.72156649015328655494e-02, /* 0xBFB3C467, 0xE37DB0C8 */
s1 = 2.14982415960608852501e-01, /* 0x3FCB848B, 0x36E20878 */
s2 = 3.25778796408930981787e-01, /* 0x3FD4D98F, 0x4F139F59 */
s3 = 1.46350472652464452805e-01, /* 0x3FC2BB9C, 0xBEE5F2F7 */
s4 = 2.66422703033638609560e-02, /* 0x3F9B481C, 0x7E939961 */
s5 = 1.84028451407337715652e-03, /* 0x3F5E26B6, 0x7368F239 */
s6 = 3.19475326584100867617e-05, /* 0x3F00BFEC, 0xDD17E945 */
r1 = 1.39200533467621045958e+00, /* 0x3FF645A7, 0x62C4AB74 */
r2 = 7.21935547567138069525e-01, /* 0x3FE71A18, 0x93D3DCDC */
r3 = 1.71933865632803078993e-01, /* 0x3FC601ED, 0xCCFBDF27 */
r4 = 1.86459191715652901344e-02, /* 0x3F9317EA, 0x742ED475 */
r5 = 7.77942496381893596434e-04, /* 0x3F497DDA, 0xCA41A95B */
r6 = 7.32668430744625636189e-06, /* 0x3EDEBAF7, 0xA5B38140 */
w0 = 4.18938533204672725052e-01, /* 0x3FDACFE3, 0x90C97D69 */
w1 = 8.33333333333329678849e-02, /* 0x3FB55555, 0x5555553B */
w2 = -2.77777777728775536470e-03, /* 0xBF66C16C, 0x16B02E5C */
w3 = 7.93650558643019558500e-04, /* 0x3F4A019F, 0x98CF38B6 */
w4 = -5.95187557450339963135e-04, /* 0xBF4380CB, 0x8C0FE741 */
w5 = 8.36339918996282139126e-04, /* 0x3F4B67BA, 0x4CDAD5D1 */
w6 = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */
double zero= 0.00000000000000000000e+00;
double t,y,z,nadj,p,p1,p2,p3,q,r,w;
int i,hx,lx,ix;
hx = __HI(x);
lx = __LO(x);
/* purge off +-inf, NaN, +-0, and negative arguments */
ix = hx&0x7fffffff;
if(ix>=0x7ff00000) return x*x;
if((ix|lx)==0) return one/zero;
if(ix<0x3b900000) { /* |x|<2**-70, return -log(|x|) */
if(hx<0) {
return -log(-x);
} else return -log(x);
}
if(hx<0) {
if(ix>=0x43300000) /* |x|>=2**52, must be -integer */
return one/zero;
t = sinpi(x);
if(t==zero) return one/zero; /* -integer */
nadj = log(pi/fabs(t*x));
x = -x;
}
/* purge off 1 and 2 */
if((((ix-0x3ff00000)|lx)==0)||(((ix-0x40000000)|lx)==0)) r = 0;
/* for x < 2.0 */
else if(ix<0x40000000) {
if(ix<=0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */
r = -log(x);
if(ix>=0x3FE76944) {y = one-x; i= 0;}
else if(ix>=0x3FCDA661) {y= x-(tc-one); i=1;}
else {y = x; i=2;}
} else {
r = zero;
if(ix>=0x3FFBB4C3) {y=2.0-x;i=0;} /* [1.7316,2] */
else if(ix>=0x3FF3B4C4) {y=x-tc;i=1;} /* [1.23,1.73] */
else {y=x-one;i=2;}
}
switch(i) {
case 0:
z = y*y;
p1 = a0+z*(a2+z*(a4+z*(a6+z*(a8+z*a10))));
p2 = z*(a1+z*(a3+z*(a5+z*(a7+z*(a9+z*a11)))));
p = y*p1+p2;
r += (p-0.5*y); break;
case 1:
z = y*y;
w = z*y;
p1 = t0+w*(t3+w*(t6+w*(t9 +w*t12))); /* parallel comp */
p2 = t1+w*(t4+w*(t7+w*(t10+w*t13)));
p3 = t2+w*(t5+w*(t8+w*(t11+w*t14)));
p = z*p1-(tt-w*(p2+y*p3));
r += (tf + p); break;
case 2:
p1 = y*(u0+y*(u1+y*(u2+y*(u3+y*(u4+y*u5)))));
p2 = one+y*(v1+y*(v2+y*(v3+y*(v4+y*v5))));
r += (-0.5*y + p1/p2);
}
}
else if(ix<0x40200000) { /* x < 8.0 */
i = (int)x;
t = zero;
y = x-(double)i;
p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6))))));
q = one+y*(r1+y*(r2+y*(r3+y*(r4+y*(r5+y*r6)))));
r = halfD*y+p/q;
z = one; /* lgamma(1+s) = log(s) + lgamma(s) */
switch(i) {
case 7: z *= (y+6.0); /* FALLTHRU */
case 6: z *= (y+5.0); /* FALLTHRU */
case 5: z *= (y+4.0); /* FALLTHRU */
case 4: z *= (y+3.0); /* FALLTHRU */
case 3: z *= (y+2.0); /* FALLTHRU */
r += log(z); break;
}
/* 8.0 <= x < 2**58 */
} else if (ix < 0x43900000) {
t = log(x);
z = one/x;
y = z*z;
w = w0+z*(w1+y*(w2+y*(w3+y*(w4+y*(w5+y*w6)))));
r = (x-halfD)*(t-one)+w;
} else
/* 2**58 <= x <= inf */
r = x*(log(x)-one);
if(hx<0) r = nadj - r;
return exp(r);
}
OVERLOADABLE double trunc(double x)
{
double ret = floor(fabs(x));
return copysign(ret, x);
}