backend: refine pow function

Now save 40% time than before (1) group many branches which deal with corner case to one branch. (2) using HW exp2 and log2 to replace some instructions pass conformance tests and utest Signed-off-by: rander.wang <rander.wang@intel.com> Reviewed-by: Yang Rong <rong.r.yang@intel.com>
author: rander <rander.wang@intel.com> 2017-06-22 17:41:17 +0800
committer: Yang Rong <rong.r.yang@intel.com> 2017-07-04 09:57:57 +0800
commit: 7e1e128818c96736a5953e9ff9a566e680bec98a (patch)
tree: 9f679eecd1078ece365b0d0ad4c44c39f5ba11d4
parent: 6fde3e082897975bacde1134c4a8096420fa5ab9 (diff)
download: beignet-7e1e128818c96736a5953e9ff9a566e680bec98a.tar.gz
1 files changed, 148 insertions, 146 deletions
diff --git a/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl b/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl
index b4764ee3..c0ab2514 100644
--- a/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl
@@ -2382,7 +2382,8 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
   float z,ax,z_h,z_l,p_h,p_l;
   float y1,t1,t2,r,s,sn,t,u,v,w;
   int i,j,k,yisint,n;
-  int hx,hy,ix,iy,is;
+  int hy,ix,iy,is;
+  unsigned int hx;
   float bp,dp_h,dp_l,
   zero    =  0.0,
   one	=  1.0,
@@ -2412,17 +2413,17 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
   float retVal = 0.0f;
   bool bRet = false;
 
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  hx = as_uint(x);
   GEN_OCL_GET_FLOAT_WORD(hy,y);
   ax   = __gen_ocl_fabs(x);
   ix = as_int(ax);  iy = as_int(fabs(y));
 
-  if(iy < 0x00800000 || hx==0x3f800000)
+  if(iy < 0x00800000)
   {
      bRet = true;
      retVal = one;
   }
-  else if (ix > 0x7f800000 || iy > 0x7f800000)
+  else if (iy > 0x7f800000)
   {
        bRet = true;
        retVal = NAN;
@@ -2433,120 +2434,152 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
      * yisint = 1	... y is an odd int
      * yisint = 2	... y is an even int
      */
-  yisint  = 0;
-  if(hx<0) {
-    k = (iy>>23)-0x7f;		 /* exponent */
-    j = iy>>(23-k);
-    yisint = (iy>=0x3f800000 && (j<<(23-k))==iy)? 2-(j&1):yisint;
-    yisint = (iy>=0x4b800000) ? 2:yisint;
-  }
-
-    /* special value of x */
-  if(ix==0x7f800000||ix==0||ix==0x3f800000){
-    z = ax;			/*x is +-0,+-inf,+-1*/
+  sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
 
-    z = (hy < 0)? one/z:z;
-    z = ((hx<0) && (((ix-0x3f800000)|yisint)==0))? NAN:z;
-    z = ((hx<0) && (yisint==1))? -z:z;
+  if(hx >= 0x7f800000)
+  {
+    yisint  = 0;
+    n = (hx>>31)-1;
 
-    retVal = (bRet)? retVal:z;
-    bRet = true;
-  }
+    if (!retVal && ix > 0x7f800000)
+    {
+      bRet = true;
+      retVal = NAN;
+    }
 
-  n = ((uint)hx>>31)-1;
+    if(hx >= 0x80000000) {
+      k = (iy>>23)-0x7f;		 /* exponent */
+      j = iy>>(23-k);
+      yisint = (iy>=0x3f800000 && (j<<(23-k))==iy)? 2-(j&1):yisint;
+      yisint = (iy>=0x4b800000) ? 2:yisint;
+    }
 
-  /* (x<0)**(non-int) is NaN */
-  if(!bRet && (n|yisint)==0)
-  {
-     bRet= true;
-     retVal = NAN;
-  }
+      /* special value of x */
+    if(ix==0x7f800000||ix==0||ix==0x3f800000){
+      z = ax;     /*x is +-0,+-inf,+-1*/
+      z = (hy < 0)? one/z:z;
+      z = (((ix-0x3f800000)|yisint)==0)? NAN:z;
+      z = (yisint==1)? -z:z;
+      retVal = (bRet)? retVal:z;
+      bRet = true;
+    }
 
-  sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
-  if((n|(yisint-1))==0) sn = -one;/* (-ve)**(odd int) */
+    /* (x<0)**(non-int) is NaN */
+    if(!bRet && (n|yisint)==0)
+    {
+       bRet= true;
+       retVal = NAN;
+    }
 
-  /* |y| is huge */
-  if(iy>0x4d000000)
-  { /* if |y| > 2**27 */
-	/* over/underflow if x is not close to one */
-	/* special value of y */
-	float b1 = (hy>=0)? y: zero;
-	float b2 = (hy<0)?-y: zero;
-	b1 = (ix > 0x3f800000)? b1:b2;
-	retVal = (iy==0x7f800000 && !bRet)? b1:retVal;
-	bRet = (iy==0x7f800000 && !bRet)? true: bRet;
+    if((n|(yisint-1))==0) sn = -one;/* (-ve)**(odd int) */
+  }
 
+    /* special value of x */
+  if((ix&0x7fffff) == 0) {
+    if(hx == 0x3f800000)
+    {
+      retVal = one;
+      bRet = true;
+    }
 
-	b1 = (hy>0)? sn*huge*huge:0;
-	retVal = (ix>0x3f800007 && !bRet)? b1:retVal;
-	bRet = (ix>0x3f800007 && !bRet)? true:bRet;
+    if(ix==0x7f800000||ix==0) {
+      z = ax;			/*x is +0,+inf*/
+      z = (hy < 0)? one/z:z;
+      retVal = (bRet)? retVal:z;
+      bRet = true;
+    }
+  }
 
-	/* now |1-x| is tiny <= 2**-20, suffice to compute
-	      log(x) by x-x^2/2+x^3/3-x^4/4 */
-	t = ax-1;		/* t has 20 trailing zeros */
-	w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f));
-	u = ivln2_h*t;	/* ivln2_h has 16 sig. bits */
-	v = t*ivln2_l-w*ivln2;
-	t1 = u+v;
-	GEN_OCL_GET_FLOAT_WORD(is,t1);
-	GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
-	t2 = v-(t1-u);
-  } 
-  else
+  /* |y| is not huge */
+  if(iy <= 0x4d000000)
   {
-	float s2,s_h,s_l,t_h,t_l;
-	n = 0;
-	n  += ((ix)>>23)-0x7f;
-	j  = ix&0x007fffff;
-	/* determine interval */
-	ix = j|0x3f800000; /* normalize ix */
-
-	n = (j >= 0x5db3d7)?n+1:n;
-	ix = (j >= 0x5db3d7)? ix - 0x00800000:ix;
-	k = (j<=0x1cc471 || j >= 0x5db3d7)? 0:1;
+    float s2,s_h,s_l,t_h,t_l;
+    n  = ((ix)>>23)-0x7f;
+    j  = ix&0x007fffff;
+    /* determine interval */
+    ix = j|0x3f800000; /* normalize ix */
 
-	GEN_OCL_SET_FLOAT_WORD(ax,ix);
+    if(x > 0x1.0p-6 && x < 0x1.2p7 && fabs(y) < 0x1.0p4)
+    {
+      GEN_OCL_SET_FLOAT_WORD(ax,ix);
+      t1 = n;
+      t2 = native_log2(ax);
+    }
+    else
+    {
+      n = (j >= 0x5db3d7)?n+1:n;
+      ix = (j >= 0x5db3d7)? ix - 0x00800000:ix;
+      k = (j<=0x1cc471 || j >= 0x5db3d7)? 0:1;
+
+      GEN_OCL_SET_FLOAT_WORD(ax,ix);
+
+      bp = k? 1.5:bp;
+      dp_h = k? 5.84960938e-01:dp_h;
+      dp_l = k? 1.56322085e-06:dp_l;
+
+      /* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
+      u = ax-bp;		/* bp[0]=1.0, bp[1]=1.5 */
+      v = one/(ax+bp);
+      s = u*v;
+      s_h = s;
+      GEN_OCL_GET_FLOAT_WORD(is,s_h);
+      GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
+      /* t_h=ax+bp[k] High */
+      is = ((ix>>1)&0xfffff000)|0x20000000;
+      GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21));
+      t_l = ax - (t_h-bp);
+      s_l = v*(mad(-s_h, t_l, mad(-s_h, t_h, u)));
+
+      /* compute log(ax) */
+      s2 = s*s;
+      r = s2*s2*(mad(s2, L2, L1));
+      r = mad(s_l, (s_h+s), r);
+      t_h = mad(s_h, s_h, 3.0f)+r;
+      GEN_OCL_GET_FLOAT_WORD(is,t_h);
+      GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
+      t_l = r-(mad(-s_h, s_h, (t_h-3.0f)));
+      /* u+v = s*(1+...) */
+      u = s_h*t_h;
+      v = mad(s_l, t_h, t_l*s);
+      /* 2/(3log2)*(s+...) */
+      p_h = mad(s_h, t_h, v);
+      GEN_OCL_GET_FLOAT_WORD(is,p_h);
+      GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
+      p_l = v-(mad(-s_h, t_h, p_h));
+      z_l = mad(cp_l, p_h, mad(p_l, cp, dp_l));
+      /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
+      t = (float)n;
+      t1 = ((mad(cp_h, p_h, z_l)+dp_h)+t);
+      GEN_OCL_GET_FLOAT_WORD(is,t1);
+      GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
+      t2 = z_l-mad(-cp_h, p_h, ((t1-t)-dp_h));
+    }
+  }
+  else
+  { /* if |y| > 2**27 */
+    /* over/underflow if x is not close to one */
+    /* special value of y */
+    float b1 = (hy>=0)? y: zero;
+    float b2 = (hy<0)?-y: zero;
+    b1 = (ix > 0x3f800000)? b1:b2;
+    retVal = (iy==0x7f800000 && !bRet)? b1:retVal;
+    bRet = (iy==0x7f800000 && !bRet)? true: bRet;
 
-	bp = k? 1.5:bp;
-	dp_h = k? 5.84960938e-01:dp_h;
-	dp_l = k? 1.56322085e-06:dp_l;
 
-	/* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
-	u = ax-bp;		/* bp[0]=1.0, bp[1]=1.5 */
-	v = one/(ax+bp);
-	s = u*v;
-	s_h = s;
-	GEN_OCL_GET_FLOAT_WORD(is,s_h);
-	GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
-	/* t_h=ax+bp[k] High */
-	is = ((ix>>1)&0xfffff000)|0x20000000;
-	GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21));
-	t_l = ax - (t_h-bp);
-	s_l = v*(mad(-s_h, t_l, mad(-s_h, t_h, u)));
+    b1 = (hy>0)? sn*huge*huge:0;
+    retVal = (ix>0x3f800007 && !bRet)? b1:retVal;
+    bRet = (ix>0x3f800007 && !bRet)? true:bRet;
 
-	/* compute log(ax) */
-	s2 = s*s;
-	r = s2*s2*(mad(s2, L2, L1));
-	r = mad(s_l, (s_h+s), r);
-	t_h = mad(s_h, s_h, 3.0f)+r;
-	GEN_OCL_GET_FLOAT_WORD(is,t_h);
-	GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
-	t_l = r-(mad(-s_h, s_h, (t_h-3.0f)));
-	/* u+v = s*(1+...) */
-	u = s_h*t_h;
-	v = mad(s_l, t_h, t_l*s);
-	/* 2/(3log2)*(s+...) */
-	p_h = mad(s_h, t_h, v);
-	GEN_OCL_GET_FLOAT_WORD(is,p_h);
-	GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
-	p_l = v-(mad(-s_h, t_h, p_h));
-	z_l = mad(cp_l, p_h, mad(p_l, cp, dp_l));
-	/* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
-	t = (float)n;
-	t1 = ((mad(cp_h, p_h, z_l)+dp_h)+t);
-	GEN_OCL_GET_FLOAT_WORD(is,t1);
-	GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
-	t2 = z_l-mad(-cp_h, p_h, ((t1-t)-dp_h));
+    /* now |1-x| is tiny <= 2**-20, suffice to compute
+          log(x) by x-x^2/2+x^3/3-x^4/4 */
+    t = ax-1;		/* t has 20 trailing zeros */
+    w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f));
+    u = ivln2_h*t;	/* ivln2_h has 16 sig. bits */
+    v = t*ivln2_l-w*ivln2;
+    t1 = u+v;
+    GEN_OCL_GET_FLOAT_WORD(is,t1);
+    GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
+    t2 = v-(t1-u);
   }
 
   /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
@@ -2554,52 +2587,21 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
   GEN_OCL_SET_FLOAT_WORD(y1,is&0xffffe000);
   p_l = mad((y-y1), t1, y*t2);
   p_h = y1*t1;
-  z = mad(y1, t1, p_l);
 
-  GEN_OCL_GET_FLOAT_WORD(j,z);
-  if((j&0x7fffffff) >= 0x43000000 && !bRet)
+  z = mad(y1, t1, p_l);
+  if(fabs(z) >= 128.0f)
   {
-     retVal = ((j&0x7fffffff)>0x43160000)? 0.0:retVal;
-     retVal = (j > 0x43000000) ? sn*huge*huge:retVal;
-     retVal = (j == 0x43000000 && p_l+ovt>z-p_h)? sn*huge*huge:retVal;
-     retVal = (j == 0xc3160000 && p_l<=z-p_h)? 0.0:retVal;
-     bRet = (((j&0x7fffffff)>0x43160000) || (j == 0xc3160000 && p_l<=z-p_h) || retVal == sn*huge*huge)? true:false;
-  }
-
-  /*
-    * compute 2**(p_h+p_l)
-    */
-  i = j&0x7fffffff;
-  k = (i>>23)-0x7f;
-  n = 0;
-  if(i>0x3f000000) {		/* if |z| > 0.5, set n = [z+0.5] */
-    n = j+(0x00800000>>(k+1));
-    k = ((n&0x7fffffff)>>23)-0x7f;	/* new k for n */
-    GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
-    n = ((n&0x007fffff)|0x00800000)>>(23-k);
-    n = (j<0)?-n:n;
-    p_h = mad(y1, t1, -t);
+    if(!bRet)
+    {
+       retVal = (fabs(z) > 150.0f)? 0.0:retVal;
+       retVal = (z > 128.0f) ? sn*INFINITY:retVal;
+       retVal = (z == 128.0f && p_l+ovt>z-p_h)? sn*INFINITY:retVal;
+       retVal = (z == -150.0f && p_l<=z-p_h)? 0.0:retVal;
+       bRet = ((fabs(z) > 150.0f) || (z == -150.0f && p_l<=z-p_h) || retVal == sn*INFINITY)? true:false;
+    }
   }
 
-  t = p_l+p_h;
-  GEN_OCL_GET_FLOAT_WORD(is,t);
-  GEN_OCL_SET_FLOAT_WORD(t,is&0xffff8000);
-
-  v = mad((p_l-(t-p_h)), lg2, t*lg2_l);
-  z = mad(t, lg2_h, v);
-  w = v-mad(-t, lg2_h, z);
-  t  = z*z;
-  t1  = mad(-t, (mad(t, P2, P1)), z);
-  r  = (z*t1)/(t1-two)-(mad(z, w, w));
-  z  = one-(r-z);
-
-  GEN_OCL_GET_FLOAT_WORD(j,z);
-  j += (n<<23);
-
-  if((j>>23)<=0)
-	z = 0;//__gen_ocl_scalbnf(z,n);	/* subnormal output */
-  else
-	GEN_OCL_SET_FLOAT_WORD(z,j);
+  z = exp2(p_l)*exp2(p_h);
 
   return bRet ? retVal:sn*z;
 }
author	rander <rander.wang@intel.com>	2017-06-22 17:41:17 +0800
committer	Yang Rong <rong.r.yang@intel.com>	2017-07-04 09:57:57 +0800
commit	7e1e128818c96736a5953e9ff9a566e680bec98a (patch)
tree	9f679eecd1078ece365b0d0ad4c44c39f5ba11d4
parent	6fde3e082897975bacde1134c4a8096420fa5ab9 (diff)
download	beignet-7e1e128818c96736a5953e9ff9a566e680bec98a.tar.gz