diff options
author | David Schleef <ds@schleef.org> | 2004-09-03 21:39:10 +0000 |
---|---|---|
committer | David Schleef <ds@schleef.org> | 2004-09-03 21:39:10 +0000 |
commit | f735872cfb3fe56aa711b6af772bf7789ac0e377 (patch) | |
tree | ac42a34402fb902b4db098ef47a1bc312b6d72c1 /liboil/dct | |
parent | 27d1241537a0974712dd7a7027dd94b0c76aeb30 (diff) | |
download | liboil-f735872cfb3fe56aa711b6af772bf7789ac0e377.tar.gz |
Move a bunch of files around.
* configure.ac:
* liboil/Makefile.am:
* liboil/build_prototypes.c: (main):
* liboil/copy/Makefile.am:
* liboil/copy/copy.h:
* liboil/copy/permute.c:
* liboil/copy/splat_ref.c: (splat_u8_ref), (splat_u32_ref),
(splat_u32_unroll2):
* liboil/copy/tablelookup_ref.c: (tablelookup_u8_ref):
* liboil/copy/trans8x8.c: (TEST_trans8x8_f64):
* liboil/copy/trans8x8_f32.c: (trans8x8_f32_ref),
(trans4x4_f32_a16_altivec), (trans8x8_f32_a16_altivec),
(TEST_trans8x8_f32):
* liboil/copy/trans8x8_s16.c: (trans8x8_s16_ref),
(trans8x8_s16_a16_altivec), (trans8x8_s16_altivecwrap),
(TEST_trans8x8_s16):
* liboil/dct/Makefile.am:
* liboil/dct/dct.h:
* liboil/dct/dct12_f32.c: (dct12_f32_ref), (dct12_f32_ref1),
(dct12_f32_mpglib), (TEST_dct12_f32):
* liboil/dct/dct36.c: (dct36):
* liboil/dct/dct36_f32.c: (dct36_f32_ref), (TEST_dct36_f32):
* liboil/dct/fdct8_f64.c: (fdct8_f64_ref), (fdct8_f64_fast),
(TEST_fdct8_f64):
* liboil/dct/fdct8x8_f64.c: (fdct8x8_f64_ref), (fdct8x8_f64_ref2),
(fdct8x8_f64_1d), (TEST_fdct8x8_f64):
* liboil/dct/fdct8x8_s16.c: (fdct8x8_s16_ref), (TEST_fdct8x8_s16):
* liboil/dct/fdct8x8s_s16.c: (fdct8x8s_s16_ref),
(TEST_fdct8x8s_s16):
* liboil/dct/idct8_f64.c: (idct8_f64_ref), (idct8_f64_fastx),
(TEST_idct8_f64):
* liboil/dct/idct8x8_c.c: (idct8x8_f64_slow), (idct8x8_f64_c),
(idct8x8_s16_slow):
* liboil/dct/idct8x8_f64.c: (idct8x8_f64_ref), (idct8x8_f64_ref2),
(idct8x8_f64_1d), (TEST_idct8x8_f64):
* liboil/dct/idct8x8_s16.c: (idct8x8_s16_ref), (idct8x8_s16_fast),
(TEST_idct8x8_s16):
* liboil/dct/idct8x8s_s16.c: (idct8x8s_s16_ref),
(TEST_idct8x8s_s16):
* liboil/dct/imdct32_f32.c: (imdct32_f32_ref),
(imdct32_f32_mpglib), (TEST_imdct32_f32):
* liboil/jpeg/Makefile.am:
* liboil/jpeg/idct8_c.c:
* liboil/jpeg/idct8x8_c.c:
* liboil/jpeg/jpeg.c:
* liboil/jpeg/jpeg.h:
* liboil/jpeg/jpeg_rgb_decoder.c:
* liboil/jpeg/quantize8x8_c.c:
* liboil/jpeg/yuv2rgb_c.c:
* liboil/jpeg/zigzag8x8_c.c:
* liboil/liboilcpu.c: (oil_cpu_i386_getflags):
* liboil/liboildebug.c: (oil_debug_print_valist):
* liboil/liboilfuncs.h:
* liboil/liboilfunction.c: (oil_class_get_by_index),
(oil_class_optimize), (oil_init_pointers), (oil_init_structs):
* liboil/liboilfunction.h:
* liboil/simdpack/Makefile.am:
* liboil/simdpack/abs.c: (abs_u8_s8_ref), (abs_u16_s16_ref),
(abs_u32_s32_ref):
* liboil/simdpack/abs_u32_s32.c:
* liboil/simdpack/average2_u8.c:
* liboil/simdpack/clip_ref.c:
* liboil/simdpack/dct12_f32.c:
* liboil/simdpack/dct36.c:
* liboil/simdpack/dct36_f32.c:
* liboil/simdpack/diffsquaresum_f64.c:
* liboil/simdpack/downsample1x_f64.c:
* liboil/simdpack/fdct8_f64.c:
* liboil/simdpack/fdct8x8_f64.c:
* liboil/simdpack/fdct8x8_s16.c:
* liboil/simdpack/fdct8x8s_s16.c:
* liboil/simdpack/get8x8_f64.c:
* liboil/simdpack/idct8_f64.c:
* liboil/simdpack/idct8x8_f64.c:
* liboil/simdpack/idct8x8_s16.c:
* liboil/simdpack/idct8x8s_s16.c:
* liboil/simdpack/imdct32_f32.c:
* liboil/simdpack/mix_u8.c:
* liboil/simdpack/mult8x8_s16.c:
* liboil/simdpack/multsum.c:
* liboil/simdpack/permute.c:
* liboil/simdpack/sad8x8.c:
* liboil/simdpack/scalaradd.c:
* liboil/simdpack/simdpack.c:
* liboil/simdpack/sincos_f64.c:
* liboil/simdpack/squaresum_f64.c:
* liboil/simdpack/sum_f64.c:
* liboil/simdpack/trans8x8.c:
* liboil/simdpack/trans8x8_f32.c:
* liboil/simdpack/trans8x8_s16.c:
* liboil/simdpack/vectoradd_f64.c:
* liboil/simdpack/zigzag8x8_s16.c:
* testsuite/Makefile.am:
* testsuite/abs.c: (test), (main):
* testsuite/introspect.c: (main):
Diffstat (limited to 'liboil/dct')
-rw-r--r-- | liboil/dct/Makefile.am | 23 | ||||
-rw-r--r-- | liboil/dct/dct.h | 33 | ||||
-rw-r--r-- | liboil/dct/dct12_f32.c | 283 | ||||
-rw-r--r-- | liboil/dct/dct36.c | 114 | ||||
-rw-r--r-- | liboil/dct/dct36_f32.c | 86 | ||||
-rw-r--r-- | liboil/dct/fdct8_f64.c | 236 | ||||
-rw-r--r-- | liboil/dct/fdct8x8_f64.c | 192 | ||||
-rw-r--r-- | liboil/dct/fdct8x8_s16.c | 126 | ||||
-rw-r--r-- | liboil/dct/fdct8x8s_s16.c | 138 | ||||
-rw-r--r-- | liboil/dct/idct8_f64.c | 172 | ||||
-rw-r--r-- | liboil/dct/idct8x8_c.c | 118 | ||||
-rw-r--r-- | liboil/dct/idct8x8_f64.c | 206 | ||||
-rw-r--r-- | liboil/dct/idct8x8_s16.c | 144 | ||||
-rw-r--r-- | liboil/dct/idct8x8s_s16.c | 148 | ||||
-rw-r--r-- | liboil/dct/imdct32_f32.c | 438 |
15 files changed, 2457 insertions, 0 deletions
diff --git a/liboil/dct/Makefile.am b/liboil/dct/Makefile.am new file mode 100644 index 0000000..5c223a4 --- /dev/null +++ b/liboil/dct/Makefile.am @@ -0,0 +1,23 @@ + +noinst_LTLIBRARIES = libdct.la + +libdct_la_SOURCES = \ + dct12_f32.c \ + dct36_f32.c \ + fdct8_f64.c \ + idct8_f64.c \ + idct8x8_c.c \ + imdct32_f32.c + +# fdct8x8_f64.c +# fdct8x8s_s16.c +# idct8x8_f64.c +# idct8x8_s16.c +# idct8x8s_s16.c + +noinst_HEADERS = \ + dct.h + +libdct_la_CFLAGS = $(LIBOIL_CFLAGS) + + diff --git a/liboil/dct/dct.h b/liboil/dct/dct.h new file mode 100644 index 0000000..04435a1 --- /dev/null +++ b/liboil/dct/dct.h @@ -0,0 +1,33 @@ +/* liboil - Library of Optimized Inner Loops + * Copyright (C) 2003 David A. Schleef <ds@schleef.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place, Suite 330, + * Boston, MA 02111-1307 USA. + */ + +#ifndef _LIBOIL_DCT_H_ +#define _LIBOIL_DCT_H_ + +#include <liboil/liboilfunction.h> + +OIL_DECLARE_CLASS(dct12_f32); +OIL_DECLARE_CLASS(dct36_f32); +OIL_DECLARE_CLASS(fdct8_f64); +OIL_DECLARE_CLASS(fdct8x8_f64); +OIL_DECLARE_CLASS(fdct8x8s_s16); +OIL_DECLARE_CLASS(idct8_f64); +OIL_DECLARE_CLASS(imdct32_f32); + +#endif + diff --git a/liboil/dct/dct12_f32.c b/liboil/dct/dct12_f32.c new file mode 100644 index 0000000..e486846 --- /dev/null +++ b/liboil/dct/dct12_f32.c @@ -0,0 +1,283 @@ +/* liboil - Library of Optimized Inner Loops + * Copyright (C) 2003 David A. Schleef <ds@schleef.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place, Suite 330, + * Boston, MA 02111-1307 USA. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> +#include <liboil/dct/dct.h> +#include <math.h> + + +OIL_DEFINE_CLASS_X (dct12_f32, "float *dest, int dstr, float *src, int sstr"); + +static void +dct12_f32_ref (float *dest, int dstr, float *src, int sstr) +{ + float in0,in1,in2,in3,in4,in5; + float *out2 = dest; + float *in = src; + float COS6_2 = cos(M_PI / 6.0 * 2.0); + float COS6_1 = cos(M_PI / 6.0 * 1.0); + float tfcos12[3]; + float wi[12]; + int i; + + for(i=0;i<3;i++){ + tfcos12[i] = 0.5 / cos ( M_PI * (i*2.0+1.0) / 12.0 ); + } + for(i=0;i<12;i++){ + wi[i] = 0.5 * sin( M_PI / 24.0 * (double) (2*i+1) ) / cos ( M_PI * (double) (2*i+7) / 24.0 ); + } + + out2[12]=out2[13]=out2[14]=out2[15]=out2[16]=out2[17]=0.0; + + in5 = in[5*3]; + in5 += (in4 = in[4*3]); + in4 += (in3 = in[3*3]); + in3 += (in2 = in[2*3]); + in2 += (in1 = in[1*3]); + in1 += (in0 = in[0*3]); + + in5 += in3; in3 += in1; + + in2 *= COS6_1; + in3 *= COS6_1; + + { + float tmp0,tmp1 = (in0 - in4); + { + float tmp2 = (in1 - in5) * tfcos12[1]; + tmp0 = tmp1 + tmp2; + tmp1 -= tmp2; + } + out2[11-1] = tmp0 * wi[11-1]; + out2[6 +1] = tmp0 * wi[6+1]; + out2[0+1] += tmp1 * wi[1]; + out2[5-1] += tmp1 * wi[5-1]; + } + + in0 += in4 * COS6_2; + + in4 = in0 + in2; + in0 -= in2; + + in1 += in5 * COS6_2; + + in5 = (in1 + in3) * tfcos12[0]; + in1 = (in1 - in3) * tfcos12[2]; + + in3 = in4 + in5; + in4 -= in5; + + in2 = in0 + in1; + in0 -= in1; + + out2[11-0] = in2 * wi[11-0]; + out2[6 +0] = in2 * wi[6+0]; + out2[6 +2] = in3 * wi[6+2]; + out2[11-2] = in3 * wi[11-2]; + + out2[0+0] += in0 * wi[0]; + out2[5-0] += in0 * wi[5-0]; + out2[0+2] += in4 * wi[2]; + out2[5-2] += in4 * wi[5-2]; +} + +OIL_DEFINE_IMPL_REF (dct12_f32_ref, dct12_f32_class); + + +static void +dct12_f32_ref1(float *dest, int dstr, float *src, int sstr) +{ + int l,m,k; + double x; + double coeff; + double cos_s; + double wi[36]; + int i; + + for(i=0;i<12;i++){ + wi[i] = sin( M_PI / 12.0 * (i+0.5)); + } + for(;i<36;i++){ + wi[i] = 0; + } + + for(l=0;l<3;l++){ + for(m=0;m<6;m++){ + x = 0; + for(k=0;k<12;k++){ + cos_s = cos( (M_PI / 24.0) * (2 * k + 7) * + (2 * m + 1) ) / 3; + coeff = wi[k] * cos_s; + x += coeff * src[k + 6*l + 6]; + } + dest[3*m + l] = x; + } + } +} +OIL_DEFINE_IMPL (dct12_f32_ref1, dct12_f32_class); + +/* copyright: from mpglib */ +/* + * new DCT12 + */ +static void +dct12_f32_mpglib(float *dest, int dstr, float *src, int sstr) +{ + float in0,in1,in2,in3,in4,in5; + float *out2 = dest; + float *in = src; + float wi[12]; + int i; + float tmp0, tmp1, tmp2; + + float COS6_2 = cos(M_PI / 6.0 * 2.0); + float COS6_1 = cos(M_PI / 6.0 * 1.0); + float tfcos12[3]; + + for(i=0;i<3;i++){ + tfcos12[i] = 0.5 / cos ( M_PI * (i*2.0+1.0) / 12.0 ); + } + for(i=0;i<12;i++){ + wi[i] = 0.5 * sin( M_PI / 24.0 * (double) (2*i+1) ) / cos ( M_PI * (double) (2*i+7) / 24.0 ); + } + + out2[12]=out2[13]=out2[14]=out2[15]=out2[16]=out2[17]=0.0; + + in5 = in[5*3] + in[4*3]; + in4 = in[4*3] + in[3*3]; + in3 = in[3*3] + in[2*3]; + in2 = in[2*3] + in[1*3]; + in1 = in[1*3] + in[0*3]; + in0 = in[0*3]; + + //in5 += in3; + in5 = in[5*3] + in[4*3] + in[3*3] + in[2*3]; + //in3 += in1; + in3 = in[3*3] + in[2*3] + in[1*3] + in[0*3]; + + //in2 *= COS6_1; + in2 = COS6_1 * (in[2*3] + in[1*3]); + + //in3 *= COS6_1; + in3 = COS6_1 * (in[3*3] + in[2*3] + in[1*3] + in[0*3]); + + //tmp1 = (in0 - in4); + tmp1 = in[0*3] - in[4*3] - in[3*3]; + + //tmp2 = (in1 - in5) * tfcos12[1]; + tmp2 = tfcos12[1] * (in[1*3] + in[0*3] - in[5*3] - in[4*3] - in[3*3] - in[2*3]); + + //tmp0 = tmp1 + tmp2; + tmp0 = (1 + tfcos12[1]) * (in[0*3] - in[4*3] - in[3*3]) + + tfcos12[1] * (in[1*3] - in[5*3] - in[2*3]); + + //tmp1 -= tmp2; + tmp1 = (1 - tfcos12[1]) * (in[0*3] - in[4*3] - in[3*3]) - + tfcos12[1] * (in[1*3] - in[5*3] - in[2*3]); + + out2[11-1] = tmp0 * wi[11-1]; + out2[6 +1] = tmp0 * wi[6+1]; + out2[0+1] += tmp1 * wi[1]; + out2[5-1] += tmp1 * wi[5-1]; + + in0 += in4 * COS6_2; + + in4 = in0 + in2; + in0 -= in2; + + in1 += in5 * COS6_2; + + in5 = (in1 + in3) * tfcos12[0]; + in1 = (in1 - in3) * tfcos12[2]; + + in3 = in4 + in5; + in4 = in4 - in5; + + in2 = in0 + in1; + in0 = in0 - in1; + + out2[11-0] = in2 * wi[11-0]; + out2[6 +0] = in2 * wi[6+0]; + out2[6 +2] = in3 * wi[6+2]; + out2[11-2] = in3 * wi[11-2]; + + out2[0+0] += in0 * wi[0]; + out2[5-0] += in0 * wi[5-0]; + out2[0+2] += in4 * wi[2]; + out2[5-2] += in4 * wi[5-2]; +} + +OIL_DEFINE_IMPL (dct12_f32_mpglib, dct12_f32_class); + + + +#ifdef TEST_dct12_f32 +int TEST_dct12_f32(void) +{ + int i; + int pass; + int failures = 0; + float *src, *dest_ref, *dest_test; + struct sl_profile_struct t; + + src = sl_malloc_f32(18); + dest_ref = sl_malloc_f32(18); + dest_test = sl_malloc_f32(18); + + sl_profile_init(t); + srand(20021001); + + printf("I: " sl_stringify(dct12_f32_FUNC) "\n"); + + for(pass=0;pass<1;pass++){ + for(i=0;i<18;i++){ + src[i]=sl_rand_f32_0_1(); + //src[i]=(i==pass+6); + } + + dct12_f32_ref(dest_ref,src); + sl_profile_start(t); + dct12_f32_FUNC(dest_test,src); + sl_profile_stop(t); + + for(i=0;i<18;i++){ + if(dest_test[i] != dest_ref[i]){ + printf("%d %g %g %g\n",i,src[i],dest_ref[i], + dest_test[i]); + } + } + } + + sl_free(src); + sl_free(dest_ref); + sl_free(dest_test); + + if(failures){ + printf("E: %d failures\n",failures); + } + + sl_profile_print(t); + + return failures; +} +#endif + diff --git a/liboil/dct/dct36.c b/liboil/dct/dct36.c new file mode 100644 index 0000000..9cfc885 --- /dev/null +++ b/liboil/dct/dct36.c @@ -0,0 +1,114 @@ +/* + DCT insipired by Jeff Tsay's DCT from the maplay package + this is an optimized version with manual unroll. + + References: + [1] S. Winograd: "On Computing the Discrete Fourier Transform", + Mathematics of Computation, Volume 32, Number 141, January 1978, + Pages 175-199 +*/ + +static void dct36(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf) +{ + { + real *in = inbuf; + + in[17]+=in[16]; in[16]+=in[15]; in[15]+=in[14]; + in[14]+=in[13]; in[13]+=in[12]; in[12]+=in[11]; + in[11]+=in[10]; in[10]+=in[9]; in[9] +=in[8]; + in[8] +=in[7]; in[7] +=in[6]; in[6] +=in[5]; + in[5] +=in[4]; in[4] +=in[3]; in[3] +=in[2]; + in[2] +=in[1]; in[1] +=in[0]; + + in[17]+=in[15]; in[15]+=in[13]; in[13]+=in[11]; in[11]+=in[9]; + in[9] +=in[7]; in[7] +=in[5]; in[5] +=in[3]; in[3] +=in[1]; + + + { + +#define MACRO0(v) { \ + real tmp; \ + out2[9+(v)] = (tmp = sum0 + sum1) * w[27+(v)]; \ + out2[8-(v)] = tmp * w[26-(v)]; } \ + sum0 -= sum1; \ + ts[SBLIMIT*(8-(v))] = out1[8-(v)] + sum0 * w[8-(v)]; \ + ts[SBLIMIT*(9+(v))] = out1[9+(v)] + sum0 * w[9+(v)]; +#define MACRO1(v) { \ + real sum0,sum1; \ + sum0 = tmp1a + tmp2a; \ + sum1 = (tmp1b + tmp2b) * tfcos36[(v)]; \ + MACRO0(v); } +#define MACRO2(v) { \ + real sum0,sum1; \ + sum0 = tmp2a - tmp1a; \ + sum1 = (tmp2b - tmp1b) * tfcos36[(v)]; \ + MACRO0(v); } + + const real *c = COS9; + real *out2 = o2; + real *w = wintab; + real *out1 = o1; + real *ts = tsbuf; + + real ta33,ta66,tb33,tb66; + + ta33 = in[2*3+0] * c[3]; + ta66 = in[2*6+0] * c[6]; + tb33 = in[2*3+1] * c[3]; + tb66 = in[2*6+1] * c[6]; + + { + real tmp1a,tmp2a,tmp1b,tmp2b; + tmp1a = in[2*1+0] * c[1] + ta33 + in[2*5+0] * c[5] + in[2*7+0] * c[7]; + tmp1b = in[2*1+1] * c[1] + tb33 + in[2*5+1] * c[5] + in[2*7+1] * c[7]; + tmp2a = in[2*0+0] + in[2*2+0] * c[2] + in[2*4+0] * c[4] + ta66 + in[2*8+0] * c[8]; + tmp2b = in[2*0+1] + in[2*2+1] * c[2] + in[2*4+1] * c[4] + tb66 + in[2*8+1] * c[8]; + + MACRO1(0); + MACRO2(8); + } + + { + real tmp1a,tmp2a,tmp1b,tmp2b; + tmp1a = ( in[2*1+0] - in[2*5+0] - in[2*7+0] ) * c[3]; + tmp1b = ( in[2*1+1] - in[2*5+1] - in[2*7+1] ) * c[3]; + tmp2a = ( in[2*2+0] - in[2*4+0] - in[2*8+0] ) * c[6] - in[2*6+0] + in[2*0+0]; + tmp2b = ( in[2*2+1] - in[2*4+1] - in[2*8+1] ) * c[6] - in[2*6+1] + in[2*0+1]; + + MACRO1(1); + MACRO2(7); + } + + { + real tmp1a,tmp2a,tmp1b,tmp2b; + tmp1a = in[2*1+0] * c[5] - ta33 - in[2*5+0] * c[7] + in[2*7+0] * c[1]; + tmp1b = in[2*1+1] * c[5] - tb33 - in[2*5+1] * c[7] + in[2*7+1] * c[1]; + tmp2a = in[2*0+0] - in[2*2+0] * c[8] - in[2*4+0] * c[2] + ta66 + in[2*8+0] * c[4]; + tmp2b = in[2*0+1] - in[2*2+1] * c[8] - in[2*4+1] * c[2] + tb66 + in[2*8+1] * c[4]; + + MACRO1(2); + MACRO2(6); + } + + { + real tmp1a,tmp2a,tmp1b,tmp2b; + tmp1a = in[2*1+0] * c[7] - ta33 + in[2*5+0] * c[1] - in[2*7+0] * c[5]; + tmp1b = in[2*1+1] * c[7] - tb33 + in[2*5+1] * c[1] - in[2*7+1] * c[5]; + tmp2a = in[2*0+0] - in[2*2+0] * c[4] + in[2*4+0] * c[8] + ta66 - in[2*8+0] * c[2]; + tmp2b = in[2*0+1] - in[2*2+1] * c[4] + in[2*4+1] * c[8] + tb66 - in[2*8+1] * c[2]; + + MACRO1(3); + MACRO2(5); + } + + { + real sum0,sum1; + sum0 = in[2*0+0] - in[2*2+0] + in[2*4+0] - in[2*6+0] + in[2*8+0]; + sum1 = (in[2*0+1] - in[2*2+1] + in[2*4+1] - in[2*6+1] + in[2*8+1] ) * tfcos36[4]; + MACRO0(4); + } + } + + } +} + diff --git a/liboil/dct/dct36_f32.c b/liboil/dct/dct36_f32.c new file mode 100644 index 0000000..061c84d --- /dev/null +++ b/liboil/dct/dct36_f32.c @@ -0,0 +1,86 @@ +/* liboil - Library of Optimized Inner Loops + * Copyright (C) 2003 David A. Schleef <ds@schleef.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place, Suite 330, + * Boston, MA 02111-1307 USA. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> +#include <liboil/dct/dct.h> + +OIL_DEFINE_CLASS_X(dct36_f32, "float *dest, int dstr, float *src, int sstr, int n"); + +static void +dct36_f32_ref(float *dest, int dstr, float *src, int sstr, int n) +{ + int i; + for(i=0;i<n;i++){ + dest[i] = src[i]; + } +} + +OIL_DEFINE_IMPL_REF (dct36_f32_ref, dct36_f32_class); + +#ifdef TEST_dct36_f32 +int TEST_dct36_f32(void) +{ + int i; + int pass; + int failures = 0; + f64 *src, *dest_ref, *dest_test; + struct sl_profile_struct t; + + src = sl_malloc_f64(N); + dest_ref = sl_malloc_f64(N); + dest_test = sl_malloc_f64(N); + + sl_profile_init(t); + srand(20021001); + + printf("I: " sl_stringify(dct36_f32_FUNC) "\n"); + + for(pass=0;pass<N_PASS;pass++){ + for(i=0;i<N;i++)src[i]=sl_rand_f64_s16(); + + dct36_f32_ref(dest_ref,src,N); + sl_profile_start(t); + dct36_f32_FUNC(dest_test,src,N); + sl_profile_stop(t); + + for(i=0;i<N;i++){ + if(dest_test[i] != dest_ref[i]){ + printf("%d %g %g %g\n",i,src[i],dest_ref[i], + dest_test[i]); + } + } + } + + sl_free(src); + sl_free(dest_ref); + sl_free(dest_test); + + if(failures){ + printf("E: %d failures\n",failures); + } + + sl_profile_print(t); + + return failures; +} +#endif + diff --git a/liboil/dct/fdct8_f64.c b/liboil/dct/fdct8_f64.c new file mode 100644 index 0000000..3fc93a0 --- /dev/null +++ b/liboil/dct/fdct8_f64.c @@ -0,0 +1,236 @@ +/* liboil - Library of Optimized Inner Loops + * Copyright (C) 2003 David A. Schleef <ds@schleef.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place, Suite 330, + * Boston, MA 02111-1307 USA. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> +#include <liboil/dct/dct.h> +#include <math.h> + +OIL_DEFINE_CLASS_X (fdct8_f64, "double *dest, double *src, int dstr, int sstr"); + +#define C0_9808 0.980785280 +#define C0_9239 0.923879532 +#define C0_8315 0.831469612 +#define C0_7071 0.707106781 +#define C0_5556 0.555570233 +#define C0_3827 0.382683432 +#define C0_1951 0.195090322 + +static void +fdct8_f64_ref (double *dest, double *src, int dstr, int sstr) +{ + static double fdct_coeff[8][8]; + static int fdct_coeff_init = 0; + int i,j; + double x; + + if(!fdct_coeff_init){ + double scale; + + for(i=0;i<8;i++){ + scale = (i==0) ? sqrt(0.125) : 0.5; + for(j=0;j<8;j++){ + fdct_coeff[j][i] = scale * + cos((M_PI/8)*i*(j+0.5)); + } + } + fdct_coeff_init = 1; + } + + for(i=0;i<8;i++){ + x = 0; + for(j=0;j<8;j++){ + x += fdct_coeff[j][i] * OIL_GET(src,sstr*j, double); + } + OIL_GET(dest,dstr*i, double) = x; + } +} + +OIL_DEFINE_IMPL_REF (fdct8_f64_ref, fdct8_f64_class); + +/* + * This algorithm is roughly similar to a Fast-Fourier transform, + * taking advantage of the symmeties of the base vectors. For + * reference, the base vectors are (horizontally): + * + * 0: 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 + * 1: 0.9808 0.8315 0.5556 0.1951 -0.1951 -0.5556 -0.8315 -0.9808 + * 2: 0.9239 0.3827 -0.3827 -0.9239 -0.9239 -0.3827 0.3827 0.9239 + * 3: 0.8315 -0.1951 -0.9808 -0.5556 0.5556 0.9808 0.1951 -0.8315 + * 4: 0.7071 -0.7071 -0.7071 0.7071 0.7071 -0.7071 -0.7071 0.7071 + * 5: 0.5556 -0.9808 0.1951 0.8315 -0.8315 -0.1951 0.9808 -0.5556 + * 6: 0.3827 -0.9239 0.9239 -0.3827 -0.3827 0.9239 -0.9239 0.3827 + * 7: 0.1951 -0.5556 0.8315 -0.9808 0.9808 -0.8315 0.5556 -0.1951 + * + * The symmetries of note: + * - even vectors are symmetric around 4 (the middle) + * - odd vectors are antisymmetric around 4 + * - 0,4 are symmertic around 2 and 6 + * - 2,6 are antisymmetic around 2 and 6 + * + * f0 = (x0 + x7) + (x1 + x6) + (x2 + x5) + (x3 + x4); + * f1 = 0.9808*(x0 - x7) + 0.8315*(x1 - x6) + 0.5556*(x2 - x5) + 0.1951*(x3 - x4) + * f2 = 0.9239*(x0 + x7) + 0.3827*(x1 + x6) - 0.3827*(x2 + x5) - 0.9239*(x3 + x4) + * f3 = 0.8315*(x0 - x7) - 0.1951*(x1 - x6) - 0.9808*(x2 - x5) - 0.5556*(x3 - x4) + * f4 = 0.7071*((x0 + x7) - (x1 + x6) - (x2 + x5) + (x3 + x4)) + * f5 = 0.5556*(x0 - x7) - 0.9808*(x1 - x6) + 0.1951*(x2 - x5) + 0.8315*(x3 - x4) + * f6 = 0.3827*(x0 + x7) - 0.9239*(x1 + x6) + 0.9239*(x2 + x5) - 0.3827*(x3 + x4) + * f7 = 0.1951*(x0 - x7) - 0.5556*(x1 - x6) + 0.8315*(x2 - x5) - 0.9808*(x3 - x4) + * + * The even vectors can be further simplified: + * + * f0 = ((x0 + x7) + (x3 + x4)) + ((x1 + x6) + (x2 + x5)); + * f2 = 0.9239*((x0 + x7) - (x3 + x4)) + 0.3827*((x1 + x6) - (x2 + x5)) + * f4 = 0.7071*((x0 + x7) + (x3 + x4)) - 0.7071*((x1 + x6) + (x2 + x5)) + * f6 = 0.3827*((x0 + x7) - (x3 + x4)) - 0.9239*((x1 + x6) - (x2 + x5)) + * + * Some implementations move some of the normalization to a later + * stage of processing, saving a few multiplies which get absorbed + * into later multiplies. However, this incurs a bit of error in + * the integer versions of this function. Also, if the CPU has a + * multiply-and-add function, you don't gain anything. + */ + +static void +fdct8_f64_fast(double *dest, double *src, int dstr, int sstr) +{ + double s07, s16, s25, s34; + double d07, d16, d25, d34; + double ss07s34, ds07s34, ss16s25, ds16s25; + + s07 = OIL_GET(src,sstr*0,double) + OIL_GET(src,sstr*7,double); + d07 = OIL_GET(src,sstr*0,double) - OIL_GET(src,sstr*7,double); + s16 = OIL_GET(src,sstr*1,double) + OIL_GET(src,sstr*6,double); + d16 = OIL_GET(src,sstr*1,double) - OIL_GET(src,sstr*6,double); + s25 = OIL_GET(src,sstr*2,double) + OIL_GET(src,sstr*5,double); + d25 = OIL_GET(src,sstr*2,double) - OIL_GET(src,sstr*5,double); + s34 = OIL_GET(src,sstr*3,double) + OIL_GET(src,sstr*4,double); + d34 = OIL_GET(src,sstr*3,double) - OIL_GET(src,sstr*4,double); + + ss07s34 = s07 + s34; + ds07s34 = s07 - s34; + ss16s25 = s16 + s25; + ds16s25 = s16 - s25; + + OIL_GET(dest,dstr*0,double) = 0.5*C0_7071*(ss07s34 + ss16s25); + OIL_GET(dest,dstr*2,double) = 0.5*(C0_9239*ds07s34 + C0_3827*ds16s25); + OIL_GET(dest,dstr*4,double) = 0.5*C0_7071*(ss07s34 - ss16s25); + OIL_GET(dest,dstr*6,double) = 0.5*(C0_3827*ds07s34 - C0_9239*ds16s25); + + OIL_GET(dest,dstr*1,double) = 0.5*(C0_9808*d07 + C0_8315*d16 + + C0_5556*d25 + C0_1951*d34); + OIL_GET(dest,dstr*3,double) = 0.5*(C0_8315*d07 - C0_1951*d16 + - C0_9808*d25 - C0_5556*d34); + OIL_GET(dest,dstr*5,double) = 0.5*(C0_5556*d07 - C0_9808*d16 + + C0_1951*d25 + C0_8315*d34); + OIL_GET(dest,dstr*7,double) = 0.5*(C0_1951*d07 - C0_5556*d16 + + C0_8315*d25 - C0_9808*d34); + +#if 0 + z1 = (ds1 tmp12 + tmp13) * 0.707106781; + OIL_GET(dest,dstr*2,double) = (tmp13 + z1)*(0.25*M_SQRT2)*0.765366864; + OIL_GET(dest,dstr*6,double) = (tmp13 - z1)*(0.25*M_SQRT2)*1.847759066; + + tmp10 = tmp4 + tmp5; + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + z5 = (tmp10 - tmp12) * 0.382683433; + z2 = 0.541196100 * tmp10 + z5; + z4 = 1.306562965 * tmp12 + z5; + z3 = tmp11 * 0.707106781; + + z11 = tmp7 + z3; + z13 = tmp7 - z3; + + OIL_GET(dest,dstr*5,double) = (z13 + z2)*(0.25*M_SQRT2)*1.2728; + OIL_GET(dest,dstr*3,double) = (z13 - z2)*(0.25*M_SQRT2)*0.8504; + OIL_GET(dest,dstr*1,double) = (z11 + z4)*(0.25*M_SQRT2)*0.7209; + OIL_GET(dest,dstr*7,double) = (z11 - z4)*(0.25*M_SQRT2)*3.6245; +#endif +} +OIL_DEFINE_IMPL (fdct8_f64_fast, fdct8_f64_class); + + + + +#ifdef TEST_fdct8_f64 +int TEST_fdct8_f64(void) +{ + int i; + int pass; + int failures = 0; + double *src, *dest_ref, *dest_test; + double sad; + double sad_sum; + double sad_max; + struct sl_profile_struct t; + + src = sl_malloc_f64(8); + dest_ref = sl_malloc_f64(8); + dest_test = sl_malloc_f64(8); + + sl_profile_init(t); + srand(20020306); + + sad_sum = 0; + sad_max = 0; + + printf("I: " sl_stringify(fdct8_f64_FUNC) "\n"); + + for(pass=0;pass<N_PASS;pass++){ + for(i=0;i<8;i++)src[i] = sl_rand_f64_0_1(); + + //block8_dump(src); + + fdct8_f64_ref(dest_test, src, 8, 8); + //block8_dump(dest_test); + + sl_profile_start(t); + fdct8_f64_FUNC(dest_ref, src, 8, 8); + sl_profile_stop(t); + //block8_dump(dest_ref); + + sad = 0; + for(i=0;i<8;i++)sad += fabs(dest_test[i] - dest_ref[i]); + if(sad_max<sad)sad_max = sad; + sad_sum += sad; + if(sad >= 1.0){ + failures++; + } + } + printf("sad average: %g\n",sad_sum/N_PASS); + printf("sad max: %g\n",sad_max); + + sl_free(src); + sl_free(dest_ref); + sl_free(dest_test); + + if(failures){ + printf("E: %d failures\n",failures); + } + + sl_profile_print(t); + + return failures; +} +#endif + diff --git a/liboil/dct/fdct8x8_f64.c b/liboil/dct/fdct8x8_f64.c new file mode 100644 index 0000000..e59b455 --- /dev/null +++ b/liboil/dct/fdct8x8_f64.c @@ -0,0 +1,192 @@ +/* liboil - Library of Optimized Inner Loops + * Copyright (C) 2003 David A. Schleef <ds@schleef.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place, Suite 330, + * Boston, MA 02111-1307 USA. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> +#include <liboil/liboilfuncs.h> +#include <liboil/dct/dct.h> +#include <math.h> + + +OIL_DEFINE_CLASS(fdct8x8_f64, NULL); + + +static void fdct8x8_f64_ref(double *dest, int dstr, double *src, int sstr) +{ + static double fdct_coeff[8][8]; + static int fdct_coeff_init = 0; + int i,j,k,l; + double tmp1,tmp2; + + if(!fdct_coeff_init){ + double scale; + + for(i=0;i<8;i++){ + scale = (i==0) ? sqrt(0.125) : 0.5; + for(j=0;j<8;j++){ + fdct_coeff[j][i] = scale * + cos((M_PI/8)*i*(j+0.5)); + } + } + fdct_coeff_init = 1; + } + + for(i=0;i<8;i++){ + for(j=0;j<8;j++){ + tmp1 = 0; + for(k=0;k<8;k++){ + tmp2 = 0; + for(l=0;l<8;l++){ + tmp2 += fdct_coeff[l][j] * + OIL_GET (src, sstr*k + l, double); + } + tmp1 += fdct_coeff[k][i] * tmp2; + } + OIL_GET (dest, dstr*i + j, double) = tmp1; + } + } +} + +OIL_DEFINE_IMPL_REF (fdct8x8_f64_ref, fdct8x8_f64_class); + +static void +fdct8x8_f64_ref2(double *dest, int dstr, double *src, int sstr) +{ + static double fdct_coeff[8][8]; + static int fdct_coeff_init = 0; + int i,j,k; + double x; + double tmp[64]; + + if(!fdct_coeff_init){ + double scale; + + for(i=0;i<8;i++){ + scale = (i==0) ? sqrt(0.125) : 0.5; + for(j=0;j<8;j++){ + fdct_coeff[j][i] = scale * + cos((M_PI/8)*i*(j+0.5)); + } + } + fdct_coeff_init = 1; + } + + for(i=0;i<8;i++){ + for(j=0;j<8;j++){ + x = 0; + for(k=0;k<8;k++){ + x += fdct_coeff[k][j] * + OIL_GET (src, sstr*i + k, double); + } + tmp[8*i+j] = x; + } + } + + for(j=0;j<8;j++){ + for(i=0;i<8;i++){ + x = 0; + for(k=0;k<8;k++){ + x += fdct_coeff[k][i] * tmp[8*k + j]; + } + OIL_GET (dest,dstr*i+j, double) = x; + } + } +} + +OIL_DEFINE_IMPL (fdct8x8_f64_ref2, fdct8x8_f64_class); + + +static void +fdct8x8_f64_1d (double *dest, int dstr, double *src, int sstr) +{ + int i; + double tmp[64]; + + for(i=0;i<8;i++){ + fdct8_f64(tmp + i*8, sizeof(double), OIL_OFFSET(src,sstr*i), + sizeof(double)); + } + + for(i=0;i<8;i++){ + fdct8_f64(dest + i, dstr, tmp + i, 8*sizeof(double)); + } +} + +OIL_DEFINE_IMPL (fdct8x8_f64_1d, fdct8x8_f64_class); + + +#ifdef TEST_fdct8x8_f64 +int TEST_fdct8x8_f64(void) +{ + int i; + int pass; + int failures = 0; + double *src, *dest_ref, *dest_test; + double sad; + double sad_sum; + double sad_max; + struct sl_profile_struct t; + + src = sl_malloc_f64(64); + dest_ref = sl_malloc_f64(64); + dest_test = sl_malloc_f64(64); + + sl_profile_init(t); + srand(20020306); + + sad_sum = 0; + sad_max = 0; + + printf("I: " sl_stringify(fdct8x8_f64_FUNC) "\n"); + + for(pass=0;pass<N_PASS;pass++){ + for(i=0;i<64;i++)src[i] = sl_rand_f64_0_1(); + + fdct8x8_f64_ref(dest_test, src, 8, 8); + sl_profile_start(t); + fdct8x8_f64_FUNC(dest_ref, src, 8, 8); + sl_profile_stop(t); + + sad = 0; + for(i=0;i<64;i++)sad += fabs(dest_test[i] - dest_ref[i]); + if(sad_max<sad)sad_max = sad; + sad_sum += sad; + if(sad >= 1.0){ + failures++; + } + } + printf("sad average: %g\n",sad_sum/N_PASS); + printf("sad max: %g\n",sad_max); + + sl_free(src); + sl_free(dest_ref); + sl_free(dest_test); + + if(failures){ + printf("E: %d failures\n",failures); + } + + sl_profile_print(t); + + return failures; +} +#endif + diff --git a/liboil/dct/fdct8x8_s16.c b/liboil/dct/fdct8x8_s16.c new file mode 100644 index 0000000..4b8f045 --- /dev/null +++ b/liboil/dct/fdct8x8_s16.c @@ -0,0 +1,126 @@ +/* forward discrete cosine transform on 8x8 block + * Copyright (C) 2001,2002 David A. Schleef <ds@schleef.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* +Kernel: fdct8x8_s16 +Description: inverse discrete cosine transform on 8x8 block + +XXX +*/ + +#ifndef _fdct8x8_s16_h_ +#define _fdct8x8_s16_h_ + +#include <math.h> + +#include <sl_types.h> +#include <sl_block8x8.h> + +/* storage class */ +#ifndef SL_fdct8x8_s16_storage + #ifdef SL_storage + #define SL_fdct8x8_s16_storage SL_storage + #else + #define SL_fdct8x8_s16_storage static inline + #endif +#endif + + + +#include <fdct8x8_f64.h> +#include <conv8x8_f64_s16.h> +/* IMPL fdct8x8_s16_ref */ +SL_fdct8x8_s16_storage +void fdct8x8_s16_ref(s16 *dest, s16 *src, int dstr, int sstr) +{ + f64 s[64], d[64]; + int i,j; + + for(i=0;i<8;i++){ + for(j=0;j<8;j++){ + block8x8_f64(s,8*sizeof(f64),i,j) = + block8x8_s16(src,sstr,i,j); + } + } + + fdct8x8_f64(d,s,8*sizeof(f64),8*sizeof(f64)); + conv8x8_f64_s16(dest,d,dstr,8*sizeof(f64)); +} +#endif + +#ifdef TEST_fdct8x8_s16 +int TEST_fdct8x8_s16(void) +{ + int i; + int pass; + int failures = 0; + s16 *src, *dest_ref, *dest_test; + u32 sad; + u32 sad_sum; + u32 sad_max; + struct sl_profile_struct t; + + src = sl_malloc_s16(64); + dest_ref = sl_malloc_s16(64); + dest_test = sl_malloc_s16(64); + + sl_profile_init(t); + srand(20020306); + + sad_sum = 0; + sad_max = 0; + + printf("I: " sl_stringify(fdct8x8_s16_FUNC) "\n"); + + for(pass=0;pass<N_PASS;pass++){ + for(i=0;i<64;i++)src[i] = sl_rand_s16_l9(); + + fdct8x8_s16_ref(dest_ref, src, 8*sizeof(s16), 8*sizeof(s16)); + sl_profile_start(t); + fdct8x8_s16_FUNC(dest_test, src, 8*sizeof(s16), 8*sizeof(s16)); + sl_profile_stop(t); + + sad = 0; + for(i=0;i<64;i++)sad += abs(dest_test[i] - dest_ref[i]); + if(sad_max<sad)sad_max = sad; + sad_sum += sad; + if(sad >= 64){ + block8x8_dump_s16(src, 8*sizeof(s16)); + block8x8_dump_s16(dest_test, 8*sizeof(s16)); + block8x8_dump_s16(dest_ref, 8*sizeof(s16)); + failures++; + } + } + printf("sad average: %g\n",((double)sad_sum)/N_PASS); + printf("sad max: %d\n",sad_max); + + sl_free(src); + sl_free(dest_ref); + sl_free(dest_test); + + if(failures){ + printf("E: %d failures\n",failures); + } + + sl_profile_print(t); + + return failures; +} +#endif + diff --git a/liboil/dct/fdct8x8s_s16.c b/liboil/dct/fdct8x8s_s16.c new file mode 100644 index 0000000..4176298 --- /dev/null +++ b/liboil/dct/fdct8x8s_s16.c @@ -0,0 +1,138 @@ +/* liboil - Library of Optimized Inner Loops + * Copyright (C) 2003 David A. Schleef <ds@schleef.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place, Suite 330, + * Boston, MA 02111-1307 USA. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> +#include <liboil/dct/dct.h> + + +OIL_DEFINE_CLASS(fdct8x8s_s16, NULL); + +#define C0_9808 0.980785280 +#define C0_9239 0.923879532 +#define C0_8315 0.831469612 +#define C0_7071 0.707106781 +#define C0_5556 0.555570233 +#define C0_3827 0.382683432 +#define C0_1951 0.195090322 + +/* +Alternate scaling used by RTjpeg. +*/ + + + +static void +fdct8x8s_s16_ref (int16_t *dest, int dstr, int16_t *src, int sstr) +{ + double s[64], d[64]; + double scale[8] = { + 4*C0_7071, + 4*C0_9808, + 4*C0_9239, + 4*C0_8315, + 4*C0_7071, + 4*C0_5556, + 4*C0_3827, + 4*C0_1951, + }; + int i,j; + + for(i=0;i<8;i++){ + for(j=0;j<8;j++){ + s[8*i+j] = OIL_GET (src,sstr*i+j, int16_t); + } + } + + fdct8x8_f64(d,8*sizeof(double),s,8*sizeof(double)); + + for(i=0;i<8;i++){ + for(j=0;j<8;j++){ + d[8*i+j] *= scale[i] * scale[j]; + } + } + + conv8x8_s16_f64(dest,dstr,d,8*sizeof(double)); +} + +OIL_DEFINE_IMPL_REF (fdct8x8s_s16_ref, fdct8x8s_s16_class); + +#ifdef TEST_fdct8x8s_s16 +int TEST_fdct8x8s_s16(void) +{ + int i; + int pass; + int failures = 0; + int16_t *src, *dest_ref, *dest_test; + u32 sad; + u32 sad_sum; + u32 sad_max; + struct sl_profile_struct t; + + src = sl_malloc_s16(64); + dest_ref = sl_malloc_s16(64); + dest_test = sl_malloc_s16(64); + + sl_profile_init(t); + srand(20020306); + + sad_sum = 0; + sad_max = 0; + + printf("I: " sl_stringify(fdct8x8s_s16_FUNC) "\n"); + + for(pass=0;pass<N_PASS;pass++){ + for(i=0;i<64;i++)src[i] = sl_rand_s16_l9(); + + fdct8x8s_s16_ref(dest_ref, src, 8*sizeof(s16), 8*sizeof(s16)); + sl_profile_start(t); + fdct8x8s_s16_FUNC(dest_test, src, 8*sizeof(s16), 8*sizeof(s16)); + sl_profile_stop(t); + + sad = 0; + for(i=0;i<64;i++)sad += abs(dest_test[i] - dest_ref[i]); + if(sad_max<sad)sad_max = sad; + sad_sum += sad; + if(sad >= 128){ + //block8x8_dump_s16(src, 8*sizeof(s16)); + //block8x8_dump_s16(dest_ref, 8*sizeof(s16)); + //block8x8_dump_s16(dest_test, 8*sizeof(s16)); + //block8x8_dumpdiff_s16(dest_test, dest_ref, 8*sizeof(s16)); + failures++; + } + } + printf("sad average: %g\n",((double)sad_sum)/N_PASS); + printf("sad max: %d\n",sad_max); + + sl_free(src); + sl_free(dest_ref); + sl_free(dest_test); + + if(failures){ + printf("E: %d failures\n",failures); + } + + sl_profile_print(t); + + return failures; +} +#endif + diff --git a/liboil/dct/idct8_f64.c b/liboil/dct/idct8_f64.c new file mode 100644 index 0000000..fa3a952 --- /dev/null +++ b/liboil/dct/idct8_f64.c @@ -0,0 +1,172 @@ +/* liboil - Library of Optimized Inner Loops + * Copyright (C) 2003 David A. Schleef <ds@schleef.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place, Suite 330, + * Boston, MA 02111-1307 USA. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> +#include <liboil/dct/dct.h> +#include <math.h> + +OIL_DEFINE_CLASS_X (idct8_f64, "double *dest, int dstr, double *src, int sstr"); + +#define C0_9808 0.980785280 +#define C0_9239 0.923879532 +#define C0_8315 0.831469612 +#define C0_7071 0.707106781 +#define C0_5556 0.555570233 +#define C0_3827 0.382683432 +#define C0_1951 0.195090322 + +static void idct8_f64_ref(double *dest, int dstr, double *src, int sstr) +{ + static double idct_coeff[8][8]; + static int idct_coeff_init = 0; + int i,j; + double x; + + if(!idct_coeff_init){ + double scale; + + for(i=0;i<8;i++){ + scale = (i==0) ? sqrt(0.125) : 0.5; + for(j=0;j<8;j++){ + idct_coeff[j][i] = scale * + cos((M_PI/8)*i*(j+0.5)); + } + } + idct_coeff_init = 1; + } + + for(i=0;i<8;i++){ + x = 0; + for(j=0;j<8;j++){ + x += idct_coeff[i][j] * OIL_GET (src, sstr*j, double); + } + OIL_GET (dest, dstr*i, double) = x; + } +} + +OIL_DEFINE_IMPL_REF (idct8_f64_ref, idct8_f64_class); + + +static void idct8_f64_fastx(double *dest, int dstr, double *src, int sstr) +{ + double s07, s16, s25, s34; + double d07, d16, d25, d34; + double ss07s34, ss16s25; + double ds07s34, ds16s25; + + ss07s34 = C0_7071*(OIL_GET(src,sstr*0, double) + OIL_GET(src,sstr*4, double)); + ss16s25 = C0_7071*(OIL_GET(src,sstr*0, double) - OIL_GET(src,sstr*4, double)); + + ds07s34 = C0_9239* OIL_GET(src,sstr*2, double) + C0_3827* OIL_GET(src,sstr*6, double); + ds16s25 = C0_3827* OIL_GET(src,sstr*2, double) - C0_9239* OIL_GET(src,sstr*6, double); + + s07 = ss07s34 + ds07s34; + s34 = ss07s34 - ds07s34; + + s16 = ss16s25 + ds16s25; + s25 = ss16s25 - ds16s25; + + d07 = C0_9808* OIL_GET(src,sstr*1, double) + C0_8315* OIL_GET(src,sstr*3, double) + + C0_5556* OIL_GET(src,sstr*5, double) + C0_1951* OIL_GET(src,sstr*7, double); + d16 = C0_8315* OIL_GET(src,sstr*1, double) - C0_1951* OIL_GET(src,sstr*3, double) + - C0_9808* OIL_GET(src,sstr*5, double) - C0_5556* OIL_GET(src,sstr*7, double); + d25 = C0_5556* OIL_GET(src,sstr*1, double) - C0_9808* OIL_GET(src,sstr*3, double) + + C0_1951* OIL_GET(src,sstr*5, double) + C0_8315* OIL_GET(src,sstr*7, double); + d34 = C0_1951* OIL_GET(src,sstr*1, double) - C0_5556* OIL_GET(src,sstr*3, double) + + C0_8315* OIL_GET(src,sstr*5, double) - C0_9808* OIL_GET(src,sstr*7, double); + + OIL_GET(dest,dstr*0, double) = 0.5 * (s07 + d07); + OIL_GET(dest,dstr*1, double) = 0.5 * (s16 + d16); + OIL_GET(dest,dstr*2, double) = 0.5 * (s25 + d25); + OIL_GET(dest,dstr*3, double) = 0.5 * (s34 + d34); + OIL_GET(dest,dstr*4, double) = 0.5 * (s34 - d34); + OIL_GET(dest,dstr*5, double) = 0.5 * (s25 - d25); + OIL_GET(dest,dstr*6, double) = 0.5 * (s16 - d16); + OIL_GET(dest,dstr*7, double) = 0.5 * (s07 - d07); + +} + +OIL_DEFINE_IMPL (idct8_f64_fastx, idct8_f64_class); + + +#ifdef TEST_idct8_f64 +int TEST_idct8_f64(void) +{ + int i; + int pass; + int failures = 0; + double *src, *dest_ref, *dest_test; + double sad; + double sad_sum; + double sad_max; + struct sl_profile_struct t; + + src = sl_malloc_f64(8); + dest_ref = sl_malloc_f64(8); + dest_test = sl_malloc_f64(8); + + sl_profile_init(t); + srand(20020306); + + sad_sum = 0; + sad_max = 0; + + printf("I: " sl_stringify(idct8_f64_FUNC) "\n"); + + for(pass=0;pass<N_PASS;pass++){ + for(i=0;i<8;i++)src[i] = sl_rand_f64_0_1(); + + //block8_dump(src); + + idct8_f64_ref(dest_test, src, 8, 8); + //block8_dump(dest_test); + + sl_profile_start(t); + idct8_f64_FUNC(dest_ref, src, 8, 8); + sl_profile_stop(t); + //block8_dump(dest_ref); + + sad = 0; + for(i=0;i<8;i++)sad += fabs(dest_test[i] - dest_ref[i]); + if(sad_max<sad)sad_max = sad; + sad_sum += sad; + if(sad >= 1.0){ + failures++; + } + } + printf("sad average: %g\n",sad_sum/N_PASS); + printf("sad max: %g\n",sad_max); + + sl_free(src); + sl_free(dest_ref); + sl_free(dest_test); + + if(failures){ + printf("E: %d failures\n",failures); + } + + sl_profile_print(t); + + return failures; +} +#endif + diff --git a/liboil/dct/idct8x8_c.c b/liboil/dct/idct8x8_c.c new file mode 100644 index 0000000..648065c --- /dev/null +++ b/liboil/dct/idct8x8_c.c @@ -0,0 +1,118 @@ +/* liboil - Library of Optimized Inner Loops + * Copyright (C) 2001,2002,2003 David A. Schleef <ds@schleef.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place, Suite 330, + * Boston, MA 02111-1307 USA. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboil.h> +#include <liboil/dct/dct.h> +#include <math.h> + +/* for bootstrapping */ +#ifndef idct8x8_f64 +extern OilFunctionClass _oil_function_idct8x8_f64_class; +#define idct8x8_f64 ((void (*)(double *dest, int dstr, double *src, int sstr)) \ + _oil_function_idct8x8_f64_class.func) +#endif + +#define BLOCK8x8_F64(ptr, stride, row, column) \ + (*((double *)((void *)ptr + stride*row) + column)) + +#define BLOCK8x8_PTR_F64(ptr, stride, row, column) \ + ((double *)((void *)ptr + stride*row) + column) + +#define BLOCK8x8_S16(ptr, stride, row, column) \ + (*((int16_t *)((void *)ptr + stride*row) + column)) + +OIL_DEFINE_CLASS_X (idct8x8_f64, "double *dest, int dstr, double *src, int sstr"); +OIL_DEFINE_CLASS_X (idct8x8_s16, "int16_t *dest, int dstr, int16_t *src, int sstr"); + +static void +idct8x8_f64_slow (double *dest, int dstr, double *src, int sstr) +{ + static double idct_coeff[8][8]; + static int idct_coeff_init = 0; + int i,j,k,l; + double tmp1,tmp2; + + if(!idct_coeff_init){ + double scale; + + for(i=0;i<8;i++){ + scale = (i==0) ? sqrt(0.125) : 0.5; + for(j=0;j<8;j++){ + idct_coeff[j][i] = scale * + cos((M_PI/8)*i*(j+0.5)); + } + } + idct_coeff_init = 1; + } + + for(i=0;i<8;i++){ + for(j=0;j<8;j++){ + tmp1 = 0; + for(k=0;k<8;k++){ + tmp2 = 0; + for(l=0;l<8;l++){ + tmp2 += idct_coeff[j][l] * + BLOCK8x8_F64(src,sstr,k,l); + } + tmp1 += idct_coeff[i][k] * tmp2; + } + BLOCK8x8_F64(dest,dstr,i,j) = tmp1; + } + } +} + +OIL_DEFINE_IMPL (idct8x8_f64_slow, idct8x8_f64_class); + +static void +idct8x8_f64_c (double *dest, int dstr, double *src, int sstr) +{ + int i; + double tmp[64]; + int tmpstr = 8*sizeof(double); + + for(i=0;i<8;i++){ + idct8_f64( + BLOCK8x8_PTR_F64(tmp,tmpstr,i,0), sizeof(double), + BLOCK8x8_PTR_F64(src,sstr,i,0), sizeof(double)); + } + for(i=0;i<8;i++){ + idct8_f64( + BLOCK8x8_PTR_F64(dest,dstr,0,i), dstr, + BLOCK8x8_PTR_F64(tmp,tmpstr,0,i), tmpstr); + } +} + +OIL_DEFINE_IMPL_DEPENDS (idct8x8_f64_c, idct8x8_f64_class, idct8_f64_class); + +static void +idct8x8_s16_slow (int16_t *dest, int dstr, int16_t *src, int sstr) +{ + double s[64], d[64]; + + conv8x8_f64_s16 (s,8*sizeof(double),src,sstr); + idct8x8_f64 (d,8*sizeof(double),s,8*sizeof(double)); + conv8x8_s16_f64 (dest,dstr,d,8*sizeof(double)); +} + +OIL_DEFINE_IMPL_DEPENDS (idct8x8_s16_slow, idct8x8_s16_class, + conv8x8_f64_s16_class, idct8x8_f64_class, conv8x8_s16_f64); + diff --git a/liboil/dct/idct8x8_f64.c b/liboil/dct/idct8x8_f64.c new file mode 100644 index 0000000..9e46523 --- /dev/null +++ b/liboil/dct/idct8x8_f64.c @@ -0,0 +1,206 @@ +/* inverse discrete cosine transform on 8x8 block + * Copyright (C) 2001,2002 David A. Schleef <ds@schleef.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* +Kernel: idct8x8_f64 +Description: inverse discrete cosine transform on 8x8 block + +XXX +*/ + +#ifndef _idct8x8_f64_h_ +#define _idct8x8_f64_h_ + +#include <math.h> + +#include <sl_types.h> +#include <sl_block8x8.h> + +/* storage class */ +#ifndef SL_idct8x8_f64_storage + #ifdef SL_storage + #define SL_idct8x8_f64_storage SL_storage + #else + #define SL_idct8x8_f64_storage static inline + #endif +#endif + + +/* IMPL idct8x8_f64_ref */ +SL_idct8x8_f64_storage +void idct8x8_f64_ref(f64 *dest, f64 *src, int dstr, int sstr) +{ + static f64 idct_coeff[8][8]; + static int idct_coeff_init = 0; + int i,j,k,l; + f64 tmp1,tmp2; + + if(!idct_coeff_init){ + f64 scale; + + for(i=0;i<8;i++){ + scale = (i==0) ? sqrt(0.125) : 0.5; + for(j=0;j<8;j++){ + idct_coeff[j][i] = scale * + cos((M_PI/8)*i*(j+0.5)); + } + } + idct_coeff_init = 1; + } + + for(i=0;i<8;i++){ + for(j=0;j<8;j++){ + tmp1 = 0; + for(k=0;k<8;k++){ + tmp2 = 0; + for(l=0;l<8;l++){ + tmp2 += idct_coeff[j][l] * + block8x8_f64(src,sstr,k,l); + } + tmp1 += idct_coeff[i][k] * tmp2; + } + block8x8_f64(dest,dstr,i,j) = tmp1; + } + } +} + +/* IMPL idct8x8_f64_ref2 */ +SL_idct8x8_f64_storage +void idct8x8_f64_ref2(f64 *dest, f64 *src, int dstr, int sstr) +{ + static f64 idct_coeff[8][8]; + static int idct_coeff_init = 0; + int i,j,k; + f64 x; + f64 tmp[64]; + + if(!idct_coeff_init){ + f64 scale; + + for(i=0;i<8;i++){ + scale = (i==0) ? sqrt(0.125) : 0.5; + for(j=0;j<8;j++){ + idct_coeff[j][i] = scale * + cos((M_PI/8)*i*(j+0.5)); + } + } + idct_coeff_init = 1; + } + + for(i=0;i<8;i++){ + for(j=0;j<8;j++){ + x = 0; + for(k=0;k<8;k++){ + x += idct_coeff[j][k] * + block8x8_f64(src,sstr,i,k); + } + tmp[8*i+j] = x; + } + } + + for(j=0;j<8;j++){ + for(i=0;i<8;i++){ + x = 0; + for(k=0;k<8;k++){ + x += idct_coeff[i][k] * tmp[8*k + j]; + } + block8x8_f64(dest,dstr,i,j) = x; + } + } +} + +#define f64_addr(base,str,i) ((f64 *)((void *)(base) + (str)*(i))) + +#include <idct8_f64.h> +/* IMPL idct8x8_f64_1d */ +SL_idct8x8_f64_storage +void idct8x8_f64_1d(f64 *dest, f64 *src, int dstr, int sstr) +{ + int i; + f64 tmp[64]; + + for(i=0;i<8;i++){ + idct8_f64_fast(tmp + i*8, f64_addr(src,sstr,i), sizeof(f64), sizeof(f64)); + } + for(i=0;i<8;i++){ + idct8_f64_fast(dest + i, tmp + i, dstr, 8*sizeof(f64)); + } +} + + +#endif + + +#ifdef TEST_idct8x8_f64 +int TEST_idct8x8_f64(void) +{ + int i; + int pass; + int failures = 0; + f64 *src, *dest_ref, *dest_test; + f64 sad; + f64 sad_sum; + f64 sad_max; + struct sl_profile_struct t; + + src = sl_malloc_f64(64); + dest_ref = sl_malloc_f64(64); + dest_test = sl_malloc_f64(64); + + sl_profile_init(t); + srand(20020306); + + sad_sum = 0; + sad_max = 0; + + printf("I: " sl_stringify(idct8x8_f64_FUNC) "\n"); + + for(pass=0;pass<N_PASS;pass++){ + for(i=0;i<64;i++)src[i] = sl_rand_f64_0_1(); + + idct8x8_f64_ref(dest_test, src, 8, 8); + sl_profile_start(t); + idct8x8_f64_FUNC(dest_ref, src, 8, 8); + sl_profile_stop(t); + + sad = 0; + for(i=0;i<64;i++)sad += fabs(dest_test[i] - dest_ref[i]); + if(sad_max<sad)sad_max = sad; + sad_sum += sad; + if(sad >= 1.0){ + failures++; + } + } + printf("sad average: %g\n",sad_sum/N_PASS); + printf("sad max: %g\n",sad_max); + + sl_free(src); + sl_free(dest_ref); + sl_free(dest_test); + + if(failures){ + printf("E: %d failures\n",failures); + } + + sl_profile_print(t); + + return failures; +} +#endif + diff --git a/liboil/dct/idct8x8_s16.c b/liboil/dct/idct8x8_s16.c new file mode 100644 index 0000000..1e58026 --- /dev/null +++ b/liboil/dct/idct8x8_s16.c @@ -0,0 +1,144 @@ +/* forward discrete cosine transform on 8x8 block + * Copyright (C) 2001,2002 David A. Schleef <ds@schleef.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* +Kernel: idct8x8_s16 +Description: inverse discrete cosine transform on 8x8 block + +XXX +*/ + +#ifndef _idct8x8_s16_h_ +#define _idct8x8_s16_h_ + +#include <math.h> + +#include <sl_types.h> +#include <sl_block8x8.h> + +/* storage class */ +#ifndef SL_idct8x8_s16_storage + #ifdef SL_storage + #define SL_idct8x8_s16_storage SL_storage + #else + #define SL_idct8x8_s16_storage static inline + #endif +#endif + + + +#include <idct8x8_f64.h> +#include <conv8x8_f64_s16.h> +/* IMPL idct8x8_s16_ref */ +SL_idct8x8_s16_storage +void idct8x8_s16_ref(s16 *dest, s16 *src, int dstr, int sstr) +{ + f64 s[64], d[64]; + int i,j; + + for(i=0;i<8;i++){ + for(j=0;j<8;j++){ + block8x8_f64(s,8*sizeof(f64),i,j) = + block8x8_s16(src,sstr,i,j); + } + } + + idct8x8_f64_ref(d,s,8*sizeof(f64),8*sizeof(f64)); + conv8x8_f64_s16_ref(dest,d,dstr,8*sizeof(f64)); +} + +/* IMPL idct8x8_s16_fast */ +SL_idct8x8_s16_storage +void idct8x8_s16_fast(s16 *dest, s16 *src, int dstr, int sstr) +{ + f64 s[64], d[64]; + int i,j; + + for(i=0;i<8;i++){ + for(j=0;j<8;j++){ + block8x8_f64(s,8*sizeof(f64),i,j) = + block8x8_s16(src,sstr,i,j); + } + } + + idct8x8_f64(d,s,8*sizeof(f64),8*sizeof(f64)); + conv8x8_f64_s16(dest,d,dstr,8*sizeof(f64)); +} +#endif + +#ifdef TEST_idct8x8_s16 +int TEST_idct8x8_s16(void) +{ + int i; + int pass; + int failures = 0; + s16 *src, *dest_ref, *dest_test; + u32 sad; + u32 sad_sum; + u32 sad_max; + struct sl_profile_struct t; + + src = sl_malloc_s16(64); + dest_ref = sl_malloc_s16(64); + dest_test = sl_malloc_s16(64); + + sl_profile_init(t); + srand(20020306); + + sad_sum = 0; + sad_max = 0; + + printf("I: " sl_stringify(idct8x8_s16_FUNC) "\n"); + + for(pass=0;pass<N_PASS;pass++){ + for(i=0;i<64;i++)src[i] = sl_rand_s16_l9(); + + idct8x8_s16_ref(dest_ref, src, 8*sizeof(s16), 8*sizeof(s16)); + sl_profile_start(t); + idct8x8_s16_FUNC(dest_test, src, 8*sizeof(s16), 8*sizeof(s16)); + sl_profile_stop(t); + + sad = 0; + for(i=0;i<64;i++)sad += abs(dest_test[i] - dest_ref[i]); + if(sad_max<sad)sad_max = sad; + sad_sum += sad; + if(sad >= 64){ + block8x8_dump_s16(src, 8*sizeof(s16)); + block8x8_dump_s16(dest_test, 8*sizeof(s16)); + block8x8_dump_s16(dest_ref, 8*sizeof(s16)); + failures++; + } + } + printf("sad average: %g\n",((double)sad_sum)/N_PASS); + printf("sad max: %d\n",sad_max); + + sl_free(src); + sl_free(dest_ref); + sl_free(dest_test); + + if(failures){ + printf("E: %d failures\n",failures); + } + + sl_profile_print(t); + + return failures; +} +#endif + diff --git a/liboil/dct/idct8x8s_s16.c b/liboil/dct/idct8x8s_s16.c new file mode 100644 index 0000000..59e9709 --- /dev/null +++ b/liboil/dct/idct8x8s_s16.c @@ -0,0 +1,148 @@ +/* inverse discrete cosine transform on 8x8 block + * Copyright (C) 2001,2002 David A. Schleef <ds@schleef.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* +Kernel: idct8x8s_s16 +Description: inverse discrete cosine transform on 8x8 block + +Alternate scaling used by RTjpeg. +*/ + +#ifndef _idct8x8s_s16_h_ +#define _idct8x8s_s16_h_ + +#include <math.h> + +#include <sl_types.h> +#include <sl_block8x8.h> + +/* storage class */ +#ifndef SL_idct8x8s_s16_storage + #ifdef SL_storage + #define SL_idct8x8s_s16_storage SL_storage + #else + #define SL_idct8x8s_s16_storage static inline + #endif +#endif + + +/* extras */ +#include <rtjpeg/idct8x8s_s16.h> + + +#include <idct8x8_f64.h> +#include <conv8x8_f64_s16.h> +/* IMPL idct8x8s_s16_ref */ +SL_idct8x8s_s16_storage +void idct8x8s_s16_ref(s16 *dest, s16 *src, int dstr, int sstr) +{ + f64 s[64], d[64]; + const f64 scale[8] = { + 2.0/C0_7071, + 2.0/C0_9808, + 2.0/C0_9239, + 2.0/C0_8315, + 2.0/C0_7071, + 2.0/C0_5556, + 2.0/C0_3827, + 2.0/C0_1951, + }; + int i,j; + + for(i=0;i<8;i++){ + for(j=0;j<8;j++){ + block8x8_f64(s,8*sizeof(f64),i,j) = + block8x8_s16(src,sstr,i,j); + } + } + + for(i=0;i<8;i++){ + for(j=0;j<8;j++){ + block8x8_f64(s,8*sizeof(f64),i,j) *= scale[i] * scale[j]; + } + } + + idct8x8_f64(d,s,8*sizeof(f64),8*sizeof(f64)); + + conv8x8_f64_s16(dest,d,dstr,8*sizeof(f64)); +} +#endif + +#ifdef TEST_idct8x8s_s16 +int TEST_idct8x8s_s16(void) +{ + int i; + int pass; + int failures = 0; + s16 *src, *dest_ref, *dest_test; + u32 sad; + u32 sad_sum; + u32 sad_max; + struct sl_profile_struct t; + + src = sl_malloc_s16(64); + dest_ref = sl_malloc_s16(64); + dest_test = sl_malloc_s16(64); + + sl_profile_init(t); + srand(20020306); + + sad_sum = 0; + sad_max = 0; + + printf("I: " sl_stringify(idct8x8s_s16_FUNC) "\n"); + + for(pass=0;pass<N_PASS;pass++){ + //for(i=0;i<64;i++)src[i] = sl_rand_s16_l9(); + for(i=0;i<64;i++)src[i] = (i==pass)*1000; + + idct8x8s_s16_ref(dest_ref, src, 8*sizeof(s16), 8*sizeof(s16)); + sl_profile_start(t); + idct8x8s_s16_FUNC(dest_test, src, 8*sizeof(s16), 8*sizeof(s16)); + sl_profile_stop(t); + + sad = 0; + for(i=0;i<64;i++)sad += abs(dest_test[i] - dest_ref[i]); + if(sad_max<sad)sad_max = sad; + sad_sum += sad; + if(sad >= 128){ + block8x8_dump_s16(src, 8*sizeof(s16)); + block8x8_dump_s16(dest_ref, 8*sizeof(s16)); + block8x8_dump_s16(dest_test, 8*sizeof(s16)); + block8x8_dumpratio_s16(dest_test, dest_ref, 8*sizeof(s16)); + failures++; + } + } + printf("sad average: %g\n",((double)sad_sum)/N_PASS); + printf("sad max: %d\n",sad_max); + + sl_free(src); + sl_free(dest_ref); + sl_free(dest_test); + + if(failures){ + printf("E: %d failures\n",failures); + } + + sl_profile_print(t); + + return failures; +} +#endif + diff --git a/liboil/dct/imdct32_f32.c b/liboil/dct/imdct32_f32.c new file mode 100644 index 0000000..0a39950 --- /dev/null +++ b/liboil/dct/imdct32_f32.c @@ -0,0 +1,438 @@ +/* liboil - Library of Optimized Inner Loops + * Copyright (C) 2003 David A. Schleef <ds@schleef.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place, Suite 330, + * Boston, MA 02111-1307 USA. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> +#include <liboil/dct/dct.h> +#include <math.h> + +OIL_DEFINE_CLASS_X (imdct32_f32, "float *dest, float *src"); + +static void imdct32_f32_ref (float *dest, float *src) +{ + double x; + int i,j; + double coeff; + + for(i=0;i<32;i++){ + x = 0; + for(j=0;j<32;j++){ + coeff = cos((M_PI/32)*i*(j+0.5)); + x += coeff * src[j]; + } + dest[i] = x; + } +} + +OIL_DEFINE_IMPL_REF (imdct32_f32_ref, imdct32_f32_class); + +/* from mpglib */ +/* + * Discrete Cosine Tansform (DCT) for subband synthesis + * optimized for machines with no auto-increment. + * The performance is highly compiler dependend. Maybe + * the dct64.c version for 'normal' processor may be faster + * even for Intel processors. + */ + +static void imdct32_f32_mpglib(float *dest,float *samples) +{ + static float cos64[16]; + static float cos32[16]; + static float cos16[16]; + static float cos8[16]; + static float cos4[16]; + float b1[32]; + float b2[32]; + static int done = 0; + + if(!done){ + int k; + + done = 1; + for(k=0;k<16;k++){ + cos64[k] = 1.0 / (2.0 * cos(M_PI * (k * 2.0 + 1.0) / 64.0)); + cos32[k] = 1.0 / (2.0 * cos(M_PI * (k * 2.0 + 1.0) / 32.0)); + cos16[k] = 1.0 / (2.0 * cos(M_PI * (k * 2.0 + 1.0) / 16.0)); + cos8[k] = 1.0 / (2.0 * cos(M_PI * (k * 2.0 + 1.0) / 8.0)); + cos4[k] = 1.0 / (2.0 * cos(M_PI * (k * 2.0 + 1.0) / 4.0)); + } + } + + { + float *costab = cos64; + + b1[0x00] = samples[0x00] + samples[0x1F]; + b1[0x1F] = (samples[0x00] - samples[0x1F]) * costab[0x0]; + + b1[0x01] = samples[0x01] + samples[0x1E]; + b1[0x1E] = (samples[0x01] - samples[0x1E]) * costab[0x1]; + + b1[0x02] = samples[0x02] + samples[0x1D]; + b1[0x1D] = (samples[0x02] - samples[0x1D]) * costab[0x2]; + + b1[0x03] = samples[0x03] + samples[0x1C]; + b1[0x1C] = (samples[0x03] - samples[0x1C]) * costab[0x3]; + + b1[0x04] = samples[0x04] + samples[0x1B]; + b1[0x1B] = (samples[0x04] - samples[0x1B]) * costab[0x4]; + + b1[0x05] = samples[0x05] + samples[0x1A]; + b1[0x1A] = (samples[0x05] - samples[0x1A]) * costab[0x5]; + + b1[0x06] = samples[0x06] + samples[0x19]; + b1[0x19] = (samples[0x06] - samples[0x19]) * costab[0x6]; + + b1[0x07] = samples[0x07] + samples[0x18]; + b1[0x18] = (samples[0x07] - samples[0x18]) * costab[0x7]; + + b1[0x08] = samples[0x08] + samples[0x17]; + b1[0x17] = (samples[0x08] - samples[0x17]) * costab[0x8]; + + b1[0x09] = samples[0x09] + samples[0x16]; + b1[0x16] = (samples[0x09] - samples[0x16]) * costab[0x9]; + + b1[0x0A] = samples[0x0A] + samples[0x15]; + b1[0x15] = (samples[0x0A] - samples[0x15]) * costab[0xA]; + + b1[0x0B] = samples[0x0B] + samples[0x14]; + b1[0x14] = (samples[0x0B] - samples[0x14]) * costab[0xB]; + + b1[0x0C] = samples[0x0C] + samples[0x13]; + b1[0x13] = (samples[0x0C] - samples[0x13]) * costab[0xC]; + + b1[0x0D] = samples[0x0D] + samples[0x12]; + b1[0x12] = (samples[0x0D] - samples[0x12]) * costab[0xD]; + + b1[0x0E] = samples[0x0E] + samples[0x11]; + b1[0x11] = (samples[0x0E] - samples[0x11]) * costab[0xE]; + + b1[0x0F] = samples[0x0F] + samples[0x10]; + b1[0x10] = (samples[0x0F] - samples[0x10]) * costab[0xF]; + } + + + { + float *costab = cos32; + + b2[0x00] = b1[0x00] + b1[0x0F]; + b2[0x0F] = (b1[0x00] - b1[0x0F]) * costab[0]; + b2[0x01] = b1[0x01] + b1[0x0E]; + b2[0x0E] = (b1[0x01] - b1[0x0E]) * costab[1]; + b2[0x02] = b1[0x02] + b1[0x0D]; + b2[0x0D] = (b1[0x02] - b1[0x0D]) * costab[2]; + b2[0x03] = b1[0x03] + b1[0x0C]; + b2[0x0C] = (b1[0x03] - b1[0x0C]) * costab[3]; + b2[0x04] = b1[0x04] + b1[0x0B]; + b2[0x0B] = (b1[0x04] - b1[0x0B]) * costab[4]; + b2[0x05] = b1[0x05] + b1[0x0A]; + b2[0x0A] = (b1[0x05] - b1[0x0A]) * costab[5]; + b2[0x06] = b1[0x06] + b1[0x09]; + b2[0x09] = (b1[0x06] - b1[0x09]) * costab[6]; + b2[0x07] = b1[0x07] + b1[0x08]; + b2[0x08] = (b1[0x07] - b1[0x08]) * costab[7]; + + b2[0x10] = b1[0x10] + b1[0x1F]; + b2[0x1F] = (b1[0x1F] - b1[0x10]) * costab[0]; + b2[0x11] = b1[0x11] + b1[0x1E]; + b2[0x1E] = (b1[0x1E] - b1[0x11]) * costab[1]; + b2[0x12] = b1[0x12] + b1[0x1D]; + b2[0x1D] = (b1[0x1D] - b1[0x12]) * costab[2]; + b2[0x13] = b1[0x13] + b1[0x1C]; + b2[0x1C] = (b1[0x1C] - b1[0x13]) * costab[3]; + b2[0x14] = b1[0x14] + b1[0x1B]; + b2[0x1B] = (b1[0x1B] - b1[0x14]) * costab[4]; + b2[0x15] = b1[0x15] + b1[0x1A]; + b2[0x1A] = (b1[0x1A] - b1[0x15]) * costab[5]; + b2[0x16] = b1[0x16] + b1[0x19]; + b2[0x19] = (b1[0x19] - b1[0x16]) * costab[6]; + b2[0x17] = b1[0x17] + b1[0x18]; + b2[0x18] = (b1[0x18] - b1[0x17]) * costab[7]; + } + + { + float *costab = cos16; + + b1[0x00] = b2[0x00] + b2[0x07]; + b1[0x07] = (b2[0x00] - b2[0x07]) * costab[0]; + b1[0x01] = b2[0x01] + b2[0x06]; + b1[0x06] = (b2[0x01] - b2[0x06]) * costab[1]; + b1[0x02] = b2[0x02] + b2[0x05]; + b1[0x05] = (b2[0x02] - b2[0x05]) * costab[2]; + b1[0x03] = b2[0x03] + b2[0x04]; + b1[0x04] = (b2[0x03] - b2[0x04]) * costab[3]; + + b1[0x08] = b2[0x08] + b2[0x0F]; + b1[0x0F] = (b2[0x0F] - b2[0x08]) * costab[0]; + b1[0x09] = b2[0x09] + b2[0x0E]; + b1[0x0E] = (b2[0x0E] - b2[0x09]) * costab[1]; + b1[0x0A] = b2[0x0A] + b2[0x0D]; + b1[0x0D] = (b2[0x0D] - b2[0x0A]) * costab[2]; + b1[0x0B] = b2[0x0B] + b2[0x0C]; + b1[0x0C] = (b2[0x0C] - b2[0x0B]) * costab[3]; + + b1[0x10] = b2[0x10] + b2[0x17]; + b1[0x17] = (b2[0x10] - b2[0x17]) * costab[0]; + b1[0x11] = b2[0x11] + b2[0x16]; + b1[0x16] = (b2[0x11] - b2[0x16]) * costab[1]; + b1[0x12] = b2[0x12] + b2[0x15]; + b1[0x15] = (b2[0x12] - b2[0x15]) * costab[2]; + b1[0x13] = b2[0x13] + b2[0x14]; + b1[0x14] = (b2[0x13] - b2[0x14]) * costab[3]; + + b1[0x18] = b2[0x18] + b2[0x1F]; + b1[0x1F] = (b2[0x1F] - b2[0x18]) * costab[0]; + b1[0x19] = b2[0x19] + b2[0x1E]; + b1[0x1E] = (b2[0x1E] - b2[0x19]) * costab[1]; + b1[0x1A] = b2[0x1A] + b2[0x1D]; + b1[0x1D] = (b2[0x1D] - b2[0x1A]) * costab[2]; + b1[0x1B] = b2[0x1B] + b2[0x1C]; + b1[0x1C] = (b2[0x1C] - b2[0x1B]) * costab[3]; + } + + { + float cos0 = cos8[0]; + float cos1 = cos8[1]; + + b2[0x00] = b1[0x00] + b1[0x03]; + b2[0x03] = (b1[0x00] - b1[0x03]) * cos0; + b2[0x01] = b1[0x01] + b1[0x02]; + b2[0x02] = (b1[0x01] - b1[0x02]) * cos1; + + b2[0x04] = b1[0x04] + b1[0x07]; + b2[0x07] = (b1[0x07] - b1[0x04]) * cos0; + b2[0x05] = b1[0x05] + b1[0x06]; + b2[0x06] = (b1[0x06] - b1[0x05]) * cos1; + + b2[0x08] = b1[0x08] + b1[0x0B]; + b2[0x0B] = (b1[0x08] - b1[0x0B]) * cos0; + b2[0x09] = b1[0x09] + b1[0x0A]; + b2[0x0A] = (b1[0x09] - b1[0x0A]) * cos1; + + b2[0x0C] = b1[0x0C] + b1[0x0F]; + b2[0x0F] = (b1[0x0F] - b1[0x0C]) * cos0; + b2[0x0D] = b1[0x0D] + b1[0x0E]; + b2[0x0E] = (b1[0x0E] - b1[0x0D]) * cos1; + + b2[0x10] = b1[0x10] + b1[0x13]; + b2[0x13] = (b1[0x10] - b1[0x13]) * cos0; + b2[0x11] = b1[0x11] + b1[0x12]; + b2[0x12] = (b1[0x11] - b1[0x12]) * cos1; + + b2[0x14] = b1[0x14] + b1[0x17]; + b2[0x17] = (b1[0x17] - b1[0x14]) * cos0; + b2[0x15] = b1[0x15] + b1[0x16]; + b2[0x16] = (b1[0x16] - b1[0x15]) * cos1; + + b2[0x18] = b1[0x18] + b1[0x1B]; + b2[0x1B] = (b1[0x18] - b1[0x1B]) * cos0; + b2[0x19] = b1[0x19] + b1[0x1A]; + b2[0x1A] = (b1[0x19] - b1[0x1A]) * cos1; + + b2[0x1C] = b1[0x1C] + b1[0x1F]; + b2[0x1F] = (b1[0x1F] - b1[0x1C]) * cos0; + b2[0x1D] = b1[0x1D] + b1[0x1E]; + b2[0x1E] = (b1[0x1E] - b1[0x1D]) * cos1; + } + + { + float cos0 = cos4[0]; + + b1[0x00] = b2[0x00] + b2[0x01]; + b1[0x01] = (b2[0x00] - b2[0x01]) * cos0; + b1[0x02] = b2[0x02] + b2[0x03]; + b1[0x03] = (b2[0x03] - b2[0x02]) * cos0; + b1[0x02] += b1[0x03]; + + b1[0x04] = b2[0x04] + b2[0x05]; + b1[0x05] = (b2[0x04] - b2[0x05]) * cos0; + b1[0x06] = b2[0x06] + b2[0x07]; + b1[0x07] = (b2[0x07] - b2[0x06]) * cos0; + b1[0x06] += b1[0x07]; + b1[0x04] += b1[0x06]; + b1[0x06] += b1[0x05]; + b1[0x05] += b1[0x07]; + + b1[0x08] = b2[0x08] + b2[0x09]; + b1[0x09] = (b2[0x08] - b2[0x09]) * cos0; + b1[0x0A] = b2[0x0A] + b2[0x0B]; + b1[0x0B] = (b2[0x0B] - b2[0x0A]) * cos0; + b1[0x0A] += b1[0x0B]; + + b1[0x0C] = b2[0x0C] + b2[0x0D]; + b1[0x0D] = (b2[0x0C] - b2[0x0D]) * cos0; + b1[0x0E] = b2[0x0E] + b2[0x0F]; + b1[0x0F] = (b2[0x0F] - b2[0x0E]) * cos0; + b1[0x0E] += b1[0x0F]; + b1[0x0C] += b1[0x0E]; + b1[0x0E] += b1[0x0D]; + b1[0x0D] += b1[0x0F]; + + b1[0x10] = b2[0x10] + b2[0x11]; + b1[0x11] = (b2[0x10] - b2[0x11]) * cos0; + b1[0x12] = b2[0x12] + b2[0x13]; + b1[0x13] = (b2[0x13] - b2[0x12]) * cos0; + b1[0x12] += b1[0x13]; + + b1[0x14] = b2[0x14] + b2[0x15]; + b1[0x15] = (b2[0x14] - b2[0x15]) * cos0; + b1[0x16] = b2[0x16] + b2[0x17]; + b1[0x17] = (b2[0x17] - b2[0x16]) * cos0; + b1[0x16] += b1[0x17]; + b1[0x14] += b1[0x16]; + b1[0x16] += b1[0x15]; + b1[0x15] += b1[0x17]; + + b1[0x18] = b2[0x18] + b2[0x19]; + b1[0x19] = (b2[0x18] - b2[0x19]) * cos0; + b1[0x1A] = b2[0x1A] + b2[0x1B]; + b1[0x1B] = (b2[0x1B] - b2[0x1A]) * cos0; + b1[0x1A] += b1[0x1B]; + + b1[0x1C] = b2[0x1C] + b2[0x1D]; + b1[0x1D] = (b2[0x1C] - b2[0x1D]) * cos0; + b1[0x1E] = b2[0x1E] + b2[0x1F]; + b1[0x1F] = (b2[0x1F] - b2[0x1E]) * cos0; + b1[0x1E] += b1[0x1F]; + b1[0x1C] += b1[0x1E]; + b1[0x1E] += b1[0x1D]; + b1[0x1D] += b1[0x1F]; + } + + dest[ 0] = b1[0x00]; + dest[ 4] = b1[0x04]; + dest[ 8] = b1[0x02]; + dest[12] = b1[0x06]; +// dest[0x10* 0] = b1[0x01]; /* I think this is wrong */ + dest[16] = b1[0x01]; + dest[20] = b1[0x05]; + dest[24] = b1[0x03]; + dest[28] = b1[0x07]; + + b1[0x08] += b1[0x0C]; + dest[2] = b1[0x08]; + b1[0x0C] += b1[0x0a]; + dest[6] = b1[0x0C]; + b1[0x0A] += b1[0x0E]; + dest[10] = b1[0x0A]; + b1[0x0E] += b1[0x09]; + dest[14] = b1[0x0E]; + b1[0x09] += b1[0x0D]; + dest[18] = b1[0x09]; + b1[0x0D] += b1[0x0B]; + dest[22] = b1[0x0D]; + b1[0x0B] += b1[0x0F]; + dest[26] = b1[0x0B]; + dest[30] = b1[0x0F]; + + b1[0x18] += b1[0x1C]; + dest[1] = b1[0x10] + b1[0x18]; + dest[3] = b1[0x18] + b1[0x14]; + b1[0x1C] += b1[0x1a]; + dest[5] = b1[0x14] + b1[0x1C]; + dest[7] = b1[0x1C] + b1[0x12]; + b1[0x1A] += b1[0x1E]; + dest[9] = b1[0x12] + b1[0x1A]; + dest[11] = b1[0x1A] + b1[0x16]; + b1[0x1E] += b1[0x19]; + dest[13] = b1[0x16] + b1[0x1E]; + dest[15] = b1[0x1E] + b1[0x11]; + b1[0x19] += b1[0x1D]; + dest[17] = b1[0x11] + b1[0x19]; + dest[19] = b1[0x19] + b1[0x15]; + b1[0x1D] += b1[0x1B]; + dest[21] = b1[0x15] + b1[0x1D]; + dest[23] = b1[0x1D] + b1[0x13]; + b1[0x1B] += b1[0x1F]; + dest[25] = b1[0x13] + b1[0x1B]; + dest[27] = b1[0x1B] + b1[0x17]; + dest[29] = b1[0x17] + b1[0x1F]; + dest[31] = b1[0x1F]; +} + +OIL_DEFINE_IMPL (imdct32_f32_mpglib, imdct32_f32_class); + + + +#ifdef TEST_imdct32_f32 +int TEST_imdct32_f32(void) +{ + int i; + int pass; + int failures = 0; + f32 *src, *dest_ref, *dest_test; + struct sl_profile_struct t; + double sad; + double sad_max = 0; + double sad_sum = 0; + + src = sl_malloc_f32(32); + dest_ref = sl_malloc_f32(32); + dest_test = sl_malloc_f32(32); + + sl_profile_init(t); + srand(20021001); + + printf("I: " sl_stringify(imdct32_f32_FUNC) "\n"); + + for(pass=0;pass<N_PASS;pass++){ + for(i=0;i<32;i++)src[i]=sl_rand_f32_0_1(); + + imdct32_f32_ref(dest_ref,src); + sl_profile_start(t); + imdct32_f32_FUNC(dest_test,src); + sl_profile_stop(t); + + sad = 0; + for(i=0;i<32;i++){ + sad += fabs(dest_test[i] - dest_ref[i]); + } + if(sad>sad_max)sad_max = sad; + sad_sum += sad; +#if 0 + if(sad>0){ + printf("sad = %g\n",sad); + } +#endif +#if 0 + if(dest_test[i] != dest_ref[i]){ + printf("%d %g %g\n",i,dest_ref[i], dest_test[i]); + } +#endif + } + + printf("sad ave = %g\n",sad_sum/N_PASS); + printf("sad max = %g\n",sad_max); + + sl_free(src); + sl_free(dest_ref); + sl_free(dest_test); + + if(failures){ + printf("E: %d failures\n",failures); + } + + sl_profile_print(t); + + return failures; +} +#endif + |