Move a bunch of files around.

* configure.ac: * liboil/Makefile.am: * liboil/build_prototypes.c: (main): * liboil/copy/Makefile.am: * liboil/copy/copy.h: * liboil/copy/permute.c: * liboil/copy/splat_ref.c: (splat_u8_ref), (splat_u32_ref), (splat_u32_unroll2): * liboil/copy/tablelookup_ref.c: (tablelookup_u8_ref): * liboil/copy/trans8x8.c: (TEST_trans8x8_f64): * liboil/copy/trans8x8_f32.c: (trans8x8_f32_ref), (trans4x4_f32_a16_altivec), (trans8x8_f32_a16_altivec), (TEST_trans8x8_f32): * liboil/copy/trans8x8_s16.c: (trans8x8_s16_ref), (trans8x8_s16_a16_altivec), (trans8x8_s16_altivecwrap), (TEST_trans8x8_s16): * liboil/dct/Makefile.am: * liboil/dct/dct.h: * liboil/dct/dct12_f32.c: (dct12_f32_ref), (dct12_f32_ref1), (dct12_f32_mpglib), (TEST_dct12_f32): * liboil/dct/dct36.c: (dct36): * liboil/dct/dct36_f32.c: (dct36_f32_ref), (TEST_dct36_f32): * liboil/dct/fdct8_f64.c: (fdct8_f64_ref), (fdct8_f64_fast), (TEST_fdct8_f64): * liboil/dct/fdct8x8_f64.c: (fdct8x8_f64_ref), (fdct8x8_f64_ref2), (fdct8x8_f64_1d), (TEST_fdct8x8_f64): * liboil/dct/fdct8x8_s16.c: (fdct8x8_s16_ref), (TEST_fdct8x8_s16): * liboil/dct/fdct8x8s_s16.c: (fdct8x8s_s16_ref), (TEST_fdct8x8s_s16): * liboil/dct/idct8_f64.c: (idct8_f64_ref), (idct8_f64_fastx), (TEST_idct8_f64): * liboil/dct/idct8x8_c.c: (idct8x8_f64_slow), (idct8x8_f64_c), (idct8x8_s16_slow): * liboil/dct/idct8x8_f64.c: (idct8x8_f64_ref), (idct8x8_f64_ref2), (idct8x8_f64_1d), (TEST_idct8x8_f64): * liboil/dct/idct8x8_s16.c: (idct8x8_s16_ref), (idct8x8_s16_fast), (TEST_idct8x8_s16): * liboil/dct/idct8x8s_s16.c: (idct8x8s_s16_ref), (TEST_idct8x8s_s16): * liboil/dct/imdct32_f32.c: (imdct32_f32_ref), (imdct32_f32_mpglib), (TEST_imdct32_f32): * liboil/jpeg/Makefile.am: * liboil/jpeg/idct8_c.c: * liboil/jpeg/idct8x8_c.c: * liboil/jpeg/jpeg.c: * liboil/jpeg/jpeg.h: * liboil/jpeg/jpeg_rgb_decoder.c: * liboil/jpeg/quantize8x8_c.c: * liboil/jpeg/yuv2rgb_c.c: * liboil/jpeg/zigzag8x8_c.c: * liboil/liboilcpu.c: (oil_cpu_i386_getflags): * liboil/liboildebug.c: (oil_debug_print_valist): 
* liboil/liboilfuncs.h: * liboil/liboilfunction.c: (oil_class_get_by_index), (oil_class_optimize), (oil_init_pointers), (oil_init_structs): * liboil/liboilfunction.h: * liboil/simdpack/Makefile.am: * liboil/simdpack/abs.c: (abs_u8_s8_ref), (abs_u16_s16_ref), (abs_u32_s32_ref): * liboil/simdpack/abs_u32_s32.c: * liboil/simdpack/average2_u8.c: * liboil/simdpack/clip_ref.c: * liboil/simdpack/dct12_f32.c: * liboil/simdpack/dct36.c: * liboil/simdpack/dct36_f32.c: * liboil/simdpack/diffsquaresum_f64.c: * liboil/simdpack/downsample1x_f64.c: * liboil/simdpack/fdct8_f64.c: * liboil/simdpack/fdct8x8_f64.c: * liboil/simdpack/fdct8x8_s16.c: * liboil/simdpack/fdct8x8s_s16.c: * liboil/simdpack/get8x8_f64.c: * liboil/simdpack/idct8_f64.c: * liboil/simdpack/idct8x8_f64.c: * liboil/simdpack/idct8x8_s16.c: * liboil/simdpack/idct8x8s_s16.c: * liboil/simdpack/imdct32_f32.c: * liboil/simdpack/mix_u8.c: * liboil/simdpack/mult8x8_s16.c: * liboil/simdpack/multsum.c: * liboil/simdpack/permute.c: * liboil/simdpack/sad8x8.c: * liboil/simdpack/scalaradd.c: * liboil/simdpack/simdpack.c: * liboil/simdpack/sincos_f64.c: * liboil/simdpack/squaresum_f64.c: * liboil/simdpack/sum_f64.c: * liboil/simdpack/trans8x8.c: * liboil/simdpack/trans8x8_f32.c: * liboil/simdpack/trans8x8_s16.c: * liboil/simdpack/vectoradd_f64.c: * liboil/simdpack/zigzag8x8_s16.c: * testsuite/Makefile.am: * testsuite/abs.c: (test), (main): * testsuite/introspect.c: (main):
author: David Schleef <ds@schleef.org> 2004-09-03 21:39:10 +0000
committer: David Schleef <ds@schleef.org> 2004-09-03 21:39:10 +0000
commit: f735872cfb3fe56aa711b6af772bf7789ac0e377 (patch)
tree: ac42a34402fb902b4db098ef47a1bc312b6d72c1 /liboil/dct
parent: 27d1241537a0974712dd7a7027dd94b0c76aeb30 (diff)
download: liboil-f735872cfb3fe56aa711b6af772bf7789ac0e377.tar.gz
15 files changed, 2457 insertions, 0 deletions
diff --git a/liboil/dct/Makefile.am b/liboil/dct/Makefile.am
new file mode 100644
index 0000000..5c223a4
--- /dev/null
+++ b/liboil/dct/Makefile.am
@@ -0,0 +1,23 @@
+
+noinst_LTLIBRARIES = libdct.la
+
+libdct_la_SOURCES = \
+	dct12_f32.c \
+	dct36_f32.c \
+	fdct8_f64.c \
+	idct8_f64.c \
+	idct8x8_c.c \
+	imdct32_f32.c
+
+#	fdct8x8_f64.c
+#	fdct8x8s_s16.c
+#	idct8x8_f64.c
+#	idct8x8_s16.c
+#	idct8x8s_s16.c
+
+noinst_HEADERS = \
+	dct.h
+
+libdct_la_CFLAGS = $(LIBOIL_CFLAGS)
+
+
diff --git a/liboil/dct/dct.h b/liboil/dct/dct.h
new file mode 100644
index 0000000..04435a1
--- /dev/null
+++ b/liboil/dct/dct.h
@@ -0,0 +1,33 @@
+/* liboil - Library of Optimized Inner Loops
+ * Copyright (C) 2003  David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA.
+ */
+
+#ifndef _LIBOIL_DCT_H_
+#define _LIBOIL_DCT_H_
+
+#include <liboil/liboilfunction.h>
+
+OIL_DECLARE_CLASS(dct12_f32);
+OIL_DECLARE_CLASS(dct36_f32);
+OIL_DECLARE_CLASS(fdct8_f64);
+OIL_DECLARE_CLASS(fdct8x8_f64);
+OIL_DECLARE_CLASS(fdct8x8s_s16);
+OIL_DECLARE_CLASS(idct8_f64);
+OIL_DECLARE_CLASS(imdct32_f32);
+
+#endif
+
diff --git a/liboil/dct/dct12_f32.c b/liboil/dct/dct12_f32.c
new file mode 100644
index 0000000..e486846
--- /dev/null
+++ b/liboil/dct/dct12_f32.c
@@ -0,0 +1,283 @@
+/* liboil - Library of Optimized Inner Loops
+ * Copyright (C) 2003  David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/dct/dct.h>
+#include <math.h>
+
+
+OIL_DEFINE_CLASS_X (dct12_f32, "float *dest, int dstr, float *src, int sstr");
+
+static void
+dct12_f32_ref (float *dest, int dstr, float *src, int sstr)
+{
+     float in0,in1,in2,in3,in4,in5;
+     float *out2 = dest;
+     float *in = src;
+     float COS6_2 = cos(M_PI / 6.0 * 2.0);
+     float COS6_1 = cos(M_PI / 6.0 * 1.0);
+     float tfcos12[3];
+     float wi[12];
+     int i;
+     /* Reference impl: reads src[0,3,6,9,12,15] and updates dest[0..17]; dstr/sstr are not used.  Tables are rebuilt on every call. */
+     for(i=0;i<3;i++){
+	     tfcos12[i] = 0.5 / cos ( M_PI * (i*2.0+1.0) / 12.0 );
+     }
+     for(i=0;i<12;i++){
+	     wi[i] = 0.5 * sin( M_PI / 24.0 * (double) (2*i+1) ) / cos ( M_PI * (double) (2*i+7) / 24.0 );
+     }
+     /* dest[12..17] are cleared, dest[6..11] are assigned, dest[0..5] are accumulated with += (their prior contents contribute). */
+     out2[12]=out2[13]=out2[14]=out2[15]=out2[16]=out2[17]=0.0;
+     /* Load the six stride-3 inputs while forming sums of adjacent pairs. */
+             in5 = in[5*3];
+     in5 += (in4 = in[4*3]);
+     in4 += (in3 = in[3*3]);
+     in3 += (in2 = in[2*3]);
+     in2 += (in1 = in[1*3]);
+     in1 += (in0 = in[0*3]);
+                            
+     in5 += in3; in3 += in1;
+                            
+     in2 *= COS6_1;
+     in3 *= COS6_1;
+     /* Outputs 1, 4 (accumulated) and 7, 10 (assigned). */
+     {
+       float tmp0,tmp1 = (in0 - in4);
+       {
+         float tmp2 = (in1 - in5) * tfcos12[1];
+         tmp0 = tmp1 + tmp2;
+         tmp1 -= tmp2;
+       }
+       out2[11-1] = tmp0 * wi[11-1];
+       out2[6 +1] = tmp0 * wi[6+1];
+       out2[0+1] += tmp1 * wi[1];
+       out2[5-1] += tmp1 * wi[5-1];
+     }
+     /* Remaining outputs; in0..in5 are reused as scratch from here on. */
+     in0 += in4 * COS6_2;
+                         
+     in4 = in0 + in2;    
+     in0 -= in2;         
+                         
+     in1 += in5 * COS6_2;
+                         
+     in5 = (in1 + in3) * tfcos12[0];
+     in1 = (in1 - in3) * tfcos12[2];
+                        
+     in3 = in4 + in5;    
+     in4 -= in5;         
+                         
+     in2 = in0 + in1;    
+     in0 -= in1;
+
+     out2[11-0] = in2 * wi[11-0];
+     out2[6 +0] = in2 * wi[6+0];
+     out2[6 +2] = in3 * wi[6+2];
+     out2[11-2] = in3 * wi[11-2];
+
+     out2[0+0] += in0 * wi[0];
+     out2[5-0] += in0 * wi[5-0];
+     out2[0+2] += in4 * wi[2];
+     out2[5-2] += in4 * wi[5-2];
+}
+
+OIL_DEFINE_IMPL_REF (dct12_f32_ref, dct12_f32_class);
+
+
+static void
+dct12_f32_ref1(float *dest, int dstr, float *src, int sstr)
+{
+	int l,m,k;
+	double x;
+	double coeff;
+	double cos_s;
+	double wi[36];
+	int i;
+	/* Direct-sum variant; dstr/sstr are not used.  wi[0..11] is a sine window, wi[12..35] is zeroed and never read below. */
+	for(i=0;i<12;i++){
+		wi[i] = sin( M_PI / 12.0 * (i+0.5));
+	}
+	for(;i<36;i++){
+		wi[i] = 0;
+	}
+	/* Writes dest[0..17] from src[6..29].  NOTE(review): dct12_f32_ref reads only src[0..15] at stride 3 and accumulates into dest[0..5] -- confirm the two are meant to agree. */
+	for(l=0;l<3;l++){
+		for(m=0;m<6;m++){
+			x = 0;
+			for(k=0;k<12;k++){
+				cos_s = cos( (M_PI / 24.0) * (2 * k + 7) *
+						(2 * m + 1) ) / 3;
+				coeff = wi[k] * cos_s;
+				x += coeff * src[k + 6*l + 6];
+			}
+			dest[3*m + l] = x;
+		}
+	}
+}
+OIL_DEFINE_IMPL (dct12_f32_ref1, dct12_f32_class);
+
+/* copyright: from mpglib */
+/*
+ * new DCT12
+ */
+static void
+dct12_f32_mpglib(float *dest, int dstr, float *src, int sstr)
+{
+     float in0,in1,in2,in3,in4,in5;
+     float *out2 = dest;
+     float *in = src;
+     float wi[12];
+     int i;
+     float tmp0, tmp1, tmp2;
+
+     float COS6_2 = cos(M_PI / 6.0 * 2.0);
+     float COS6_1 = cos(M_PI / 6.0 * 1.0);
+     float tfcos12[3];
+     /* dstr/sstr are not used; tfcos12[] and wi[] are recomputed on every call. */
+     for(i=0;i<3;i++){
+	     tfcos12[i] = 0.5 / cos ( M_PI * (i*2.0+1.0) / 12.0 );
+     }
+     for(i=0;i<12;i++){
+	     wi[i] = 0.5 * sin( M_PI / 24.0 * (double) (2*i+1) ) / cos ( M_PI * (double) (2*i+7) / 24.0 );
+     }
+     /* dest[12..17] are cleared, dest[6..11] are assigned, dest[0..5] are accumulated with +=. */
+     out2[12]=out2[13]=out2[14]=out2[15]=out2[16]=out2[17]=0.0;
+     /* First stage written out directly in terms of in[]; each "//" line shows the compact statement that the following line expands. */
+     in5 = in[5*3] + in[4*3];
+     in4 = in[4*3] + in[3*3];
+     in3 = in[3*3] + in[2*3];
+     in2 = in[2*3] + in[1*3];
+     in1 = in[1*3] + in[0*3];
+     in0 = in[0*3];
+                            
+     //in5 += in3;
+     in5 = in[5*3] + in[4*3] + in[3*3] + in[2*3];
+     //in3 += in1;
+     in3 = in[3*3] + in[2*3] + in[1*3] + in[0*3];
+                            
+     //in2 *= COS6_1;
+     in2 = COS6_1 * (in[2*3] + in[1*3]);
+
+     //in3 *= COS6_1;
+     in3 = COS6_1 * (in[3*3] + in[2*3] + in[1*3] + in[0*3]);
+
+     //tmp1 = (in0 - in4);
+     tmp1 = in[0*3] - in[4*3] - in[3*3];
+
+     //tmp2 = (in1 - in5) * tfcos12[1];
+     tmp2 = tfcos12[1] * (in[1*3] + in[0*3] - in[5*3] - in[4*3] - in[3*3] - in[2*3]);
+
+     //tmp0 = tmp1 + tmp2;
+     tmp0 = (1 + tfcos12[1]) * (in[0*3] - in[4*3] - in[3*3]) +
+             tfcos12[1] * (in[1*3] - in[5*3] - in[2*3]);
+
+     //tmp1 -= tmp2;
+     tmp1 = (1 - tfcos12[1]) * (in[0*3] - in[4*3] - in[3*3]) -
+             tfcos12[1] * (in[1*3] - in[5*3] - in[2*3]);
+
+     out2[11-1] = tmp0 * wi[11-1];
+     out2[6 +1] = tmp0 * wi[6+1];
+     out2[0+1] += tmp1 * wi[1];
+     out2[5-1] += tmp1 * wi[5-1];
+     /* Second stage: in0..in5 are reused as scratch for the remaining eight outputs. */
+     in0 += in4 * COS6_2;
+                         
+     in4 = in0 + in2;
+     in0 -= in2;
+                 
+     in1 += in5 * COS6_2;
+                        
+     in5 = (in1 + in3) * tfcos12[0];
+     in1 = (in1 - in3) * tfcos12[2];
+                       
+     in3 = in4 + in5;
+     in4 = in4 - in5;
+               
+     in2 = in0 + in1;
+     in0 = in0 - in1;
+
+     out2[11-0] = in2 * wi[11-0];
+     out2[6 +0] = in2 * wi[6+0];
+     out2[6 +2] = in3 * wi[6+2];
+     out2[11-2] = in3 * wi[11-2];
+
+     out2[0+0] += in0 * wi[0];
+     out2[5-0] += in0 * wi[5-0];
+     out2[0+2] += in4 * wi[2];
+     out2[5-2] += in4 * wi[5-2];
+}
+
+OIL_DEFINE_IMPL (dct12_f32_mpglib, dct12_f32_class);
+
+
+
+#ifdef TEST_dct12_f32
+int TEST_dct12_f32(void)
+{
+	int i;
+	int pass;
+	int failures = 0;
+	float *src, *dest_ref, *dest_test;
+	struct sl_profile_struct t;
+	/* NOTE(review): relies on sl_* helpers and dct12_f32_FUNC, none of which this file defines; only compiled when TEST_dct12_f32 is defined. */
+	src = sl_malloc_f32(18);
+	dest_ref = sl_malloc_f32(18);
+	dest_test = sl_malloc_f32(18);
+
+	sl_profile_init(t);
+	srand(20021001);
+
+	printf("I: " sl_stringify(dct12_f32_FUNC) "\n");
+
+	for(pass=0;pass<1;pass++){
+		for(i=0;i<18;i++){
+			src[i]=sl_rand_f32_0_1();
+			//src[i]=(i==pass+6);
+		}
+		/* NOTE(review): dct12_f32_ref is declared with four parameters (dest, dstr, src, sstr) but is called with two here. */
+		dct12_f32_ref(dest_ref,src);
+		sl_profile_start(t);
+		dct12_f32_FUNC(dest_test,src);
+		sl_profile_stop(t);
+		/* Mismatches are only printed; failures is never incremented, so the function returns 0 regardless. */
+		for(i=0;i<18;i++){
+			if(dest_test[i] != dest_ref[i]){
+				printf("%d %g %g %g\n",i,src[i],dest_ref[i],
+						dest_test[i]);
+			}
+		}
+	}
+
+	sl_free(src);
+	sl_free(dest_ref);
+	sl_free(dest_test);
+
+	if(failures){
+		printf("E: %d failures\n",failures);
+	}
+
+	sl_profile_print(t);
+
+	return failures;
+}
+#endif
+
diff --git a/liboil/dct/dct36.c b/liboil/dct/dct36.c
new file mode 100644
index 0000000..9cfc885
--- /dev/null
+++ b/liboil/dct/dct36.c
@@ -0,0 +1,114 @@
+/*
+ DCT inspired by Jeff Tsay's DCT from the maplay package
+ this is an optimized version with manual unroll.
+
+ References:
+ [1] S. Winograd: "On Computing the Discrete Fourier Transform",
+     Mathematics of Computation, Volume 32, Number 141, January 1978,
+     Pages 175-199
+*/
+
+static void dct36(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf)
+{
+  {
+    real *in = inbuf;
+    /* Two passes of in-place additions over in[0..17]; note that the caller's inbuf is overwritten. */
+    in[17]+=in[16]; in[16]+=in[15]; in[15]+=in[14];
+    in[14]+=in[13]; in[13]+=in[12]; in[12]+=in[11];
+    in[11]+=in[10]; in[10]+=in[9];  in[9] +=in[8];
+    in[8] +=in[7];  in[7] +=in[6];  in[6] +=in[5];
+    in[5] +=in[4];  in[4] +=in[3];  in[3] +=in[2];
+    in[2] +=in[1];  in[1] +=in[0];
+
+    in[17]+=in[15]; in[15]+=in[13]; in[13]+=in[11]; in[11]+=in[9];
+    in[9] +=in[7];  in[7] +=in[5];  in[5] +=in[3];  in[3] +=in[1];
+
+
+  {
+/* MACRO0(v) needs sum0/sum1 in scope: it writes out2[9+v], out2[8-v] and ts[SBLIMIT*(8-v)], ts[SBLIMIT*(9+v)].  MACRO1/MACRO2 build sum0/sum1 from tmp1a/tmp2a/tmp1b/tmp2b and then invoke MACRO0. */
+#define MACRO0(v) { \
+    real tmp; \
+    out2[9+(v)] = (tmp = sum0 + sum1) * w[27+(v)]; \
+    out2[8-(v)] = tmp * w[26-(v)];  } \
+    sum0 -= sum1; \
+    ts[SBLIMIT*(8-(v))] = out1[8-(v)] + sum0 * w[8-(v)]; \
+    ts[SBLIMIT*(9+(v))] = out1[9+(v)] + sum0 * w[9+(v)]; 
+#define MACRO1(v) { \
+	real sum0,sum1; \
+    sum0 = tmp1a + tmp2a; \
+	sum1 = (tmp1b + tmp2b) * tfcos36[(v)]; \
+	MACRO0(v); }
+#define MACRO2(v) { \
+    real sum0,sum1; \
+    sum0 = tmp2a - tmp1a; \
+    sum1 = (tmp2b - tmp1b) * tfcos36[(v)]; \
+	MACRO0(v); }
+/* NOTE(review): real, COS9, tfcos36 and SBLIMIT are not defined anywhere in this file -- presumably supplied by whatever includes it; confirm. */
+    const real *c = COS9;
+    real *out2 = o2;
+	real *w = wintab;
+	real *out1 = o1;
+	real *ts = tsbuf;
+
+    real ta33,ta66,tb33,tb66;
+
+    ta33 = in[2*3+0] * c[3];
+    ta66 = in[2*6+0] * c[6];
+    tb33 = in[2*3+1] * c[3];
+    tb66 = in[2*6+1] * c[6];
+
+    { 
+      real tmp1a,tmp2a,tmp1b,tmp2b;
+      tmp1a =             in[2*1+0] * c[1] + ta33 + in[2*5+0] * c[5] + in[2*7+0] * c[7];
+      tmp1b =             in[2*1+1] * c[1] + tb33 + in[2*5+1] * c[5] + in[2*7+1] * c[7];
+      tmp2a = in[2*0+0] + in[2*2+0] * c[2] + in[2*4+0] * c[4] + ta66 + in[2*8+0] * c[8];
+      tmp2b = in[2*0+1] + in[2*2+1] * c[2] + in[2*4+1] * c[4] + tb66 + in[2*8+1] * c[8];
+
+      MACRO1(0);
+      MACRO2(8);
+    }
+
+    {
+      real tmp1a,tmp2a,tmp1b,tmp2b;
+      tmp1a = ( in[2*1+0] - in[2*5+0] - in[2*7+0] ) * c[3];
+      tmp1b = ( in[2*1+1] - in[2*5+1] - in[2*7+1] ) * c[3];
+      tmp2a = ( in[2*2+0] - in[2*4+0] - in[2*8+0] ) * c[6] - in[2*6+0] + in[2*0+0];
+      tmp2b = ( in[2*2+1] - in[2*4+1] - in[2*8+1] ) * c[6] - in[2*6+1] + in[2*0+1];
+
+      MACRO1(1);
+      MACRO2(7);
+    }
+
+    {
+      real tmp1a,tmp2a,tmp1b,tmp2b;
+      tmp1a =             in[2*1+0] * c[5] - ta33 - in[2*5+0] * c[7] + in[2*7+0] * c[1];
+      tmp1b =             in[2*1+1] * c[5] - tb33 - in[2*5+1] * c[7] + in[2*7+1] * c[1];
+      tmp2a = in[2*0+0] - in[2*2+0] * c[8] - in[2*4+0] * c[2] + ta66 + in[2*8+0] * c[4];
+      tmp2b = in[2*0+1] - in[2*2+1] * c[8] - in[2*4+1] * c[2] + tb66 + in[2*8+1] * c[4];
+
+      MACRO1(2);
+      MACRO2(6);
+    }
+
+    {
+      real tmp1a,tmp2a,tmp1b,tmp2b;
+      tmp1a =             in[2*1+0] * c[7] - ta33 + in[2*5+0] * c[1] - in[2*7+0] * c[5];
+      tmp1b =             in[2*1+1] * c[7] - tb33 + in[2*5+1] * c[1] - in[2*7+1] * c[5];
+      tmp2a = in[2*0+0] - in[2*2+0] * c[4] + in[2*4+0] * c[8] + ta66 - in[2*8+0] * c[2];
+      tmp2b = in[2*0+1] - in[2*2+1] * c[4] + in[2*4+1] * c[8] + tb66 - in[2*8+1] * c[2];
+
+      MACRO1(3);
+      MACRO2(5);
+    }
+
+	{
+		real sum0,sum1;
+    	sum0 =  in[2*0+0] - in[2*2+0] + in[2*4+0] - in[2*6+0] + in[2*8+0];
+    	sum1 = (in[2*0+1] - in[2*2+1] + in[2*4+1] - in[2*6+1] + in[2*8+1] ) * tfcos36[4];
+		MACRO0(4);
+	}
+  }
+
+  }
+}
+
diff --git a/liboil/dct/dct36_f32.c b/liboil/dct/dct36_f32.c
new file mode 100644
index 0000000..061c84d
--- /dev/null
+++ b/liboil/dct/dct36_f32.c
@@ -0,0 +1,86 @@
+/* liboil - Library of Optimized Inner Loops
+ * Copyright (C) 2003  David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/dct/dct.h>
+
+OIL_DEFINE_CLASS_X(dct36_f32, "float *dest, int dstr, float *src, int sstr, int n");
+/* Placeholder reference: copies n floats from src to dest; no transform is computed and dstr/sstr are not used. */
+static void
+dct36_f32_ref(float *dest, int dstr, float *src, int sstr, int n)
+{
+  int i;
+  for(i=0;i<n;i++){
+    dest[i] = src[i];
+  }
+}
+
+OIL_DEFINE_IMPL_REF (dct36_f32_ref, dct36_f32_class);
+
+#ifdef TEST_dct36_f32
+int TEST_dct36_f32(void)
+{
+	int i;
+	int pass;
+	int failures = 0;
+	f64 *src, *dest_ref, *dest_test;
+	struct sl_profile_struct t;
+	/* NOTE(review): N, N_PASS, f64, the sl_* helpers and dct36_f32_FUNC are not defined in this file; only compiled when TEST_dct36_f32 is defined. */
+	src = sl_malloc_f64(N);
+	dest_ref = sl_malloc_f64(N);
+	dest_test = sl_malloc_f64(N);
+
+	sl_profile_init(t);
+	srand(20021001);
+
+	printf("I: " sl_stringify(dct36_f32_FUNC) "\n");
+
+	for(pass=0;pass<N_PASS;pass++){
+		for(i=0;i<N;i++)src[i]=sl_rand_f64_s16();
+		/* NOTE(review): dct36_f32_ref is declared as (float *dest, int dstr, float *src, int sstr, int n) but is called with three arguments and f64 buffers. */
+		dct36_f32_ref(dest_ref,src,N);
+		sl_profile_start(t);
+		dct36_f32_FUNC(dest_test,src,N);
+		sl_profile_stop(t);
+		/* Mismatches are only printed; failures is never incremented, so the function returns 0 regardless. */
+		for(i=0;i<N;i++){
+			if(dest_test[i] != dest_ref[i]){
+				printf("%d %g %g %g\n",i,src[i],dest_ref[i],
+						dest_test[i]);
+			}
+		}
+	}
+
+	sl_free(src);
+	sl_free(dest_ref);
+	sl_free(dest_test);
+
+	if(failures){
+		printf("E: %d failures\n",failures);
+	}
+
+	sl_profile_print(t);
+
+	return failures;
+}
+#endif
+
diff --git a/liboil/dct/fdct8_f64.c b/liboil/dct/fdct8_f64.c
new file mode 100644
index 0000000..3fc93a0
--- /dev/null
+++ b/liboil/dct/fdct8_f64.c
@@ -0,0 +1,236 @@
+/* liboil - Library of Optimized Inner Loops
+ * Copyright (C) 2003  David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/dct/dct.h>
+#include <math.h>
+
+OIL_DEFINE_CLASS_X (fdct8_f64, "double *dest, double *src, int dstr, int sstr");
+
+#define C0_9808 0.980785280
+#define C0_9239 0.923879532
+#define C0_8315 0.831469612
+#define C0_7071 0.707106781
+#define C0_5556 0.555570233
+#define C0_3827 0.382683432
+#define C0_1951 0.195090322
+
+static void
+fdct8_f64_ref (double *dest, double *src, int dstr, int sstr)
+{
+	static double fdct_coeff[8][8];
+	static int fdct_coeff_init = 0;
+	int i,j;
+	double x;
+	/* fdct_coeff[j][i] = scale(i) * cos(pi/8 * i * (j+0.5)) is built on the first call and kept in statics (no locking around the init flag). */
+	if(!fdct_coeff_init){
+		double scale;
+
+		for(i=0;i<8;i++){
+			scale = (i==0) ? sqrt(0.125) : 0.5;
+			for(j=0;j<8;j++){
+				fdct_coeff[j][i] = scale *
+					cos((M_PI/8)*i*(j+0.5));
+			}
+		}
+		fdct_coeff_init = 1;
+	}
+	/* Output i (at OIL_GET offset dstr*i) is the sum over j of fdct_coeff[j][i] times input j (at offset sstr*j); strides are presumably in bytes -- confirm against OIL_GET. */
+	for(i=0;i<8;i++){
+		x = 0;
+		for(j=0;j<8;j++){
+			x += fdct_coeff[j][i] * OIL_GET(src,sstr*j, double);
+		}
+		OIL_GET(dest,dstr*i, double) = x;
+	}
+}
+
+OIL_DEFINE_IMPL_REF (fdct8_f64_ref, fdct8_f64_class);
+
+/*
+ * This algorithm is roughly similar to a Fast-Fourier transform,
+ * taking advantage of the symmetries of the base vectors.  For
+ * reference, the base vectors are (horizontally):
+ *
+ * 0: 1.0000  1.0000  1.0000  1.0000  1.0000  1.0000  1.0000  1.0000 
+ * 1: 0.9808  0.8315  0.5556  0.1951 -0.1951 -0.5556 -0.8315 -0.9808 
+ * 2: 0.9239  0.3827 -0.3827 -0.9239 -0.9239 -0.3827  0.3827  0.9239 
+ * 3: 0.8315 -0.1951 -0.9808 -0.5556  0.5556  0.9808  0.1951 -0.8315 
+ * 4: 0.7071 -0.7071 -0.7071  0.7071  0.7071 -0.7071 -0.7071  0.7071 
+ * 5: 0.5556 -0.9808  0.1951  0.8315 -0.8315 -0.1951  0.9808 -0.5556 
+ * 6: 0.3827 -0.9239  0.9239 -0.3827 -0.3827  0.9239 -0.9239  0.3827 
+ * 7: 0.1951 -0.5556  0.8315 -0.9808  0.9808 -0.8315  0.5556 -0.1951 
+ *
+ * The symmetries of note: 
+ *  - even vectors are symmetric around 4 (the middle)
+ *  - odd vectors are antisymmetric around 4
+ *  - 0,4 are symmetric around 2 and 6
+ *  - 2,6 are antisymmetric around 2 and 6
+ *
+ * f0 = (x0 + x7) + (x1 + x6) + (x2 + x5) + (x3 + x4);
+ * f1 = 0.9808*(x0 - x7) + 0.8315*(x1 - x6) + 0.5556*(x2 - x5) + 0.1951*(x3 - x4)
+ * f2 = 0.9239*(x0 + x7) + 0.3827*(x1 + x6) - 0.3827*(x2 + x5) - 0.9239*(x3 + x4)
+ * f3 = 0.8315*(x0 - x7) - 0.1951*(x1 - x6) - 0.9808*(x2 - x5) - 0.5556*(x3 - x4)
+ * f4 = 0.7071*((x0 + x7) - (x1 + x6) - (x2 + x5) + (x3 + x4))
+ * f5 = 0.5556*(x0 - x7) - 0.9808*(x1 - x6) + 0.1951*(x2 - x5) + 0.8315*(x3 - x4)
+ * f6 = 0.3827*(x0 + x7) - 0.9239*(x1 + x6) + 0.9239*(x2 + x5) - 0.3827*(x3 + x4)
+ * f7 = 0.1951*(x0 - x7) - 0.5556*(x1 - x6) + 0.8315*(x2 - x5) - 0.9808*(x3 - x4)
+ *
+ * The even vectors can be further simplified:
+ *
+ * f0 = ((x0 + x7) + (x3 + x4)) + ((x1 + x6) + (x2 + x5));
+ * f2 = 0.9239*((x0 + x7) - (x3 + x4)) + 0.3827*((x1 + x6) - (x2 + x5))
+ * f4 = 0.7071*((x0 + x7) + (x3 + x4)) - 0.7071*((x1 + x6) + (x2 + x5))
+ * f6 = 0.3827*((x0 + x7) - (x3 + x4)) - 0.9239*((x1 + x6) - (x2 + x5))
+ *
+ * Some implementations move some of the normalization to a later
+ * stage of processing, saving a few multiplies which get absorbed
+ * into later multiplies.  However, this incurs a bit of error in
+ * the integer versions of this function.  Also, if the CPU has a
+ * multiply-and-add function, you don't gain anything.
+ */
+
+static void
+fdct8_f64_fast(double *dest, double *src, int dstr, int sstr)
+{
+	double s07, s16, s25, s34;
+	double d07, d16, d25, d34;
+	double ss07s34, ds07s34, ss16s25, ds16s25;
+	/* sNM / dNM: sum and difference of the mirrored inputs N and M (offsets sstr*N and sstr*M). */
+	s07 = OIL_GET(src,sstr*0,double) + OIL_GET(src,sstr*7,double);
+	d07 = OIL_GET(src,sstr*0,double) - OIL_GET(src,sstr*7,double);
+	s16 = OIL_GET(src,sstr*1,double) + OIL_GET(src,sstr*6,double);
+	d16 = OIL_GET(src,sstr*1,double) - OIL_GET(src,sstr*6,double);
+	s25 = OIL_GET(src,sstr*2,double) + OIL_GET(src,sstr*5,double);
+	d25 = OIL_GET(src,sstr*2,double) - OIL_GET(src,sstr*5,double);
+	s34 = OIL_GET(src,sstr*3,double) + OIL_GET(src,sstr*4,double);
+	d34 = OIL_GET(src,sstr*3,double) - OIL_GET(src,sstr*4,double);
+	/* Second-level sums/differences of the pair sums, used only by the even outputs. */
+	ss07s34 = s07 + s34;
+	ds07s34 = s07 - s34;
+	ss16s25 = s16 + s25;
+	ds16s25 = s16 - s25;
+	/* Even outputs 0,2,4,6 come from the sums; odd outputs 1,3,5,7 below come from the differences. */
+	OIL_GET(dest,dstr*0,double) = 0.5*C0_7071*(ss07s34 + ss16s25);
+	OIL_GET(dest,dstr*2,double) = 0.5*(C0_9239*ds07s34 + C0_3827*ds16s25);
+	OIL_GET(dest,dstr*4,double) = 0.5*C0_7071*(ss07s34 - ss16s25);
+	OIL_GET(dest,dstr*6,double) = 0.5*(C0_3827*ds07s34 - C0_9239*ds16s25);
+
+	OIL_GET(dest,dstr*1,double) = 0.5*(C0_9808*d07 + C0_8315*d16
+			+ C0_5556*d25 + C0_1951*d34);
+	OIL_GET(dest,dstr*3,double) = 0.5*(C0_8315*d07 - C0_1951*d16
+			- C0_9808*d25 - C0_5556*d34);
+	OIL_GET(dest,dstr*5,double) = 0.5*(C0_5556*d07 - C0_9808*d16
+			+ C0_1951*d25 + C0_8315*d34);
+	OIL_GET(dest,dstr*7,double) = 0.5*(C0_1951*d07 - C0_5556*d16
+			+ C0_8315*d25 - C0_9808*d34);
+	/* The #if 0 section below is disabled; it uses names (z1, tmp10, ...) that are not declared in this function. */
+#if 0
+	z1 = (ds1  tmp12 + tmp13) * 0.707106781;
+	OIL_GET(dest,dstr*2,double) = (tmp13 + z1)*(0.25*M_SQRT2)*0.765366864;
+	OIL_GET(dest,dstr*6,double) = (tmp13 - z1)*(0.25*M_SQRT2)*1.847759066;
+
+	tmp10 = tmp4 + tmp5;
+	tmp11 = tmp5 + tmp6;
+	tmp12 = tmp6 + tmp7;
+
+	z5 = (tmp10 - tmp12) * 0.382683433;
+	z2 = 0.541196100 * tmp10 + z5;
+	z4 = 1.306562965 * tmp12 + z5;
+	z3 = tmp11 * 0.707106781;
+
+	z11 = tmp7 + z3;
+	z13 = tmp7 - z3;
+
+	OIL_GET(dest,dstr*5,double) = (z13 + z2)*(0.25*M_SQRT2)*1.2728;
+	OIL_GET(dest,dstr*3,double) = (z13 - z2)*(0.25*M_SQRT2)*0.8504;
+	OIL_GET(dest,dstr*1,double) = (z11 + z4)*(0.25*M_SQRT2)*0.7209;
+	OIL_GET(dest,dstr*7,double) = (z11 - z4)*(0.25*M_SQRT2)*3.6245;
+#endif
+}
+OIL_DEFINE_IMPL (fdct8_f64_fast, fdct8_f64_class);
+
+
+
+
+#ifdef TEST_fdct8_f64
+int TEST_fdct8_f64(void)
+{
+	int i;
+	int pass;
+	int failures = 0;
+	double *src, *dest_ref, *dest_test;
+	double sad;
+	double sad_sum;
+	double sad_max;
+	struct sl_profile_struct t;
+	/* NOTE(review): N_PASS, the sl_* helpers and fdct8_f64_FUNC are not defined in this file; only compiled when TEST_fdct8_f64 is defined. */
+	src = sl_malloc_f64(8);
+	dest_ref = sl_malloc_f64(8);
+	dest_test = sl_malloc_f64(8);
+	
+	sl_profile_init(t);
+	srand(20020306);
+
+	sad_sum = 0;
+	sad_max = 0;
+
+	printf("I: " sl_stringify(fdct8_f64_FUNC) "\n");
+
+	for(pass=0;pass<N_PASS;pass++){
+		for(i=0;i<8;i++)src[i] = sl_rand_f64_0_1();
+
+		//block8_dump(src);
+		/* Note the swapped names: the reference output lands in dest_test and the tested function's output in dest_ref. */
+		fdct8_f64_ref(dest_test, src, 8, 8);
+		//block8_dump(dest_test);
+
+		sl_profile_start(t);
+		fdct8_f64_FUNC(dest_ref, src, 8, 8);
+		sl_profile_stop(t);
+		//block8_dump(dest_ref);
+		/* A pass counts as a failure when the summed absolute difference over the 8 outputs reaches 1.0. */
+		sad = 0;
+		for(i=0;i<8;i++)sad += fabs(dest_test[i] - dest_ref[i]);
+		if(sad_max<sad)sad_max = sad;
+		sad_sum += sad;
+		if(sad >= 1.0){
+			failures++;
+		}
+	}
+	printf("sad average: %g\n",sad_sum/N_PASS);
+	printf("sad max: %g\n",sad_max);
+
+	sl_free(src);
+	sl_free(dest_ref);
+	sl_free(dest_test);
+
+	if(failures){
+		printf("E: %d failures\n",failures);
+	}
+
+	sl_profile_print(t);
+
+	return failures;
+}
+#endif
+
diff --git a/liboil/dct/fdct8x8_f64.c b/liboil/dct/fdct8x8_f64.c
new file mode 100644
index 0000000..e59b455
--- /dev/null
+++ b/liboil/dct/fdct8x8_f64.c
@@ -0,0 +1,192 @@
+/* liboil - Library of Optimized Inner Loops
+ * Copyright (C) 2003  David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/liboilfuncs.h>
+#include <liboil/dct/dct.h>
+#include <math.h>
+
+
+OIL_DEFINE_CLASS(fdct8x8_f64, NULL);
+
+
+static void fdct8x8_f64_ref(double *dest, int dstr, double *src, int sstr)
+{
+	static double fdct_coeff[8][8];
+	static int fdct_coeff_init = 0;
+	int i,j,k,l;
+	double tmp1,tmp2;
+	/* Reference 2-D forward DCT of an 8x8 block of doubles.  dstr/sstr are row strides in bytes; the basis table is built on the first call (no locking). */
+	if(!fdct_coeff_init){
+		double scale;
+
+		for(i=0;i<8;i++){
+			scale = (i==0) ? sqrt(0.125) : 0.5;
+			for(j=0;j<8;j++){
+				fdct_coeff[j][i] = scale *
+					cos((M_PI/8)*i*(j+0.5));
+			}
+		}
+		fdct_coeff_init = 1;
+	}
+	/* OIL_GET takes a byte offset, so the column index is scaled by sizeof(double); adding the bare index would address misaligned, overlapping doubles. */
+	for(i=0;i<8;i++){
+		for(j=0;j<8;j++){
+			tmp1 = 0;
+			for(k=0;k<8;k++){
+				tmp2 = 0;
+				for(l=0;l<8;l++){
+					tmp2 += fdct_coeff[l][j] *
+					  OIL_GET (src, sstr*k + l*(int)sizeof(double), double);
+				}
+				tmp1 += fdct_coeff[k][i] * tmp2;
+			}
+			OIL_GET (dest, dstr*i + j*(int)sizeof(double), double) = tmp1;
+		}
+	}
+}
+
+OIL_DEFINE_IMPL_REF (fdct8x8_f64_ref, fdct8x8_f64_class);
+
+static void
+fdct8x8_f64_ref2(double *dest, int dstr, double *src, int sstr)
+{
+	static double fdct_coeff[8][8];
+	static int fdct_coeff_init = 0;
+	int i,j,k;
+	double x;
+	double tmp[64];
+	/* Separable 2-D forward DCT: rows into tmp, then columns into dest.  dstr/sstr are row strides in bytes; the basis table is built on the first call (no locking). */
+	if(!fdct_coeff_init){
+		double scale;
+
+		for(i=0;i<8;i++){
+			scale = (i==0) ? sqrt(0.125) : 0.5;
+			for(j=0;j<8;j++){
+				fdct_coeff[j][i] = scale *
+					cos((M_PI/8)*i*(j+0.5));
+			}
+		}
+		fdct_coeff_init = 1;
+	}
+	/* Row pass.  OIL_GET takes a byte offset, so the column index k is scaled by sizeof(double). */
+	for(i=0;i<8;i++){
+		for(j=0;j<8;j++){
+			x = 0;
+			for(k=0;k<8;k++){
+				x += fdct_coeff[k][j] *
+				  OIL_GET (src, sstr*i + k*(int)sizeof(double), double);
+			}
+			tmp[8*i+j] = x;
+		}
+	}
+	/* Column pass over the packed 8x8 scratch block; the output column index j is likewise scaled to bytes. */
+	for(j=0;j<8;j++){
+		for(i=0;i<8;i++){
+			x = 0;
+			for(k=0;k<8;k++){
+				x += fdct_coeff[k][i] * tmp[8*k + j];
+			}
+			OIL_GET (dest,dstr*i+j*(int)sizeof(double), double) = x;
+		}
+	}
+}
+
+OIL_DEFINE_IMPL (fdct8x8_f64_ref2, fdct8x8_f64_class);
+
+
+static void
+fdct8x8_f64_1d (double *dest, int dstr, double *src, int sstr)
+{
+	int i;
+	double tmp[64];
+	/* Row pass into the packed scratch block.  Arguments follow the fdct8_f64 class prototype (dest, src, dstr, sstr); the old order passed a stride where the source pointer belongs. */
+	for(i=0;i<8;i++){
+		fdct8_f64(tmp + i*8, OIL_OFFSET(src,sstr*i), sizeof(double),
+		    sizeof(double));
+	}
+	/* Column pass: input column i of tmp (elements 8 doubles apart) is written to output column i of dest with stride dstr. */
+	for(i=0;i<8;i++){
+		fdct8_f64(dest + i, tmp + i, dstr, 8*sizeof(double));
+	}
+}
+
+OIL_DEFINE_IMPL (fdct8x8_f64_1d, fdct8x8_f64_class);
+
+
+#ifdef TEST_fdct8x8_f64
+int TEST_fdct8x8_f64(void)
+{
+	int i;
+	int pass;
+	int failures = 0;
+	double *src, *dest_ref, *dest_test;
+	double sad;
+	double sad_sum;
+	double sad_max;
+	struct sl_profile_struct t;
+	/* NOTE(review): N_PASS, the sl_* helpers and fdct8x8_f64_FUNC are not defined in this file; only compiled when TEST_fdct8x8_f64 is defined. */
+	src = sl_malloc_f64(64);
+	dest_ref = sl_malloc_f64(64);
+	dest_test = sl_malloc_f64(64);
+	
+	sl_profile_init(t);
+	srand(20020306);
+
+	sad_sum = 0;
+	sad_max = 0;
+
+	printf("I: " sl_stringify(fdct8x8_f64_FUNC) "\n");
+
+	for(pass=0;pass<N_PASS;pass++){
+		for(i=0;i<64;i++)src[i] = sl_rand_f64_0_1();
+		/* NOTE(review): fdct8x8_f64_ref is declared as (dest, dstr, src, sstr) but is called as (dest, src, 8, 8); the reference output also lands in dest_test, not dest_ref. */
+		fdct8x8_f64_ref(dest_test, src, 8, 8);
+		sl_profile_start(t);
+		fdct8x8_f64_FUNC(dest_ref, src, 8, 8);
+		sl_profile_stop(t);
+		/* A pass counts as a failure when the summed absolute difference over the 64 outputs reaches 1.0. */
+		sad = 0;
+		for(i=0;i<64;i++)sad += fabs(dest_test[i] - dest_ref[i]);
+		if(sad_max<sad)sad_max = sad;
+		sad_sum += sad;
+		if(sad >= 1.0){
+			failures++;
+		}
+	}
+	printf("sad average: %g\n",sad_sum/N_PASS);
+	printf("sad max: %g\n",sad_max);
+
+	sl_free(src);
+	sl_free(dest_ref);
+	sl_free(dest_test);
+
+	if(failures){
+		printf("E: %d failures\n",failures);
+	}
+
+	sl_profile_print(t);
+
+	return failures;
+}
+#endif
+
diff --git a/liboil/dct/fdct8x8_s16.c b/liboil/dct/fdct8x8_s16.c
new file mode 100644
index 0000000..4b8f045
--- /dev/null
+++ b/liboil/dct/fdct8x8_s16.c
@@ -0,0 +1,126 @@
+/* forward discrete cosine transform on 8x8 block
+ * Copyright (C) 2001,2002  David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+Kernel: fdct8x8_s16
+Description: forward discrete cosine transform on 8x8 block
+
+XXX
+*/
+
+#ifndef _fdct8x8_s16_h_
+#define _fdct8x8_s16_h_
+
+#include <math.h>
+
+#include <sl_types.h>
+#include <sl_block8x8.h>
+
+/* storage class */
+#ifndef SL_fdct8x8_s16_storage
+ #ifdef SL_storage
+  #define SL_fdct8x8_s16_storage SL_storage
+ #else
+  #define SL_fdct8x8_s16_storage static inline
+ #endif
+#endif
+
+
+
+#include <fdct8x8_f64.h>
+#include <conv8x8_f64_s16.h>
+/* IMPL fdct8x8_s16_ref */
+SL_fdct8x8_s16_storage
+void fdct8x8_s16_ref(s16 *dest, s16 *src, int dstr, int sstr)
+{
+	f64 s[64], d[64];	/* f64 copies of the input block and of the transform output */
+	int i,j;
+	/* Copy each s16 element of src (row stride sstr) into the f64 block s (row stride 8*sizeof(f64)). */
+	for(i=0;i<8;i++){
+		for(j=0;j<8;j++){
+			block8x8_f64(s,8*sizeof(f64),i,j) =
+				block8x8_s16(src,sstr,i,j);
+		}
+	}
+	/* Run fdct8x8_f64 on the f64 copy, then pass d to conv8x8_f64_s16 to fill the s16 dest (legacy (dest, src, dstr, sstr) argument order). */
+	fdct8x8_f64(d,s,8*sizeof(f64),8*sizeof(f64));
+	conv8x8_f64_s16(dest,d,dstr,8*sizeof(f64));
+}
+#endif
+
+#ifdef TEST_fdct8x8_s16
+int TEST_fdct8x8_s16(void)
+{
+	int i;
+	int pass;
+	int failures = 0;
+	s16 *src, *dest_ref, *dest_test;
+	u32 sad;
+	u32 sad_sum;
+	u32 sad_max;
+	struct sl_profile_struct t;
+	/* Self-test: N_PASS rounds comparing fdct8x8_s16_FUNC against fdct8x8_s16_ref; returns the number of failed rounds. */
+	src = sl_malloc_s16(64);
+	dest_ref = sl_malloc_s16(64);
+	dest_test = sl_malloc_s16(64);
+	/* fixed seed; presumably makes the sl_rand_* input sequence repeatable -- TODO confirm */
+	sl_profile_init(t);
+	srand(20020306);
+
+	sad_sum = 0;
+	sad_max = 0;
+
+	printf("I: " sl_stringify(fdct8x8_s16_FUNC) "\n");
+
+	for(pass=0;pass<N_PASS;pass++){
+		for(i=0;i<64;i++)src[i] = sl_rand_s16_l9();
+		/* Both calls get the same input; only the _FUNC call sits between sl_profile_start and sl_profile_stop. */
+		fdct8x8_s16_ref(dest_ref, src, 8*sizeof(s16), 8*sizeof(s16));
+		sl_profile_start(t);
+		fdct8x8_s16_FUNC(dest_test, src, 8*sizeof(s16), 8*sizeof(s16));
+		sl_profile_stop(t);
+		/* sad: sum of absolute differences between the two outputs for this round */
+		sad = 0;
+		for(i=0;i<64;i++)sad += abs(dest_test[i] - dest_ref[i]);
+		if(sad_max<sad)sad_max = sad;
+		sad_sum += sad;
+		if(sad >= 64){	/* a round fails once the summed difference reaches 64; the three blocks are dumped */
+			block8x8_dump_s16(src, 8*sizeof(s16));
+			block8x8_dump_s16(dest_test, 8*sizeof(s16));
+			block8x8_dump_s16(dest_ref, 8*sizeof(s16));
+			failures++;
+		}
+	}
+	printf("sad average: %g\n",((double)sad_sum)/N_PASS);
+	printf("sad max: %d\n",sad_max);
+
+	sl_free(src);
+	sl_free(dest_ref);
+	sl_free(dest_test);
+
+	if(failures){
+		printf("E: %d failures\n",failures);
+	}
+
+	sl_profile_print(t);
+
+	return failures;
+}
+#endif
+
diff --git a/liboil/dct/fdct8x8s_s16.c b/liboil/dct/fdct8x8s_s16.c
new file mode 100644
index 0000000..4176298
--- /dev/null
+++ b/liboil/dct/fdct8x8s_s16.c
@@ -0,0 +1,138 @@
+/* liboil - Library of Optimized Inner Loops
+ * Copyright (C) 2003  David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/dct/dct.h>
+
+
+OIL_DEFINE_CLASS(fdct8x8s_s16, NULL);
+
+#define C0_9808 0.980785280	/* cos(1*pi/16) */
+#define C0_9239 0.923879532	/* cos(2*pi/16) */
+#define C0_8315 0.831469612	/* cos(3*pi/16) */
+#define C0_7071 0.707106781	/* cos(4*pi/16) */
+#define C0_5556 0.555570233	/* cos(5*pi/16) */
+#define C0_3827 0.382683432	/* cos(6*pi/16) */
+#define C0_1951 0.195090322	/* cos(7*pi/16) */
+
+/*
+Alternate scaling used by RTjpeg.
+*/
+
+
+
+static void
+fdct8x8s_s16_ref (int16_t *dest, int dstr, int16_t *src, int sstr)
+{
+	double s[64], d[64];	/* widened input and transform output, 8 doubles per row */
+	static const double scale[8] = {	/* scale[k] = 4*cos(k*pi/16), except scale[0] = 4*cos(pi/4) */
+		4*C0_7071,
+		4*C0_9808,
+		4*C0_9239,
+		4*C0_8315,
+		4*C0_7071,
+		4*C0_5556,
+		4*C0_3827,
+		4*C0_1951,
+	};
+	int i,j;
+	/* Widen the block to double. OIL_GET takes a byte offset, so the column index is scaled by the element size. */
+	for(i=0;i<8;i++){
+		for(j=0;j<8;j++){
+			s[8*i+j] = OIL_GET (src,sstr*i+j*(int)sizeof(int16_t), int16_t);
+		}
+	}
+	/* Run fdct8x8_f64 on the widened block (both strides are 8 doubles, in bytes). */
+	fdct8x8_f64(d,8*sizeof(double),s,8*sizeof(double));
+	/* Separable weighting: each coefficient is multiplied by its row factor times its column factor. */
+	for(i=0;i<8;i++){
+		for(j=0;j<8;j++){
+			d[8*i+j] *= scale[i] * scale[j];
+		}
+	}
+	/* Hand the scaled block to conv8x8_s16_f64 to fill the int16_t destination. */
+	conv8x8_s16_f64(dest,dstr,d,8*sizeof(double));
+}
+
+OIL_DEFINE_IMPL_REF (fdct8x8s_s16_ref, fdct8x8s_s16_class);
+
+#ifdef TEST_fdct8x8s_s16
+int TEST_fdct8x8s_s16(void)
+{
+	int i;
+	int pass;
+	int failures = 0;
+	int16_t *src, *dest_ref, *dest_test;
+	u32 sad;
+	u32 sad_sum;
+	u32 sad_max;
+	struct sl_profile_struct t;
+	/* Self-test: N_PASS rounds comparing fdct8x8s_s16_FUNC against fdct8x8s_s16_ref; returns the number of failed rounds. */
+	src = sl_malloc_s16(64);
+	dest_ref = sl_malloc_s16(64);
+	dest_test = sl_malloc_s16(64);
+	/* fixed seed; presumably makes the sl_rand_* input sequence repeatable -- TODO confirm */
+	sl_profile_init(t);
+	srand(20020306);
+
+	sad_sum = 0;
+	sad_max = 0;
+
+	printf("I: " sl_stringify(fdct8x8s_s16_FUNC) "\n");
+
+	for(pass=0;pass<N_PASS;pass++){
+		for(i=0;i<64;i++)src[i] = sl_rand_s16_l9();
+		/* NOTE(review): calls pass (dest, src, dstr, sstr) but fdct8x8s_s16_ref is declared (dest, dstr, src, sstr); s16/u32 and the sl_* helpers are not provided by this file's includes -- harness needs porting. */
+		fdct8x8s_s16_ref(dest_ref, src, 8*sizeof(s16), 8*sizeof(s16));
+		sl_profile_start(t);
+		fdct8x8s_s16_FUNC(dest_test, src, 8*sizeof(s16), 8*sizeof(s16));
+		sl_profile_stop(t);
+		/* sad: sum of absolute differences between the two outputs for this round */
+		sad = 0;
+		for(i=0;i<64;i++)sad += abs(dest_test[i] - dest_ref[i]);
+		if(sad_max<sad)sad_max = sad;
+		sad_sum += sad;
+		if(sad >= 128){	/* a round fails once the summed difference reaches 128 */
+			//block8x8_dump_s16(src, 8*sizeof(s16));
+			//block8x8_dump_s16(dest_ref, 8*sizeof(s16));
+			//block8x8_dump_s16(dest_test, 8*sizeof(s16));
+			//block8x8_dumpdiff_s16(dest_test, dest_ref, 8*sizeof(s16));
+			failures++;
+		}
+	}
+	printf("sad average: %g\n",((double)sad_sum)/N_PASS);
+	printf("sad max: %d\n",sad_max);
+
+	sl_free(src);
+	sl_free(dest_ref);
+	sl_free(dest_test);
+
+	if(failures){
+		printf("E: %d failures\n",failures);
+	}
+
+	sl_profile_print(t);
+
+	return failures;
+}
+#endif
+
diff --git a/liboil/dct/idct8_f64.c b/liboil/dct/idct8_f64.c
new file mode 100644
index 0000000..fa3a952
--- /dev/null
+++ b/liboil/dct/idct8_f64.c
@@ -0,0 +1,172 @@
+/* liboil - Library of Optimized Inner Loops
+ * Copyright (C) 2003  David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/dct/dct.h>
+#include <math.h>
+
+OIL_DEFINE_CLASS_X (idct8_f64, "double *dest, int dstr, double *src, int sstr");
+
+#define C0_9808 0.980785280	/* cos(1*pi/16) */
+#define C0_9239 0.923879532	/* cos(2*pi/16) */
+#define C0_8315 0.831469612	/* cos(3*pi/16) */
+#define C0_7071 0.707106781	/* cos(4*pi/16) */
+#define C0_5556 0.555570233	/* cos(5*pi/16) */
+#define C0_3827 0.382683432	/* cos(6*pi/16) */
+#define C0_1951 0.195090322	/* cos(7*pi/16) */
+
+static void idct8_f64_ref(double *dest, int dstr, double *src, int sstr)
+{
+	/* basis[n][k]: weight of input coefficient k in output sample n */
+	static double basis[8][8];
+	static int basis_ready = 0;
+	int n, k;
+
+	if(basis_ready == 0){
+		/* Fill the table once, on the first call. */
+		for(k=0;k<8;k++){
+			/* column k==0 is weighted by sqrt(1/8), every other column by 1/2 */
+			double weight = (k==0) ? sqrt(0.125) : 0.5;
+
+			for(n=0;n<8;n++){
+				basis[n][k] = weight * cos((M_PI/8)*k*(n+0.5));
+			}
+		}
+		basis_ready = 1;
+	}
+
+	for(n=0;n<8;n++){
+		double acc = 0;
+		for(k=0;k<8;k++){
+			acc += basis[n][k] * OIL_GET (src, sstr*k, double);
+		}
+		OIL_GET (dest, dstr*n, double) = acc;
+	}
+}
+
+OIL_DEFINE_IMPL_REF (idct8_f64_ref, idct8_f64_class);
+
+
+static void idct8_f64_fastx(double *dest, int dstr, double *src, int sstr)
+{
+	double s07, s16, s25, s34;
+	double d07, d16, d25, d34;
+	double ss07s34, ss16s25;
+	double ds07s34, ds16s25;
+	/* 8-point transform split into even and odd input halves; input k is OIL_GET(src, sstr*k, double). ss*: inputs 0 and 4. */
+	ss07s34 = C0_7071*(OIL_GET(src,sstr*0, double) + OIL_GET(src,sstr*4, double));
+	ss16s25 = C0_7071*(OIL_GET(src,sstr*0, double) - OIL_GET(src,sstr*4, double));
+	/* ds*: contribution of inputs 2 and 6 to the even part */
+	ds07s34 = C0_9239* OIL_GET(src,sstr*2, double) + C0_3827* OIL_GET(src,sstr*6, double);
+	ds16s25 = C0_3827* OIL_GET(src,sstr*2, double) - C0_9239* OIL_GET(src,sstr*6, double);
+	/* sAB: even-part term shared by outputs A and B (the pair named in the suffix) */
+	s07 = ss07s34 + ds07s34;
+	s34 = ss07s34 - ds07s34;
+
+	s16 = ss16s25 + ds16s25;
+	s25 = ss16s25 - ds16s25;
+	/* dAB: odd-part term for the same output pair, built from inputs 1, 3, 5 and 7 */
+	d07 = C0_9808* OIL_GET(src,sstr*1, double) + C0_8315* OIL_GET(src,sstr*3, double)
+		+ C0_5556* OIL_GET(src,sstr*5, double) + C0_1951* OIL_GET(src,sstr*7, double);
+	d16 = C0_8315* OIL_GET(src,sstr*1, double) - C0_1951* OIL_GET(src,sstr*3, double)
+		- C0_9808* OIL_GET(src,sstr*5, double) - C0_5556* OIL_GET(src,sstr*7, double);
+	d25 = C0_5556* OIL_GET(src,sstr*1, double) - C0_9808* OIL_GET(src,sstr*3, double)
+		+ C0_1951* OIL_GET(src,sstr*5, double) + C0_8315* OIL_GET(src,sstr*7, double);
+	d34 = C0_1951* OIL_GET(src,sstr*1, double) - C0_5556* OIL_GET(src,sstr*3, double)
+		+ C0_8315* OIL_GET(src,sstr*5, double) - C0_9808* OIL_GET(src,sstr*7, double);
+	/* Outputs k and 7-k are half the sum and half the difference of the matching s and d terms. */
+	OIL_GET(dest,dstr*0, double) = 0.5 * (s07 + d07);
+	OIL_GET(dest,dstr*1, double) = 0.5 * (s16 + d16);
+	OIL_GET(dest,dstr*2, double) = 0.5 * (s25 + d25);
+	OIL_GET(dest,dstr*3, double) = 0.5 * (s34 + d34);
+	OIL_GET(dest,dstr*4, double) = 0.5 * (s34 - d34);
+	OIL_GET(dest,dstr*5, double) = 0.5 * (s25 - d25);
+	OIL_GET(dest,dstr*6, double) = 0.5 * (s16 - d16);
+	OIL_GET(dest,dstr*7, double) = 0.5 * (s07 - d07);
+
+}
+
+OIL_DEFINE_IMPL (idct8_f64_fastx, idct8_f64_class);
+
+
+#ifdef TEST_idct8_f64
+int TEST_idct8_f64(void)
+{
+	int i;
+	int pass;
+	int failures = 0;
+	double *src, *dest_ref, *dest_test;
+	double sad;
+	double sad_sum;
+	double sad_max;
+	struct sl_profile_struct t;
+	/* Self-test: N_PASS rounds comparing idct8_f64_FUNC against idct8_f64_ref on 8-element vectors; returns the number of failed rounds. */
+	src = sl_malloc_f64(8);
+	dest_ref = sl_malloc_f64(8);
+	dest_test = sl_malloc_f64(8);
+	/* fixed seed; presumably makes the sl_rand_* input sequence repeatable -- TODO confirm */
+	sl_profile_init(t);
+	srand(20020306);
+
+	sad_sum = 0;
+	sad_max = 0;
+
+	printf("I: " sl_stringify(idct8_f64_FUNC) "\n");
+
+	for(pass=0;pass<N_PASS;pass++){
+		for(i=0;i<8;i++)src[i] = sl_rand_f64_0_1();
+
+		//block8_dump(src);
+		/* NOTE(review): calls pass (dest, src, 8, 8) but idct8_f64_ref is declared (dest, dstr, src, sstr) -- argument order needs updating. Note dest_test holds the _ref output here. */
+		idct8_f64_ref(dest_test, src, 8, 8);
+		//block8_dump(dest_test);
+
+		sl_profile_start(t);
+		idct8_f64_FUNC(dest_ref, src, 8, 8);
+		sl_profile_stop(t);
+		//block8_dump(dest_ref);
+		/* sad: sum of absolute differences between the two outputs for this round */
+		sad = 0;
+		for(i=0;i<8;i++)sad += fabs(dest_test[i] - dest_ref[i]);
+		if(sad_max<sad)sad_max = sad;
+		sad_sum += sad;
+		if(sad >= 1.0){	/* a round fails once the summed difference reaches 1.0 */
+			failures++;
+		}
+	}
+	printf("sad average: %g\n",sad_sum/N_PASS);
+	printf("sad max: %g\n",sad_max);
+
+	sl_free(src);
+	sl_free(dest_ref);
+	sl_free(dest_test);
+
+	if(failures){
+		printf("E: %d failures\n",failures);
+	}
+
+	sl_profile_print(t);
+
+	return failures;
+}
+#endif
+
diff --git a/liboil/dct/idct8x8_c.c b/liboil/dct/idct8x8_c.c
new file mode 100644
index 0000000..648065c
--- /dev/null
+++ b/liboil/dct/idct8x8_c.c
@@ -0,0 +1,118 @@
+/* liboil - Library of Optimized Inner Loops
+ * Copyright (C) 2001,2002,2003  David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboil.h>
+#include <liboil/dct/dct.h>
+#include <math.h>
+
+/* for bootstrapping */
+#ifndef idct8x8_f64
+extern OilFunctionClass _oil_function_idct8x8_f64_class;
+#define idct8x8_f64 ((void (*)(double *dest, int dstr, double *src, int sstr)) \
+            _oil_function_idct8x8_f64_class.func)
+#endif
+/* 8x8 block accessors: stride is the byte distance between rows, column counts elements. Arguments are parenthesized so expressions can be passed. */
+#define BLOCK8x8_F64(ptr, stride, row, column) \
+	(*((double *)((char *)(ptr) + (stride)*(row)) + (column)))
+/* Same addressing as BLOCK8x8_F64, but yields the element's address instead of the element. */
+#define BLOCK8x8_PTR_F64(ptr, stride, row, column) \
+	((double *)((char *)(ptr) + (stride)*(row)) + (column))
+/* int16_t counterpart of BLOCK8x8_F64. */
+#define BLOCK8x8_S16(ptr, stride, row, column) \
+	(*((int16_t *)((char *)(ptr) + (stride)*(row)) + (column)))
+
+OIL_DEFINE_CLASS_X (idct8x8_f64, "double *dest, int dstr, double *src, int sstr");
+OIL_DEFINE_CLASS_X (idct8x8_s16, "int16_t *dest, int dstr, int16_t *src, int sstr");
+
+static void
+idct8x8_f64_slow (double *dest, int dstr, double *src, int sstr)
+{
+	/* basis[n][k]: weight of coefficient k in output sample n */
+	static double basis[8][8];
+	static int basis_ready = 0;
+	int row, col, u, v;
+
+	if(basis_ready == 0){
+		/* Build the cosine table once, on the first call. */
+		for(u=0;u<8;u++){
+			double weight = (u==0) ? sqrt(0.125) : 0.5;
+
+			for(row=0;row<8;row++){
+				basis[row][u] = weight * cos((M_PI/8)*u*(row+0.5));
+			}
+		}
+		basis_ready = 1;
+	}
+	/* Direct double sum: every output sample reads all 64 inputs. */
+	for(row=0;row<8;row++){
+		for(col=0;col<8;col++){
+			double total = 0;
+
+			for(u=0;u<8;u++){
+				/* inner sum runs along input row u */
+				double partial = 0;
+				for(v=0;v<8;v++){
+					partial += basis[col][v] * BLOCK8x8_F64(src,sstr,u,v);
+				}
+				total += basis[row][u] * partial;
+			}
+			BLOCK8x8_F64(dest,dstr,row,col) = total;
+		}
+	}
+}
+
+OIL_DEFINE_IMPL (idct8x8_f64_slow, idct8x8_f64_class);
+
+static void
+idct8x8_f64_c (double *dest, int dstr, double *src, int sstr)
+{
+	double rows[64];
+	const int rowstr = 8*sizeof(double);	/* byte stride of rows[] */
+	int n;
+	/* Pass 1: idct8_f64 along each source row, into rows[]. */
+	for(n=0;n<8;n++){
+		idct8_f64(BLOCK8x8_PTR_F64(rows,rowstr,n,0), sizeof(double),
+		    BLOCK8x8_PTR_F64(src,sstr,n,0), sizeof(double));
+	}
+	/* Pass 2: idct8_f64 down each column of rows[], into the same column of dest. */
+	for(n=0;n<8;n++){
+		double *out = BLOCK8x8_PTR_F64(dest,dstr,0,n);
+		double *in = BLOCK8x8_PTR_F64(rows,rowstr,0,n);
+		idct8_f64(out, dstr, in, rowstr);
+	}
+}
+
+OIL_DEFINE_IMPL_DEPENDS (idct8x8_f64_c, idct8x8_f64_class, idct8_f64_class);
+
+static void
+idct8x8_s16_slow (int16_t *dest, int dstr, int16_t *src, int sstr)
+{
+	double s[64], d[64];	/* double-precision input and output blocks, 8 doubles per row */
+	/* int16_t -> double, idct8x8_f64 in double precision, then back to int16_t (direction inferred from the conv8x8_* call arguments; TODO confirm). */
+	conv8x8_f64_s16 (s,8*sizeof(double),src,sstr);
+	idct8x8_f64 (d,8*sizeof(double),s,8*sizeof(double));
+	conv8x8_s16_f64 (dest,dstr,d,8*sizeof(double));
+}
+/* Every dependency is named by its class symbol (with the _class suffix), as in idct8x8_f64_c. */
+OIL_DEFINE_IMPL_DEPENDS (idct8x8_s16_slow, idct8x8_s16_class,
+    conv8x8_f64_s16_class, idct8x8_f64_class, conv8x8_s16_f64_class);
+
diff --git a/liboil/dct/idct8x8_f64.c b/liboil/dct/idct8x8_f64.c
new file mode 100644
index 0000000..9e46523
--- /dev/null
+++ b/liboil/dct/idct8x8_f64.c
@@ -0,0 +1,206 @@
+/* inverse discrete cosine transform on 8x8 block
+ * Copyright (C) 2001,2002  David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+Kernel: idct8x8_f64
+Description: inverse discrete cosine transform on 8x8 block
+
+XXX
+*/
+
+#ifndef _idct8x8_f64_h_
+#define _idct8x8_f64_h_
+
+#include <math.h>
+
+#include <sl_types.h>
+#include <sl_block8x8.h>
+
+/* storage class */
+#ifndef SL_idct8x8_f64_storage
+ #ifdef SL_storage
+  #define SL_idct8x8_f64_storage SL_storage
+ #else
+  #define SL_idct8x8_f64_storage static inline
+ #endif
+#endif
+
+
+/* IMPL idct8x8_f64_ref */
+SL_idct8x8_f64_storage
+void idct8x8_f64_ref(f64 *dest, f64 *src, int dstr, int sstr)
+{
+	static f64 idct_coeff[8][8];	/* idct_coeff[n][k]: weight of coefficient k in output sample n */
+	static int idct_coeff_init = 0;
+	int i,j,k,l;
+	f64 tmp1,tmp2;
+	/* Build the cosine table once, on the first call. */
+	if(!idct_coeff_init){
+		f64 scale;
+		/* coefficient 0 is weighted by sqrt(1/8), every other coefficient by 1/2 */
+		for(i=0;i<8;i++){
+			scale = (i==0) ? sqrt(0.125) : 0.5;
+			for(j=0;j<8;j++){
+				idct_coeff[j][i] = scale *
+					cos((M_PI/8)*i*(j+0.5));
+			}
+		}
+		idct_coeff_init = 1;
+	}
+	/* Direct double sum: output (i,j) accumulates over every input (k,l). */
+	for(i=0;i<8;i++){
+		for(j=0;j<8;j++){
+			tmp1 = 0;
+			for(k=0;k<8;k++){
+				tmp2 = 0;	/* partial sum along input row k */
+				for(l=0;l<8;l++){
+					tmp2 += idct_coeff[j][l] *
+						block8x8_f64(src,sstr,k,l);
+				}
+				tmp1 += idct_coeff[i][k] * tmp2;
+			}
+			block8x8_f64(dest,dstr,i,j) = tmp1;
+		}
+	}
+}
+
+/* IMPL idct8x8_f64_ref2 */
+SL_idct8x8_f64_storage
+void idct8x8_f64_ref2(f64 *dest, f64 *src, int dstr, int sstr)
+{
+	static f64 idct_coeff[8][8];	/* idct_coeff[n][k]: weight of coefficient k in output sample n */
+	static int idct_coeff_init = 0;
+	int i,j,k;
+	f64 x;
+	f64 tmp[64];	/* row-pass result, 8 values per row */
+	/* Build the cosine table once, on the first call. */
+	if(!idct_coeff_init){
+		f64 scale;
+		/* coefficient 0 is weighted by sqrt(1/8), every other coefficient by 1/2 */
+		for(i=0;i<8;i++){
+			scale = (i==0) ? sqrt(0.125) : 0.5;
+			for(j=0;j<8;j++){
+				idct_coeff[j][i] = scale *
+					cos((M_PI/8)*i*(j+0.5));
+			}
+		}
+		idct_coeff_init = 1;
+	}
+	/* Pass 1: apply the table along each source row i, storing into tmp. */
+	for(i=0;i<8;i++){
+		for(j=0;j<8;j++){
+			x = 0;
+			for(k=0;k<8;k++){
+				x += idct_coeff[j][k] *
+					block8x8_f64(src,sstr,i,k);
+			}
+			tmp[8*i+j] = x;
+		}
+	}
+	/* Pass 2: apply the table down each column j of tmp, storing into dest. */
+	for(j=0;j<8;j++){
+		for(i=0;i<8;i++){
+			x = 0;
+			for(k=0;k<8;k++){
+				x += idct_coeff[i][k] * tmp[8*k + j];
+			}
+			block8x8_f64(dest,dstr,i,j) = x;
+		}
+	}
+}
+
+#define f64_addr(base,str,i) ((f64 *)((void *)(base) + (str)*(i)))
+/* f64_addr: address (str)*(i) bytes past base, as an f64 pointer (relies on void-pointer arithmetic). */
+#include <idct8_f64.h>
+/* IMPL idct8x8_f64_1d */
+SL_idct8x8_f64_storage
+void idct8x8_f64_1d(f64 *dest, f64 *src, int dstr, int sstr)
+{
+	int i;
+	f64 tmp[64];	/* row-pass result, 8 values per row */
+	/* Pass 1 runs idct8_f64_fast along each source row; pass 2 runs it down each column of tmp (legacy (dest, src, dstr, sstr) order). */
+	for(i=0;i<8;i++){
+		idct8_f64_fast(tmp + i*8, f64_addr(src,sstr,i), sizeof(f64), sizeof(f64));
+	}
+	for(i=0;i<8;i++){
+		idct8_f64_fast(dest + i, tmp + i, dstr, 8*sizeof(f64));
+	}
+}
+
+
+#endif
+
+
+#ifdef TEST_idct8x8_f64
+int TEST_idct8x8_f64(void)
+{
+	int i;
+	int pass;
+	int failures = 0;
+	f64 *src, *dest_ref, *dest_test;
+	f64 sad;
+	f64 sad_sum;
+	f64 sad_max;
+	struct sl_profile_struct t;
+	/* Self-test: N_PASS rounds comparing idct8x8_f64_FUNC against idct8x8_f64_ref; returns the number of failed rounds. */
+	src = sl_malloc_f64(64);
+	dest_ref = sl_malloc_f64(64);
+	dest_test = sl_malloc_f64(64);
+	/* fixed seed; presumably makes the sl_rand_* input sequence repeatable -- TODO confirm */
+	sl_profile_init(t);
+	srand(20020306);
+
+	sad_sum = 0;
+	sad_max = 0;
+
+	printf("I: " sl_stringify(idct8x8_f64_FUNC) "\n");
+
+	for(pass=0;pass<N_PASS;pass++){
+		for(i=0;i<64;i++)src[i] = sl_rand_f64_0_1();
+		/* NOTE(review): strides are passed as 8, whereas the s16 wrappers in this patch call idct8x8_f64 with 8*sizeof(f64) -- confirm stride units. Note dest_test holds the _ref output here. */
+		idct8x8_f64_ref(dest_test, src, 8, 8);
+		sl_profile_start(t);
+		idct8x8_f64_FUNC(dest_ref, src, 8, 8);
+		sl_profile_stop(t);
+		/* sad: sum of absolute differences between the two outputs for this round */
+		sad = 0;
+		for(i=0;i<64;i++)sad += fabs(dest_test[i] - dest_ref[i]);
+		if(sad_max<sad)sad_max = sad;
+		sad_sum += sad;
+		if(sad >= 1.0){	/* a round fails once the summed difference reaches 1.0 */
+			failures++;
+		}
+	}
+	printf("sad average: %g\n",sad_sum/N_PASS);
+	printf("sad max: %g\n",sad_max);
+
+	sl_free(src);
+	sl_free(dest_ref);
+	sl_free(dest_test);
+
+	if(failures){
+		printf("E: %d failures\n",failures);
+	}
+
+	sl_profile_print(t);
+
+	return failures;
+}
+#endif
+
diff --git a/liboil/dct/idct8x8_s16.c b/liboil/dct/idct8x8_s16.c
new file mode 100644
index 0000000..1e58026
--- /dev/null
+++ b/liboil/dct/idct8x8_s16.c
@@ -0,0 +1,144 @@
+/* inverse discrete cosine transform on 8x8 block
+ * Copyright (C) 2001,2002  David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+Kernel: idct8x8_s16
+Description: inverse discrete cosine transform on 8x8 block
+
+XXX
+*/
+
+#ifndef _idct8x8_s16_h_
+#define _idct8x8_s16_h_
+
+#include <math.h>
+
+#include <sl_types.h>
+#include <sl_block8x8.h>
+
+/* storage class */
+#ifndef SL_idct8x8_s16_storage
+ #ifdef SL_storage
+  #define SL_idct8x8_s16_storage SL_storage
+ #else
+  #define SL_idct8x8_s16_storage static inline
+ #endif
+#endif
+
+
+
+#include <idct8x8_f64.h>
+#include <conv8x8_f64_s16.h>
+/* IMPL idct8x8_s16_ref */
+SL_idct8x8_s16_storage
+void idct8x8_s16_ref(s16 *dest, s16 *src, int dstr, int sstr)
+{
+	f64 s[64], d[64];	/* f64 copies of the input block and of the transform output */
+	int i,j;
+	/* Copy each s16 element of src (row stride sstr) into the f64 block s (row stride 8*sizeof(f64)). */
+	for(i=0;i<8;i++){
+		for(j=0;j<8;j++){
+			block8x8_f64(s,8*sizeof(f64),i,j) =
+				block8x8_s16(src,sstr,i,j);
+		}
+	}
+	/* Uses the _ref variants of both helpers: idct8x8_f64_ref, then conv8x8_f64_s16_ref to fill the s16 dest. */
+	idct8x8_f64_ref(d,s,8*sizeof(f64),8*sizeof(f64));
+	conv8x8_f64_s16_ref(dest,d,dstr,8*sizeof(f64));
+}
+
+/* IMPL idct8x8_s16_fast */
+SL_idct8x8_s16_storage
+void idct8x8_s16_fast(s16 *dest, s16 *src, int dstr, int sstr)
+{
+	f64 s[64], d[64];	/* f64 copies of the input block and of the transform output */
+	int i,j;
+	/* Copy each s16 element of src (row stride sstr) into the f64 block s (row stride 8*sizeof(f64)). */
+	for(i=0;i<8;i++){
+		for(j=0;j<8;j++){
+			block8x8_f64(s,8*sizeof(f64),i,j) =
+				block8x8_s16(src,sstr,i,j);
+		}
+	}
+	/* Same steps as idct8x8_s16_ref, but through the unsuffixed idct8x8_f64 and conv8x8_f64_s16 entry points. */
+	idct8x8_f64(d,s,8*sizeof(f64),8*sizeof(f64));
+	conv8x8_f64_s16(dest,d,dstr,8*sizeof(f64));
+}
+#endif
+
+#ifdef TEST_idct8x8_s16
+int TEST_idct8x8_s16(void)
+{
+	int i;
+	int pass;
+	int failures = 0;
+	s16 *src, *dest_ref, *dest_test;
+	u32 sad;
+	u32 sad_sum;
+	u32 sad_max;
+	struct sl_profile_struct t;
+	/* Self-test: N_PASS rounds comparing idct8x8_s16_FUNC against idct8x8_s16_ref; returns the number of failed rounds. */
+	src = sl_malloc_s16(64);
+	dest_ref = sl_malloc_s16(64);
+	dest_test = sl_malloc_s16(64);
+	/* fixed seed; presumably makes the sl_rand_* input sequence repeatable -- TODO confirm */
+	sl_profile_init(t);
+	srand(20020306);
+
+	sad_sum = 0;
+	sad_max = 0;
+
+	printf("I: " sl_stringify(idct8x8_s16_FUNC) "\n");
+
+	for(pass=0;pass<N_PASS;pass++){
+		for(i=0;i<64;i++)src[i] = sl_rand_s16_l9();
+		/* Both calls get the same input; only the _FUNC call sits between sl_profile_start and sl_profile_stop. */
+		idct8x8_s16_ref(dest_ref, src, 8*sizeof(s16), 8*sizeof(s16));
+		sl_profile_start(t);
+		idct8x8_s16_FUNC(dest_test, src, 8*sizeof(s16), 8*sizeof(s16));
+		sl_profile_stop(t);
+		/* sad: sum of absolute differences between the two outputs for this round */
+		sad = 0;
+		for(i=0;i<64;i++)sad += abs(dest_test[i] - dest_ref[i]);
+		if(sad_max<sad)sad_max = sad;
+		sad_sum += sad;
+		if(sad >= 64){	/* a round fails once the summed difference reaches 64; the three blocks are dumped */
+			block8x8_dump_s16(src, 8*sizeof(s16));
+			block8x8_dump_s16(dest_test, 8*sizeof(s16));
+			block8x8_dump_s16(dest_ref, 8*sizeof(s16));
+			failures++;
+		}
+	}
+	printf("sad average: %g\n",((double)sad_sum)/N_PASS);
+	printf("sad max: %d\n",sad_max);
+
+	sl_free(src);
+	sl_free(dest_ref);
+	sl_free(dest_test);
+
+	if(failures){
+		printf("E: %d failures\n",failures);
+	}
+
+	sl_profile_print(t);
+
+	return failures;
+}
+#endif
+
diff --git a/liboil/dct/idct8x8s_s16.c b/liboil/dct/idct8x8s_s16.c
new file mode 100644
index 0000000..59e9709
--- /dev/null
+++ b/liboil/dct/idct8x8s_s16.c
@@ -0,0 +1,148 @@
+/* inverse discrete cosine transform on 8x8 block
+ * Copyright (C) 2001,2002  David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+Kernel: idct8x8s_s16
+Description: inverse discrete cosine transform on 8x8 block
+
+Alternate scaling used by RTjpeg.
+*/
+
+#ifndef _idct8x8s_s16_h_
+#define _idct8x8s_s16_h_
+
+#include <math.h>
+
+#include <sl_types.h>
+#include <sl_block8x8.h>
+
+/* storage class */
+#ifndef SL_idct8x8s_s16_storage
+ #ifdef SL_storage
+  #define SL_idct8x8s_s16_storage SL_storage
+ #else
+  #define SL_idct8x8s_s16_storage static inline
+ #endif
+#endif
+
+
+/* extras */
+#include <rtjpeg/idct8x8s_s16.h>
+
+
+#include <idct8x8_f64.h>
+#include <conv8x8_f64_s16.h>
+/* IMPL idct8x8s_s16_ref */
+SL_idct8x8s_s16_storage
+void idct8x8s_s16_ref(s16 *dest, s16 *src, int dstr, int sstr)
+{
+	f64 s[64], d[64];	/* f64 copies of the input block and of the transform output */
+	const f64 scale[8] = {	/* per-index input weights 2.0/C0_*; the C0_* constants are not defined in this file (presumably from the rtjpeg header -- TODO confirm) */
+		2.0/C0_7071,
+		2.0/C0_9808,
+		2.0/C0_9239,
+		2.0/C0_8315,
+		2.0/C0_7071,
+		2.0/C0_5556,
+		2.0/C0_3827,
+		2.0/C0_1951,
+	};
+	int i,j;
+	/* Copy each s16 element of src (row stride sstr) into the f64 block s. */
+	for(i=0;i<8;i++){
+		for(j=0;j<8;j++){
+			block8x8_f64(s,8*sizeof(f64),i,j) =
+				block8x8_s16(src,sstr,i,j);
+		}
+	}
+	/* Weight each input by its row factor times its column factor before transforming. */
+	for(i=0;i<8;i++){
+		for(j=0;j<8;j++){
+			block8x8_f64(s,8*sizeof(f64),i,j) *= scale[i] * scale[j];
+		}
+	}
+	/* Run idct8x8_f64 on the weighted block, then pass d to conv8x8_f64_s16 to fill the s16 dest. */
+	idct8x8_f64(d,s,8*sizeof(f64),8*sizeof(f64));
+
+	conv8x8_f64_s16(dest,d,dstr,8*sizeof(f64));
+}
+#endif
+
+#ifdef TEST_idct8x8s_s16
+int TEST_idct8x8s_s16(void)
+{
+	int i;
+	int pass;
+	int failures = 0;
+	s16 *src, *dest_ref, *dest_test;
+	u32 sad;
+	u32 sad_sum;
+	u32 sad_max;
+	struct sl_profile_struct t;
+	/* Self-test: N_PASS rounds comparing idct8x8s_s16_FUNC against idct8x8s_s16_ref; returns the number of failed rounds. */
+	src = sl_malloc_s16(64);
+	dest_ref = sl_malloc_s16(64);
+	dest_test = sl_malloc_s16(64);
+	/* srand() is still called, but the active input below is deterministic (the random fill is commented out). */
+	sl_profile_init(t);
+	srand(20020306);
+
+	sad_sum = 0;
+	sad_max = 0;
+
+	printf("I: " sl_stringify(idct8x8s_s16_FUNC) "\n");
+
+	for(pass=0;pass<N_PASS;pass++){
+		//for(i=0;i<64;i++)src[i] = sl_rand_s16_l9();
+		for(i=0;i<64;i++)src[i] = (i==pass)*1000;	/* single value of 1000 at index pass; all zeros once pass >= 64 */
+
+		idct8x8s_s16_ref(dest_ref, src, 8*sizeof(s16), 8*sizeof(s16));
+		sl_profile_start(t);
+		idct8x8s_s16_FUNC(dest_test, src, 8*sizeof(s16), 8*sizeof(s16));
+		sl_profile_stop(t);
+		/* sad: sum of absolute differences between the two outputs for this round */
+		sad = 0;
+		for(i=0;i<64;i++)sad += abs(dest_test[i] - dest_ref[i]);
+		if(sad_max<sad)sad_max = sad;
+		sad_sum += sad;
+		if(sad >= 128){	/* a round fails once the summed difference reaches 128; blocks and their ratio are dumped */
+			block8x8_dump_s16(src, 8*sizeof(s16));
+			block8x8_dump_s16(dest_ref, 8*sizeof(s16));
+			block8x8_dump_s16(dest_test, 8*sizeof(s16));
+			block8x8_dumpratio_s16(dest_test, dest_ref, 8*sizeof(s16));
+			failures++;
+		}
+	}
+	printf("sad average: %g\n",((double)sad_sum)/N_PASS);
+	printf("sad max: %d\n",sad_max);
+
+	sl_free(src);
+	sl_free(dest_ref);
+	sl_free(dest_test);
+
+	if(failures){
+		printf("E: %d failures\n",failures);
+	}
+
+	sl_profile_print(t);
+
+	return failures;
+}
+#endif
+
diff --git a/liboil/dct/imdct32_f32.c b/liboil/dct/imdct32_f32.c
new file mode 100644
index 0000000..0a39950
--- /dev/null
+++ b/liboil/dct/imdct32_f32.c
@@ -0,0 +1,438 @@
+/* liboil - Library of Optimized Inner Loops
+ * Copyright (C) 2003  David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/dct/dct.h>
+#include <math.h>
+
+/* Function class for imdct32_f32; the string spells out its argument list.
+ * NOTE(review): the exact expansion of OIL_DEFINE_CLASS_X is defined in
+ * liboilfunction.h and is not visible here -- confirm before relying on it. */
+OIL_DEFINE_CLASS_X (imdct32_f32, "float *dest, float *src");
+
+/*
+ * Straightforward reference implementation.
+ *
+ * For every output index k in 0..31:
+ *     dest[k] = sum over n in 0..31 of cos((pi/32) * k * (n + 0.5)) * src[n]
+ * The sum is accumulated in double and narrowed to float on store.
+ */
+static void imdct32_f32_ref (float *dest, float *src)
+{
+	int k, n;
+
+	for (k = 0; k < 32; k++) {
+		/* Row-invariant factor; the product below still groups as
+		 * ((M_PI/32)*k)*(n+0.5). */
+		const double step = (M_PI/32) * k;
+		double acc = 0;
+
+		for (n = 0; n < 32; n++) {
+			acc += cos(step * (n + 0.5)) * src[n];
+		}
+		dest[k] = acc;
+	}
+}
+
+/* Ties imdct32_f32_ref to imdct32_f32_class; presumably marks it as the
+ * class's reference implementation -- confirm against OIL_DEFINE_IMPL_REF. */
+OIL_DEFINE_IMPL_REF (imdct32_f32_ref, imdct32_f32_class);
+
+/* from mpglib */
+/*
+ * Discrete Cosine Transform (DCT) for subband synthesis
+ * optimized for machines with no auto-increment.
+ * The performance is highly compiler dependent. Maybe
+ * the dct64.c version for 'normal' processors may be faster
+ * even for Intel processors.
+ */
+
+/*
+ * Butterfly implementation of the 32-point transform.
+ *
+ * dest:    receives 32 output values (every index 0..31 is written).
+ * samples: 32 input values; only read.
+ *
+ * On the first call the five static tables are filled with
+ *     cosN[k] = 1 / (2 * cos(pi * (2k + 1) / N))
+ * All 16 slots of each table are computed, but the stages below read only
+ * cos64[0..15], cos32[0..7], cos16[0..3], cos8[0..1] and cos4[0].
+ *
+ * The five stages ping-pong between the scratch arrays b1 and b2.  The
+ * final stage and the output scatter update b1 in place, so the order of
+ * those statements matters.
+ *
+ * NOTE(review): `done` is set before the tables are filled and nothing
+ * here synchronizes the initialization -- looks unsafe if the first calls
+ * can come from several threads at once; confirm against callers.
+ */
+static void imdct32_f32_mpglib(float *dest,float *samples)
+{
+	static float cos64[16];
+	static float cos32[16];
+	static float cos16[16];
+	static float cos8[16];
+	static float cos4[16];
+	float b1[32];
+	float b2[32];
+	static int done = 0;
+
+	/* One-time table setup (see the NOTE in the header comment). */
+	if(!done){
+		int k;
+
+		done = 1;
+		for(k=0;k<16;k++){
+			cos64[k] = 1.0 / (2.0 * cos(M_PI * (k * 2.0 + 1.0) / 64.0));
+			cos32[k] = 1.0 / (2.0 * cos(M_PI * (k * 2.0 + 1.0) / 32.0));
+			cos16[k] = 1.0 / (2.0 * cos(M_PI * (k * 2.0 + 1.0) / 16.0));
+			cos8[k] = 1.0 / (2.0 * cos(M_PI * (k * 2.0 + 1.0) / 8.0));
+			cos4[k] = 1.0 / (2.0 * cos(M_PI * (k * 2.0 + 1.0) / 4.0));
+		}
+	}
+
+ /* Stage 1 (samples -> b1): pair samples[k] with samples[31-k].  The sum
+  * goes to b1[k]; the difference scaled by cos64[k] goes to b1[31-k]. */
+ {
+  float *costab = cos64;
+
+  b1[0x00] = samples[0x00] + samples[0x1F];
+  b1[0x1F] = (samples[0x00] - samples[0x1F]) * costab[0x0];
+
+  b1[0x01] = samples[0x01] + samples[0x1E];
+  b1[0x1E] = (samples[0x01] - samples[0x1E]) * costab[0x1];
+
+  b1[0x02] = samples[0x02] + samples[0x1D];
+  b1[0x1D] = (samples[0x02] - samples[0x1D]) * costab[0x2];
+
+  b1[0x03] = samples[0x03] + samples[0x1C];
+  b1[0x1C] = (samples[0x03] - samples[0x1C]) * costab[0x3];
+
+  b1[0x04] = samples[0x04] + samples[0x1B];
+  b1[0x1B] = (samples[0x04] - samples[0x1B]) * costab[0x4];
+
+  b1[0x05] = samples[0x05] + samples[0x1A];
+  b1[0x1A] = (samples[0x05] - samples[0x1A]) * costab[0x5];
+
+  b1[0x06] = samples[0x06] + samples[0x19];
+  b1[0x19] = (samples[0x06] - samples[0x19]) * costab[0x6];
+
+  b1[0x07] = samples[0x07] + samples[0x18];
+  b1[0x18] = (samples[0x07] - samples[0x18]) * costab[0x7];
+
+  b1[0x08] = samples[0x08] + samples[0x17];
+  b1[0x17] = (samples[0x08] - samples[0x17]) * costab[0x8];
+
+  b1[0x09] = samples[0x09] + samples[0x16];
+  b1[0x16] = (samples[0x09] - samples[0x16]) * costab[0x9];
+
+  b1[0x0A] = samples[0x0A] + samples[0x15];
+  b1[0x15] = (samples[0x0A] - samples[0x15]) * costab[0xA];
+
+  b1[0x0B] = samples[0x0B] + samples[0x14];
+  b1[0x14] = (samples[0x0B] - samples[0x14]) * costab[0xB];
+
+  b1[0x0C] = samples[0x0C] + samples[0x13];
+  b1[0x13] = (samples[0x0C] - samples[0x13]) * costab[0xC];
+
+  b1[0x0D] = samples[0x0D] + samples[0x12];
+  b1[0x12] = (samples[0x0D] - samples[0x12]) * costab[0xD];
+
+  b1[0x0E] = samples[0x0E] + samples[0x11];
+  b1[0x11] = (samples[0x0E] - samples[0x11]) * costab[0xE];
+
+  b1[0x0F] = samples[0x0F] + samples[0x10];
+  b1[0x10] = (samples[0x0F] - samples[0x10]) * costab[0xF];
+ }
+
+
+ /* Stage 2 (b1 -> b2): the same butterfly inside each 16-element half,
+  * using cos32[0..7].  In the upper half (0x10..0x1F) the difference is
+  * taken the other way round (high index minus low index). */
+ {
+  float *costab = cos32;
+
+  b2[0x00] = b1[0x00] + b1[0x0F]; 
+  b2[0x0F] = (b1[0x00] - b1[0x0F]) * costab[0];
+  b2[0x01] = b1[0x01] + b1[0x0E]; 
+  b2[0x0E] = (b1[0x01] - b1[0x0E]) * costab[1];
+  b2[0x02] = b1[0x02] + b1[0x0D]; 
+  b2[0x0D] = (b1[0x02] - b1[0x0D]) * costab[2];
+  b2[0x03] = b1[0x03] + b1[0x0C]; 
+  b2[0x0C] = (b1[0x03] - b1[0x0C]) * costab[3];
+  b2[0x04] = b1[0x04] + b1[0x0B]; 
+  b2[0x0B] = (b1[0x04] - b1[0x0B]) * costab[4];
+  b2[0x05] = b1[0x05] + b1[0x0A]; 
+  b2[0x0A] = (b1[0x05] - b1[0x0A]) * costab[5];
+  b2[0x06] = b1[0x06] + b1[0x09]; 
+  b2[0x09] = (b1[0x06] - b1[0x09]) * costab[6];
+  b2[0x07] = b1[0x07] + b1[0x08]; 
+  b2[0x08] = (b1[0x07] - b1[0x08]) * costab[7];
+
+  b2[0x10] = b1[0x10] + b1[0x1F];
+  b2[0x1F] = (b1[0x1F] - b1[0x10]) * costab[0];
+  b2[0x11] = b1[0x11] + b1[0x1E];
+  b2[0x1E] = (b1[0x1E] - b1[0x11]) * costab[1];
+  b2[0x12] = b1[0x12] + b1[0x1D];
+  b2[0x1D] = (b1[0x1D] - b1[0x12]) * costab[2];
+  b2[0x13] = b1[0x13] + b1[0x1C];
+  b2[0x1C] = (b1[0x1C] - b1[0x13]) * costab[3];
+  b2[0x14] = b1[0x14] + b1[0x1B];
+  b2[0x1B] = (b1[0x1B] - b1[0x14]) * costab[4];
+  b2[0x15] = b1[0x15] + b1[0x1A];
+  b2[0x1A] = (b1[0x1A] - b1[0x15]) * costab[5];
+  b2[0x16] = b1[0x16] + b1[0x19];
+  b2[0x19] = (b1[0x19] - b1[0x16]) * costab[6];
+  b2[0x17] = b1[0x17] + b1[0x18];
+  b2[0x18] = (b1[0x18] - b1[0x17]) * costab[7];
+ }
+
+ /* Stage 3 (b2 -> b1): butterflies inside each group of 8, using
+  * cos16[0..3].  The groups starting at 0x08 and 0x18 take the
+  * difference as high index minus low index. */
+ {
+  float *costab = cos16;
+
+  b1[0x00] = b2[0x00] + b2[0x07];
+  b1[0x07] = (b2[0x00] - b2[0x07]) * costab[0];
+  b1[0x01] = b2[0x01] + b2[0x06];
+  b1[0x06] = (b2[0x01] - b2[0x06]) * costab[1];
+  b1[0x02] = b2[0x02] + b2[0x05];
+  b1[0x05] = (b2[0x02] - b2[0x05]) * costab[2];
+  b1[0x03] = b2[0x03] + b2[0x04];
+  b1[0x04] = (b2[0x03] - b2[0x04]) * costab[3];
+
+  b1[0x08] = b2[0x08] + b2[0x0F];
+  b1[0x0F] = (b2[0x0F] - b2[0x08]) * costab[0];
+  b1[0x09] = b2[0x09] + b2[0x0E];
+  b1[0x0E] = (b2[0x0E] - b2[0x09]) * costab[1];
+  b1[0x0A] = b2[0x0A] + b2[0x0D];
+  b1[0x0D] = (b2[0x0D] - b2[0x0A]) * costab[2];
+  b1[0x0B] = b2[0x0B] + b2[0x0C];
+  b1[0x0C] = (b2[0x0C] - b2[0x0B]) * costab[3];
+
+  b1[0x10] = b2[0x10] + b2[0x17];
+  b1[0x17] = (b2[0x10] - b2[0x17]) * costab[0];
+  b1[0x11] = b2[0x11] + b2[0x16];
+  b1[0x16] = (b2[0x11] - b2[0x16]) * costab[1];
+  b1[0x12] = b2[0x12] + b2[0x15];
+  b1[0x15] = (b2[0x12] - b2[0x15]) * costab[2];
+  b1[0x13] = b2[0x13] + b2[0x14];
+  b1[0x14] = (b2[0x13] - b2[0x14]) * costab[3];
+
+  b1[0x18] = b2[0x18] + b2[0x1F];
+  b1[0x1F] = (b2[0x1F] - b2[0x18]) * costab[0];
+  b1[0x19] = b2[0x19] + b2[0x1E];
+  b1[0x1E] = (b2[0x1E] - b2[0x19]) * costab[1];
+  b1[0x1A] = b2[0x1A] + b2[0x1D];
+  b1[0x1D] = (b2[0x1D] - b2[0x1A]) * costab[2];
+  b1[0x1B] = b2[0x1B] + b2[0x1C];
+  b1[0x1C] = (b2[0x1C] - b2[0x1B]) * costab[3];
+ }
+
+ /* Stage 4 (b1 -> b2): butterflies inside each group of 4, using cos8[0]
+  * and cos8[1].  Every second group (0x04, 0x0C, 0x14, 0x1C) takes the
+  * difference as high index minus low index. */
+ {
+  float cos0 = cos8[0];
+  float cos1 = cos8[1];
+
+  b2[0x00] = b1[0x00] + b1[0x03];
+  b2[0x03] = (b1[0x00] - b1[0x03]) * cos0;
+  b2[0x01] = b1[0x01] + b1[0x02];
+  b2[0x02] = (b1[0x01] - b1[0x02]) * cos1;
+
+  b2[0x04] = b1[0x04] + b1[0x07];
+  b2[0x07] = (b1[0x07] - b1[0x04]) * cos0;
+  b2[0x05] = b1[0x05] + b1[0x06];
+  b2[0x06] = (b1[0x06] - b1[0x05]) * cos1;
+
+  b2[0x08] = b1[0x08] + b1[0x0B];
+  b2[0x0B] = (b1[0x08] - b1[0x0B]) * cos0;
+  b2[0x09] = b1[0x09] + b1[0x0A];
+  b2[0x0A] = (b1[0x09] - b1[0x0A]) * cos1;
+  
+  b2[0x0C] = b1[0x0C] + b1[0x0F];
+  b2[0x0F] = (b1[0x0F] - b1[0x0C]) * cos0;
+  b2[0x0D] = b1[0x0D] + b1[0x0E];
+  b2[0x0E] = (b1[0x0E] - b1[0x0D]) * cos1;
+
+  b2[0x10] = b1[0x10] + b1[0x13];
+  b2[0x13] = (b1[0x10] - b1[0x13]) * cos0;
+  b2[0x11] = b1[0x11] + b1[0x12];
+  b2[0x12] = (b1[0x11] - b1[0x12]) * cos1;
+
+  b2[0x14] = b1[0x14] + b1[0x17];
+  b2[0x17] = (b1[0x17] - b1[0x14]) * cos0;
+  b2[0x15] = b1[0x15] + b1[0x16];
+  b2[0x16] = (b1[0x16] - b1[0x15]) * cos1;
+
+  b2[0x18] = b1[0x18] + b1[0x1B];
+  b2[0x1B] = (b1[0x18] - b1[0x1B]) * cos0;
+  b2[0x19] = b1[0x19] + b1[0x1A];
+  b2[0x1A] = (b1[0x19] - b1[0x1A]) * cos1;
+
+  b2[0x1C] = b1[0x1C] + b1[0x1F];
+  b2[0x1F] = (b1[0x1F] - b1[0x1C]) * cos0;
+  b2[0x1D] = b1[0x1D] + b1[0x1E];
+  b2[0x1E] = (b1[0x1E] - b1[0x1D]) * cos1;
+ }
+
+ /* Stage 5 (b2 -> b1): butterflies on adjacent pairs, using cos4[0], with
+  * every second pair taking the difference as high minus low.  Each group
+  * is then recombined by in-place additions; those += lines read values
+  * written just above them, so their order must be kept. */
+ {
+  float cos0 = cos4[0];
+
+  b1[0x00] = b2[0x00] + b2[0x01];
+  b1[0x01] = (b2[0x00] - b2[0x01]) * cos0;
+  b1[0x02] = b2[0x02] + b2[0x03];
+  b1[0x03] = (b2[0x03] - b2[0x02]) * cos0;
+  b1[0x02] += b1[0x03];
+
+  b1[0x04] = b2[0x04] + b2[0x05];
+  b1[0x05] = (b2[0x04] - b2[0x05]) * cos0;
+  b1[0x06] = b2[0x06] + b2[0x07];
+  b1[0x07] = (b2[0x07] - b2[0x06]) * cos0;
+  b1[0x06] += b1[0x07];
+  b1[0x04] += b1[0x06];
+  b1[0x06] += b1[0x05];
+  b1[0x05] += b1[0x07];
+
+  b1[0x08] = b2[0x08] + b2[0x09];
+  b1[0x09] = (b2[0x08] - b2[0x09]) * cos0;
+  b1[0x0A] = b2[0x0A] + b2[0x0B];
+  b1[0x0B] = (b2[0x0B] - b2[0x0A]) * cos0;
+  b1[0x0A] += b1[0x0B];
+
+  b1[0x0C] = b2[0x0C] + b2[0x0D];
+  b1[0x0D] = (b2[0x0C] - b2[0x0D]) * cos0;
+  b1[0x0E] = b2[0x0E] + b2[0x0F];
+  b1[0x0F] = (b2[0x0F] - b2[0x0E]) * cos0;
+  b1[0x0E] += b1[0x0F];
+  b1[0x0C] += b1[0x0E];
+  b1[0x0E] += b1[0x0D];
+  b1[0x0D] += b1[0x0F];
+
+  b1[0x10] = b2[0x10] + b2[0x11];
+  b1[0x11] = (b2[0x10] - b2[0x11]) * cos0;
+  b1[0x12] = b2[0x12] + b2[0x13];
+  b1[0x13] = (b2[0x13] - b2[0x12]) * cos0;
+  b1[0x12] += b1[0x13];
+
+  b1[0x14] = b2[0x14] + b2[0x15];
+  b1[0x15] = (b2[0x14] - b2[0x15]) * cos0;
+  b1[0x16] = b2[0x16] + b2[0x17];
+  b1[0x17] = (b2[0x17] - b2[0x16]) * cos0;
+  b1[0x16] += b1[0x17];
+  b1[0x14] += b1[0x16];
+  b1[0x16] += b1[0x15];
+  b1[0x15] += b1[0x17];
+
+  b1[0x18] = b2[0x18] + b2[0x19];
+  b1[0x19] = (b2[0x18] - b2[0x19]) * cos0;
+  b1[0x1A] = b2[0x1A] + b2[0x1B];
+  b1[0x1B] = (b2[0x1B] - b2[0x1A]) * cos0;
+  b1[0x1A] += b1[0x1B];
+
+  b1[0x1C] = b2[0x1C] + b2[0x1D];
+  b1[0x1D] = (b2[0x1C] - b2[0x1D]) * cos0;
+  b1[0x1E] = b2[0x1E] + b2[0x1F];
+  b1[0x1F] = (b2[0x1F] - b2[0x1E]) * cos0;
+  b1[0x1E] += b1[0x1F];
+  b1[0x1C] += b1[0x1E];
+  b1[0x1E] += b1[0x1D];
+  b1[0x1D] += b1[0x1F];
+ }
+
+ /* Output scatter, part 1: b1[0x00..0x07] go straight to the indices
+  * that are multiples of 4. */
+ dest[ 0] = b1[0x00];
+ dest[ 4] = b1[0x04];
+ dest[ 8] = b1[0x02];
+ dest[12] = b1[0x06];
+// dest[0x10* 0] = b1[0x01];  /* I think this is wrong */
+ dest[16] = b1[0x01];
+ dest[20] = b1[0x05];
+ dest[24] = b1[0x03];
+ dest[28] = b1[0x07];
+
+ /* Part 2: b1[0x08..0x0F] are chained with in-place sums and stored to
+  * indices 2, 6, 10, ..., 30.  Each += uses an element that has not yet
+  * been modified, so the statement order matters. */
+ b1[0x08] += b1[0x0C];
+ dest[2] = b1[0x08];
+ b1[0x0C] += b1[0x0a];
+ dest[6] = b1[0x0C];
+ b1[0x0A] += b1[0x0E];
+ dest[10] = b1[0x0A];
+ b1[0x0E] += b1[0x09];
+ dest[14] = b1[0x0E];
+ b1[0x09] += b1[0x0D];
+ dest[18] = b1[0x09];
+ b1[0x0D] += b1[0x0B];
+ dest[22] = b1[0x0D];
+ b1[0x0B] += b1[0x0F];
+ dest[26] = b1[0x0B];
+ dest[30] = b1[0x0F];
+
+ /* Part 3: b1[0x10..0x1F] are combined the same way into the odd
+  * indices 1, 3, ..., 31. */
+ b1[0x18] += b1[0x1C];
+ dest[1] = b1[0x10] + b1[0x18];
+ dest[3] = b1[0x18] + b1[0x14];
+ b1[0x1C] += b1[0x1a];
+ dest[5] = b1[0x14] + b1[0x1C];
+ dest[7] = b1[0x1C] + b1[0x12];
+ b1[0x1A] += b1[0x1E];
+ dest[9] = b1[0x12] + b1[0x1A];
+ dest[11] = b1[0x1A] + b1[0x16];
+ b1[0x1E] += b1[0x19];
+ dest[13] = b1[0x16] + b1[0x1E];
+ dest[15] = b1[0x1E] + b1[0x11];
+ b1[0x19] += b1[0x1D];
+ dest[17] = b1[0x11] + b1[0x19];
+ dest[19] = b1[0x19] + b1[0x15];
+ b1[0x1D] += b1[0x1B];
+ dest[21] = b1[0x15] + b1[0x1D];
+ dest[23] = b1[0x1D] + b1[0x13];
+ b1[0x1B] += b1[0x1F];
+ dest[25] = b1[0x13] + b1[0x1B];
+ dest[27] = b1[0x1B] + b1[0x17];
+ dest[29] = b1[0x17] + b1[0x1F];
+ dest[31] = b1[0x1F];
+}
+
+/* Ties imdct32_f32_mpglib to imdct32_f32_class; presumably registers it as
+ * an alternative implementation -- confirm against OIL_DEFINE_IMPL. */
+OIL_DEFINE_IMPL (imdct32_f32_mpglib, imdct32_f32_class);
+
+
+
+#ifdef TEST_imdct32_f32
+/*
+ * Self-test for imdct32_f32_FUNC.
+ *
+ * Runs N_PASS rounds; each round fills src with sl_rand_f32_0_1() values,
+ * transforms it with both imdct32_f32_ref and imdct32_f32_FUNC, and sums
+ * the absolute differences (SAD) of the two 32-element outputs.  The
+ * average and the maximum SAD are printed.
+ *
+ * Nothing in this routine increments `failures`, so it always returns 0.
+ */
+int TEST_imdct32_f32(void)
+{
+	/* Scratch buffers: input, reference output, output under test. */
+	f32 *src = sl_malloc_f32(32);
+	f32 *dest_ref = sl_malloc_f32(32);
+	f32 *dest_test = sl_malloc_f32(32);
+	struct sl_profile_struct t;
+	double worst = 0;
+	double total = 0;
+	int failures = 0;
+	int round;
+
+	sl_profile_init(t);
+	srand(20021001);
+
+	printf("I: " sl_stringify(imdct32_f32_FUNC) "\n");
+
+	for(round=0;round<N_PASS;round++){
+		double sad = 0;
+		int k;
+
+		for(k=0;k<32;k++){
+			src[k] = sl_rand_f32_0_1();
+		}
+
+		imdct32_f32_ref(dest_ref,src);
+		/* Only the implementation under test is inside the profiled region. */
+		sl_profile_start(t);
+		imdct32_f32_FUNC(dest_test,src);
+		sl_profile_stop(t);
+
+		for(k=0;k<32;k++){
+			sad += fabs(dest_test[k] - dest_ref[k]);
+#if 0
+			/* Disabled per-element diagnostic. */
+			if(dest_test[k] != dest_ref[k]){
+				printf("%d %g %g\n",k,dest_ref[k], dest_test[k]);
+			}
+#endif
+		}
+		total += sad;
+		if(worst<sad){
+			worst = sad;
+		}
+#if 0
+		/* Disabled per-round diagnostic. */
+		if(sad>0){
+			printf("sad = %g\n",sad);
+		}
+#endif
+	}
+
+	printf("sad ave = %g\n",total/N_PASS);
+	printf("sad max = %g\n",worst);
+
+	sl_free(src);
+	sl_free(dest_ref);
+	sl_free(dest_test);
+
+	if(failures){
+		printf("E: %d failures\n",failures);
+	}
+
+	sl_profile_print(t);
+
+	return failures;
+}
+#endif
+
author	David Schleef <ds@schleef.org>	2004-09-03 21:39:10 +0000
committer	David Schleef <ds@schleef.org>	2004-09-03 21:39:10 +0000
commit	f735872cfb3fe56aa711b6af772bf7789ac0e377 (patch)
tree	ac42a34402fb902b4db098ef47a1bc312b6d72c1 /liboil/dct
parent	27d1241537a0974712dd7a7027dd94b0c76aeb30 (diff)
download	liboil-f735872cfb3fe56aa711b6af772bf7789ac0e377.tar.gz