summaryrefslogtreecommitdiff
path: root/liboil/dct
diff options
context:
space:
mode:
authorDavid Schleef <ds@schleef.org>2004-09-03 21:39:10 +0000
committerDavid Schleef <ds@schleef.org>2004-09-03 21:39:10 +0000
commitf735872cfb3fe56aa711b6af772bf7789ac0e377 (patch)
treeac42a34402fb902b4db098ef47a1bc312b6d72c1 /liboil/dct
parent27d1241537a0974712dd7a7027dd94b0c76aeb30 (diff)
downloadliboil-f735872cfb3fe56aa711b6af772bf7789ac0e377.tar.gz
Move a bunch of files around.
* configure.ac: * liboil/Makefile.am: * liboil/build_prototypes.c: (main): * liboil/copy/Makefile.am: * liboil/copy/copy.h: * liboil/copy/permute.c: * liboil/copy/splat_ref.c: (splat_u8_ref), (splat_u32_ref), (splat_u32_unroll2): * liboil/copy/tablelookup_ref.c: (tablelookup_u8_ref): * liboil/copy/trans8x8.c: (TEST_trans8x8_f64): * liboil/copy/trans8x8_f32.c: (trans8x8_f32_ref), (trans4x4_f32_a16_altivec), (trans8x8_f32_a16_altivec), (TEST_trans8x8_f32): * liboil/copy/trans8x8_s16.c: (trans8x8_s16_ref), (trans8x8_s16_a16_altivec), (trans8x8_s16_altivecwrap), (TEST_trans8x8_s16): * liboil/dct/Makefile.am: * liboil/dct/dct.h: * liboil/dct/dct12_f32.c: (dct12_f32_ref), (dct12_f32_ref1), (dct12_f32_mpglib), (TEST_dct12_f32): * liboil/dct/dct36.c: (dct36): * liboil/dct/dct36_f32.c: (dct36_f32_ref), (TEST_dct36_f32): * liboil/dct/fdct8_f64.c: (fdct8_f64_ref), (fdct8_f64_fast), (TEST_fdct8_f64): * liboil/dct/fdct8x8_f64.c: (fdct8x8_f64_ref), (fdct8x8_f64_ref2), (fdct8x8_f64_1d), (TEST_fdct8x8_f64): * liboil/dct/fdct8x8_s16.c: (fdct8x8_s16_ref), (TEST_fdct8x8_s16): * liboil/dct/fdct8x8s_s16.c: (fdct8x8s_s16_ref), (TEST_fdct8x8s_s16): * liboil/dct/idct8_f64.c: (idct8_f64_ref), (idct8_f64_fastx), (TEST_idct8_f64): * liboil/dct/idct8x8_c.c: (idct8x8_f64_slow), (idct8x8_f64_c), (idct8x8_s16_slow): * liboil/dct/idct8x8_f64.c: (idct8x8_f64_ref), (idct8x8_f64_ref2), (idct8x8_f64_1d), (TEST_idct8x8_f64): * liboil/dct/idct8x8_s16.c: (idct8x8_s16_ref), (idct8x8_s16_fast), (TEST_idct8x8_s16): * liboil/dct/idct8x8s_s16.c: (idct8x8s_s16_ref), (TEST_idct8x8s_s16): * liboil/dct/imdct32_f32.c: (imdct32_f32_ref), (imdct32_f32_mpglib), (TEST_imdct32_f32): * liboil/jpeg/Makefile.am: * liboil/jpeg/idct8_c.c: * liboil/jpeg/idct8x8_c.c: * liboil/jpeg/jpeg.c: * liboil/jpeg/jpeg.h: * liboil/jpeg/jpeg_rgb_decoder.c: * liboil/jpeg/quantize8x8_c.c: * liboil/jpeg/yuv2rgb_c.c: * liboil/jpeg/zigzag8x8_c.c: * liboil/liboilcpu.c: (oil_cpu_i386_getflags): * liboil/liboildebug.c: (oil_debug_print_valist): * liboil/liboilfuncs.h: * liboil/liboilfunction.c: (oil_class_get_by_index), (oil_class_optimize), (oil_init_pointers), (oil_init_structs): * liboil/liboilfunction.h: * liboil/simdpack/Makefile.am: * liboil/simdpack/abs.c: (abs_u8_s8_ref), (abs_u16_s16_ref), (abs_u32_s32_ref): * liboil/simdpack/abs_u32_s32.c: * liboil/simdpack/average2_u8.c: * liboil/simdpack/clip_ref.c: * liboil/simdpack/dct12_f32.c: * liboil/simdpack/dct36.c: * liboil/simdpack/dct36_f32.c: * liboil/simdpack/diffsquaresum_f64.c: * liboil/simdpack/downsample1x_f64.c: * liboil/simdpack/fdct8_f64.c: * liboil/simdpack/fdct8x8_f64.c: * liboil/simdpack/fdct8x8_s16.c: * liboil/simdpack/fdct8x8s_s16.c: * liboil/simdpack/get8x8_f64.c: * liboil/simdpack/idct8_f64.c: * liboil/simdpack/idct8x8_f64.c: * liboil/simdpack/idct8x8_s16.c: * liboil/simdpack/idct8x8s_s16.c: * liboil/simdpack/imdct32_f32.c: * liboil/simdpack/mix_u8.c: * liboil/simdpack/mult8x8_s16.c: * liboil/simdpack/multsum.c: * liboil/simdpack/permute.c: * liboil/simdpack/sad8x8.c: * liboil/simdpack/scalaradd.c: * liboil/simdpack/simdpack.c: * liboil/simdpack/sincos_f64.c: * liboil/simdpack/squaresum_f64.c: * liboil/simdpack/sum_f64.c: * liboil/simdpack/trans8x8.c: * liboil/simdpack/trans8x8_f32.c: * liboil/simdpack/trans8x8_s16.c: * liboil/simdpack/vectoradd_f64.c: * liboil/simdpack/zigzag8x8_s16.c: * testsuite/Makefile.am: * testsuite/abs.c: (test), (main): * testsuite/introspect.c: (main):
Diffstat (limited to 'liboil/dct')
-rw-r--r--liboil/dct/Makefile.am23
-rw-r--r--liboil/dct/dct.h33
-rw-r--r--liboil/dct/dct12_f32.c283
-rw-r--r--liboil/dct/dct36.c114
-rw-r--r--liboil/dct/dct36_f32.c86
-rw-r--r--liboil/dct/fdct8_f64.c236
-rw-r--r--liboil/dct/fdct8x8_f64.c192
-rw-r--r--liboil/dct/fdct8x8_s16.c126
-rw-r--r--liboil/dct/fdct8x8s_s16.c138
-rw-r--r--liboil/dct/idct8_f64.c172
-rw-r--r--liboil/dct/idct8x8_c.c118
-rw-r--r--liboil/dct/idct8x8_f64.c206
-rw-r--r--liboil/dct/idct8x8_s16.c144
-rw-r--r--liboil/dct/idct8x8s_s16.c148
-rw-r--r--liboil/dct/imdct32_f32.c438
15 files changed, 2457 insertions, 0 deletions
diff --git a/liboil/dct/Makefile.am b/liboil/dct/Makefile.am
new file mode 100644
index 0000000..5c223a4
--- /dev/null
+++ b/liboil/dct/Makefile.am
@@ -0,0 +1,23 @@
+
+noinst_LTLIBRARIES = libdct.la
+
+libdct_la_SOURCES = \
+ dct12_f32.c \
+ dct36_f32.c \
+ fdct8_f64.c \
+ idct8_f64.c \
+ idct8x8_c.c \
+ imdct32_f32.c
+
+# fdct8x8_f64.c
+# fdct8x8s_s16.c
+# idct8x8_f64.c
+# idct8x8_s16.c
+# idct8x8s_s16.c
+
+noinst_HEADERS = \
+ dct.h
+
+libdct_la_CFLAGS = $(LIBOIL_CFLAGS)
+
+
diff --git a/liboil/dct/dct.h b/liboil/dct/dct.h
new file mode 100644
index 0000000..04435a1
--- /dev/null
+++ b/liboil/dct/dct.h
@@ -0,0 +1,33 @@
+/* liboil - Library of Optimized Inner Loops
+ * Copyright (C) 2003 David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA.
+ */
+
+#ifndef _LIBOIL_DCT_H_
+#define _LIBOIL_DCT_H_
+
+#include <liboil/liboilfunction.h>
+
+OIL_DECLARE_CLASS(dct12_f32);
+OIL_DECLARE_CLASS(dct36_f32);
+OIL_DECLARE_CLASS(fdct8_f64);
+OIL_DECLARE_CLASS(fdct8x8_f64);
+OIL_DECLARE_CLASS(fdct8x8s_s16);
+OIL_DECLARE_CLASS(idct8_f64);
+OIL_DECLARE_CLASS(imdct32_f32);
+
+#endif
+
diff --git a/liboil/dct/dct12_f32.c b/liboil/dct/dct12_f32.c
new file mode 100644
index 0000000..e486846
--- /dev/null
+++ b/liboil/dct/dct12_f32.c
@@ -0,0 +1,283 @@
+/* liboil - Library of Optimized Inner Loops
+ * Copyright (C) 2003 David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/dct/dct.h>
+#include <math.h>
+
+
+OIL_DEFINE_CLASS_X (dct12_f32, "float *dest, int dstr, float *src, int sstr");
+
+static void
+dct12_f32_ref (float *dest, int dstr, float *src, int sstr)
+{
+ float in0,in1,in2,in3,in4,in5;
+ float *out2 = dest;
+ float *in = src;
+ float COS6_2 = cos(M_PI / 6.0 * 2.0);
+ float COS6_1 = cos(M_PI / 6.0 * 1.0);
+ float tfcos12[3];
+ float wi[12];
+ int i;
+
+ for(i=0;i<3;i++){
+ tfcos12[i] = 0.5 / cos ( M_PI * (i*2.0+1.0) / 12.0 );
+ }
+ for(i=0;i<12;i++){
+ wi[i] = 0.5 * sin( M_PI / 24.0 * (double) (2*i+1) ) / cos ( M_PI * (double) (2*i+7) / 24.0 );
+ }
+
+ out2[12]=out2[13]=out2[14]=out2[15]=out2[16]=out2[17]=0.0;
+
+ in5 = in[5*3];
+ in5 += (in4 = in[4*3]);
+ in4 += (in3 = in[3*3]);
+ in3 += (in2 = in[2*3]);
+ in2 += (in1 = in[1*3]);
+ in1 += (in0 = in[0*3]);
+
+ in5 += in3; in3 += in1;
+
+ in2 *= COS6_1;
+ in3 *= COS6_1;
+
+ {
+ float tmp0,tmp1 = (in0 - in4);
+ {
+ float tmp2 = (in1 - in5) * tfcos12[1];
+ tmp0 = tmp1 + tmp2;
+ tmp1 -= tmp2;
+ }
+ out2[11-1] = tmp0 * wi[11-1];
+ out2[6 +1] = tmp0 * wi[6+1];
+ out2[0+1] += tmp1 * wi[1];
+ out2[5-1] += tmp1 * wi[5-1];
+ }
+
+ in0 += in4 * COS6_2;
+
+ in4 = in0 + in2;
+ in0 -= in2;
+
+ in1 += in5 * COS6_2;
+
+ in5 = (in1 + in3) * tfcos12[0];
+ in1 = (in1 - in3) * tfcos12[2];
+
+ in3 = in4 + in5;
+ in4 -= in5;
+
+ in2 = in0 + in1;
+ in0 -= in1;
+
+ out2[11-0] = in2 * wi[11-0];
+ out2[6 +0] = in2 * wi[6+0];
+ out2[6 +2] = in3 * wi[6+2];
+ out2[11-2] = in3 * wi[11-2];
+
+ out2[0+0] += in0 * wi[0];
+ out2[5-0] += in0 * wi[5-0];
+ out2[0+2] += in4 * wi[2];
+ out2[5-2] += in4 * wi[5-2];
+}
+
+OIL_DEFINE_IMPL_REF (dct12_f32_ref, dct12_f32_class);
+
+
+static void
+dct12_f32_ref1(float *dest, int dstr, float *src, int sstr)
+{
+ int l,m,k;
+ double x;
+ double coeff;
+ double cos_s;
+ double wi[36];
+ int i;
+
+ for(i=0;i<12;i++){
+ wi[i] = sin( M_PI / 12.0 * (i+0.5));
+ }
+ for(;i<36;i++){
+ wi[i] = 0;
+ }
+
+ for(l=0;l<3;l++){
+ for(m=0;m<6;m++){
+ x = 0;
+ for(k=0;k<12;k++){
+ cos_s = cos( (M_PI / 24.0) * (2 * k + 7) *
+ (2 * m + 1) ) / 3;
+ coeff = wi[k] * cos_s;
+ x += coeff * src[k + 6*l + 6];
+ }
+ dest[3*m + l] = x;
+ }
+ }
+}
+OIL_DEFINE_IMPL (dct12_f32_ref1, dct12_f32_class);
+
+/* copyright: from mpglib */
+/*
+ * new DCT12
+ */
+static void
+dct12_f32_mpglib(float *dest, int dstr, float *src, int sstr)
+{
+ float in0,in1,in2,in3,in4,in5;
+ float *out2 = dest;
+ float *in = src;
+ float wi[12];
+ int i;
+ float tmp0, tmp1, tmp2;
+
+ float COS6_2 = cos(M_PI / 6.0 * 2.0);
+ float COS6_1 = cos(M_PI / 6.0 * 1.0);
+ float tfcos12[3];
+
+ for(i=0;i<3;i++){
+ tfcos12[i] = 0.5 / cos ( M_PI * (i*2.0+1.0) / 12.0 );
+ }
+ for(i=0;i<12;i++){
+ wi[i] = 0.5 * sin( M_PI / 24.0 * (double) (2*i+1) ) / cos ( M_PI * (double) (2*i+7) / 24.0 );
+ }
+
+ out2[12]=out2[13]=out2[14]=out2[15]=out2[16]=out2[17]=0.0;
+
+ in5 = in[5*3] + in[4*3];
+ in4 = in[4*3] + in[3*3];
+ in3 = in[3*3] + in[2*3];
+ in2 = in[2*3] + in[1*3];
+ in1 = in[1*3] + in[0*3];
+ in0 = in[0*3];
+
+ //in5 += in3;
+ in5 = in[5*3] + in[4*3] + in[3*3] + in[2*3];
+ //in3 += in1;
+ in3 = in[3*3] + in[2*3] + in[1*3] + in[0*3];
+
+ //in2 *= COS6_1;
+ in2 = COS6_1 * (in[2*3] + in[1*3]);
+
+ //in3 *= COS6_1;
+ in3 = COS6_1 * (in[3*3] + in[2*3] + in[1*3] + in[0*3]);
+
+ //tmp1 = (in0 - in4);
+ tmp1 = in[0*3] - in[4*3] - in[3*3];
+
+ //tmp2 = (in1 - in5) * tfcos12[1];
+ tmp2 = tfcos12[1] * (in[1*3] + in[0*3] - in[5*3] - in[4*3] - in[3*3] - in[2*3]);
+
+ //tmp0 = tmp1 + tmp2;
+ tmp0 = (1 + tfcos12[1]) * (in[0*3] - in[4*3] - in[3*3]) +
+ tfcos12[1] * (in[1*3] - in[5*3] - in[2*3]);
+
+ //tmp1 -= tmp2;
+ tmp1 = (1 - tfcos12[1]) * (in[0*3] - in[4*3] - in[3*3]) -
+ tfcos12[1] * (in[1*3] - in[5*3] - in[2*3]);
+
+ out2[11-1] = tmp0 * wi[11-1];
+ out2[6 +1] = tmp0 * wi[6+1];
+ out2[0+1] += tmp1 * wi[1];
+ out2[5-1] += tmp1 * wi[5-1];
+
+ in0 += in4 * COS6_2;
+
+ in4 = in0 + in2;
+ in0 -= in2;
+
+ in1 += in5 * COS6_2;
+
+ in5 = (in1 + in3) * tfcos12[0];
+ in1 = (in1 - in3) * tfcos12[2];
+
+ in3 = in4 + in5;
+ in4 = in4 - in5;
+
+ in2 = in0 + in1;
+ in0 = in0 - in1;
+
+ out2[11-0] = in2 * wi[11-0];
+ out2[6 +0] = in2 * wi[6+0];
+ out2[6 +2] = in3 * wi[6+2];
+ out2[11-2] = in3 * wi[11-2];
+
+ out2[0+0] += in0 * wi[0];
+ out2[5-0] += in0 * wi[5-0];
+ out2[0+2] += in4 * wi[2];
+ out2[5-2] += in4 * wi[5-2];
+}
+
+OIL_DEFINE_IMPL (dct12_f32_mpglib, dct12_f32_class);
+
+
+
+#ifdef TEST_dct12_f32
+int TEST_dct12_f32(void)
+{
+ int i;
+ int pass;
+ int failures = 0;
+ float *src, *dest_ref, *dest_test;
+ struct sl_profile_struct t;
+
+ src = sl_malloc_f32(18);
+ dest_ref = sl_malloc_f32(18);
+ dest_test = sl_malloc_f32(18);
+
+ sl_profile_init(t);
+ srand(20021001);
+
+ printf("I: " sl_stringify(dct12_f32_FUNC) "\n");
+
+ for(pass=0;pass<1;pass++){
+ for(i=0;i<18;i++){
+ src[i]=sl_rand_f32_0_1();
+ //src[i]=(i==pass+6);
+ }
+
+ dct12_f32_ref(dest_ref,src);
+ sl_profile_start(t);
+ dct12_f32_FUNC(dest_test,src);
+ sl_profile_stop(t);
+
+ for(i=0;i<18;i++){
+ if(dest_test[i] != dest_ref[i]){
+ printf("%d %g %g %g\n",i,src[i],dest_ref[i],
+ dest_test[i]);
+ }
+ }
+ }
+
+ sl_free(src);
+ sl_free(dest_ref);
+ sl_free(dest_test);
+
+ if(failures){
+ printf("E: %d failures\n",failures);
+ }
+
+ sl_profile_print(t);
+
+ return failures;
+}
+#endif
+
diff --git a/liboil/dct/dct36.c b/liboil/dct/dct36.c
new file mode 100644
index 0000000..9cfc885
--- /dev/null
+++ b/liboil/dct/dct36.c
@@ -0,0 +1,114 @@
+/*
+ DCT insipired by Jeff Tsay's DCT from the maplay package
+ this is an optimized version with manual unroll.
+
+ References:
+ [1] S. Winograd: "On Computing the Discrete Fourier Transform",
+ Mathematics of Computation, Volume 32, Number 141, January 1978,
+ Pages 175-199
+*/
+
+static void dct36(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf)
+{
+ {
+ real *in = inbuf;
+
+ in[17]+=in[16]; in[16]+=in[15]; in[15]+=in[14];
+ in[14]+=in[13]; in[13]+=in[12]; in[12]+=in[11];
+ in[11]+=in[10]; in[10]+=in[9]; in[9] +=in[8];
+ in[8] +=in[7]; in[7] +=in[6]; in[6] +=in[5];
+ in[5] +=in[4]; in[4] +=in[3]; in[3] +=in[2];
+ in[2] +=in[1]; in[1] +=in[0];
+
+ in[17]+=in[15]; in[15]+=in[13]; in[13]+=in[11]; in[11]+=in[9];
+ in[9] +=in[7]; in[7] +=in[5]; in[5] +=in[3]; in[3] +=in[1];
+
+
+ {
+
+#define MACRO0(v) { \
+ real tmp; \
+ out2[9+(v)] = (tmp = sum0 + sum1) * w[27+(v)]; \
+ out2[8-(v)] = tmp * w[26-(v)]; } \
+ sum0 -= sum1; \
+ ts[SBLIMIT*(8-(v))] = out1[8-(v)] + sum0 * w[8-(v)]; \
+ ts[SBLIMIT*(9+(v))] = out1[9+(v)] + sum0 * w[9+(v)];
+#define MACRO1(v) { \
+ real sum0,sum1; \
+ sum0 = tmp1a + tmp2a; \
+ sum1 = (tmp1b + tmp2b) * tfcos36[(v)]; \
+ MACRO0(v); }
+#define MACRO2(v) { \
+ real sum0,sum1; \
+ sum0 = tmp2a - tmp1a; \
+ sum1 = (tmp2b - tmp1b) * tfcos36[(v)]; \
+ MACRO0(v); }
+
+ const real *c = COS9;
+ real *out2 = o2;
+ real *w = wintab;
+ real *out1 = o1;
+ real *ts = tsbuf;
+
+ real ta33,ta66,tb33,tb66;
+
+ ta33 = in[2*3+0] * c[3];
+ ta66 = in[2*6+0] * c[6];
+ tb33 = in[2*3+1] * c[3];
+ tb66 = in[2*6+1] * c[6];
+
+ {
+ real tmp1a,tmp2a,tmp1b,tmp2b;
+ tmp1a = in[2*1+0] * c[1] + ta33 + in[2*5+0] * c[5] + in[2*7+0] * c[7];
+ tmp1b = in[2*1+1] * c[1] + tb33 + in[2*5+1] * c[5] + in[2*7+1] * c[7];
+ tmp2a = in[2*0+0] + in[2*2+0] * c[2] + in[2*4+0] * c[4] + ta66 + in[2*8+0] * c[8];
+ tmp2b = in[2*0+1] + in[2*2+1] * c[2] + in[2*4+1] * c[4] + tb66 + in[2*8+1] * c[8];
+
+ MACRO1(0);
+ MACRO2(8);
+ }
+
+ {
+ real tmp1a,tmp2a,tmp1b,tmp2b;
+ tmp1a = ( in[2*1+0] - in[2*5+0] - in[2*7+0] ) * c[3];
+ tmp1b = ( in[2*1+1] - in[2*5+1] - in[2*7+1] ) * c[3];
+ tmp2a = ( in[2*2+0] - in[2*4+0] - in[2*8+0] ) * c[6] - in[2*6+0] + in[2*0+0];
+ tmp2b = ( in[2*2+1] - in[2*4+1] - in[2*8+1] ) * c[6] - in[2*6+1] + in[2*0+1];
+
+ MACRO1(1);
+ MACRO2(7);
+ }
+
+ {
+ real tmp1a,tmp2a,tmp1b,tmp2b;
+ tmp1a = in[2*1+0] * c[5] - ta33 - in[2*5+0] * c[7] + in[2*7+0] * c[1];
+ tmp1b = in[2*1+1] * c[5] - tb33 - in[2*5+1] * c[7] + in[2*7+1] * c[1];
+ tmp2a = in[2*0+0] - in[2*2+0] * c[8] - in[2*4+0] * c[2] + ta66 + in[2*8+0] * c[4];
+ tmp2b = in[2*0+1] - in[2*2+1] * c[8] - in[2*4+1] * c[2] + tb66 + in[2*8+1] * c[4];
+
+ MACRO1(2);
+ MACRO2(6);
+ }
+
+ {
+ real tmp1a,tmp2a,tmp1b,tmp2b;
+ tmp1a = in[2*1+0] * c[7] - ta33 + in[2*5+0] * c[1] - in[2*7+0] * c[5];
+ tmp1b = in[2*1+1] * c[7] - tb33 + in[2*5+1] * c[1] - in[2*7+1] * c[5];
+ tmp2a = in[2*0+0] - in[2*2+0] * c[4] + in[2*4+0] * c[8] + ta66 - in[2*8+0] * c[2];
+ tmp2b = in[2*0+1] - in[2*2+1] * c[4] + in[2*4+1] * c[8] + tb66 - in[2*8+1] * c[2];
+
+ MACRO1(3);
+ MACRO2(5);
+ }
+
+ {
+ real sum0,sum1;
+ sum0 = in[2*0+0] - in[2*2+0] + in[2*4+0] - in[2*6+0] + in[2*8+0];
+ sum1 = (in[2*0+1] - in[2*2+1] + in[2*4+1] - in[2*6+1] + in[2*8+1] ) * tfcos36[4];
+ MACRO0(4);
+ }
+ }
+
+ }
+}
+
diff --git a/liboil/dct/dct36_f32.c b/liboil/dct/dct36_f32.c
new file mode 100644
index 0000000..061c84d
--- /dev/null
+++ b/liboil/dct/dct36_f32.c
@@ -0,0 +1,86 @@
+/* liboil - Library of Optimized Inner Loops
+ * Copyright (C) 2003 David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/dct/dct.h>
+
+OIL_DEFINE_CLASS_X(dct36_f32, "float *dest, int dstr, float *src, int sstr, int n");
+
+static void
+dct36_f32_ref(float *dest, int dstr, float *src, int sstr, int n)
+{
+ int i;
+ for(i=0;i<n;i++){
+ dest[i] = src[i];
+ }
+}
+
+OIL_DEFINE_IMPL_REF (dct36_f32_ref, dct36_f32_class);
+
+#ifdef TEST_dct36_f32
+int TEST_dct36_f32(void)
+{
+ int i;
+ int pass;
+ int failures = 0;
+ f64 *src, *dest_ref, *dest_test;
+ struct sl_profile_struct t;
+
+ src = sl_malloc_f64(N);
+ dest_ref = sl_malloc_f64(N);
+ dest_test = sl_malloc_f64(N);
+
+ sl_profile_init(t);
+ srand(20021001);
+
+ printf("I: " sl_stringify(dct36_f32_FUNC) "\n");
+
+ for(pass=0;pass<N_PASS;pass++){
+ for(i=0;i<N;i++)src[i]=sl_rand_f64_s16();
+
+ dct36_f32_ref(dest_ref,src,N);
+ sl_profile_start(t);
+ dct36_f32_FUNC(dest_test,src,N);
+ sl_profile_stop(t);
+
+ for(i=0;i<N;i++){
+ if(dest_test[i] != dest_ref[i]){
+ printf("%d %g %g %g\n",i,src[i],dest_ref[i],
+ dest_test[i]);
+ }
+ }
+ }
+
+ sl_free(src);
+ sl_free(dest_ref);
+ sl_free(dest_test);
+
+ if(failures){
+ printf("E: %d failures\n",failures);
+ }
+
+ sl_profile_print(t);
+
+ return failures;
+}
+#endif
+
diff --git a/liboil/dct/fdct8_f64.c b/liboil/dct/fdct8_f64.c
new file mode 100644
index 0000000..3fc93a0
--- /dev/null
+++ b/liboil/dct/fdct8_f64.c
@@ -0,0 +1,236 @@
+/* liboil - Library of Optimized Inner Loops
+ * Copyright (C) 2003 David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/dct/dct.h>
+#include <math.h>
+
+OIL_DEFINE_CLASS_X (fdct8_f64, "double *dest, double *src, int dstr, int sstr");
+
+#define C0_9808 0.980785280
+#define C0_9239 0.923879532
+#define C0_8315 0.831469612
+#define C0_7071 0.707106781
+#define C0_5556 0.555570233
+#define C0_3827 0.382683432
+#define C0_1951 0.195090322
+
+static void
+fdct8_f64_ref (double *dest, double *src, int dstr, int sstr)
+{
+ static double fdct_coeff[8][8];
+ static int fdct_coeff_init = 0;
+ int i,j;
+ double x;
+
+ if(!fdct_coeff_init){
+ double scale;
+
+ for(i=0;i<8;i++){
+ scale = (i==0) ? sqrt(0.125) : 0.5;
+ for(j=0;j<8;j++){
+ fdct_coeff[j][i] = scale *
+ cos((M_PI/8)*i*(j+0.5));
+ }
+ }
+ fdct_coeff_init = 1;
+ }
+
+ for(i=0;i<8;i++){
+ x = 0;
+ for(j=0;j<8;j++){
+ x += fdct_coeff[j][i] * OIL_GET(src,sstr*j, double);
+ }
+ OIL_GET(dest,dstr*i, double) = x;
+ }
+}
+
+OIL_DEFINE_IMPL_REF (fdct8_f64_ref, fdct8_f64_class);
+
+/*
+ * This algorithm is roughly similar to a Fast-Fourier transform,
+ * taking advantage of the symmeties of the base vectors. For
+ * reference, the base vectors are (horizontally):
+ *
+ * 0: 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000
+ * 1: 0.9808 0.8315 0.5556 0.1951 -0.1951 -0.5556 -0.8315 -0.9808
+ * 2: 0.9239 0.3827 -0.3827 -0.9239 -0.9239 -0.3827 0.3827 0.9239
+ * 3: 0.8315 -0.1951 -0.9808 -0.5556 0.5556 0.9808 0.1951 -0.8315
+ * 4: 0.7071 -0.7071 -0.7071 0.7071 0.7071 -0.7071 -0.7071 0.7071
+ * 5: 0.5556 -0.9808 0.1951 0.8315 -0.8315 -0.1951 0.9808 -0.5556
+ * 6: 0.3827 -0.9239 0.9239 -0.3827 -0.3827 0.9239 -0.9239 0.3827
+ * 7: 0.1951 -0.5556 0.8315 -0.9808 0.9808 -0.8315 0.5556 -0.1951
+ *
+ * The symmetries of note:
+ * - even vectors are symmetric around 4 (the middle)
+ * - odd vectors are antisymmetric around 4
+ * - 0,4 are symmertic around 2 and 6
+ * - 2,6 are antisymmetic around 2 and 6
+ *
+ * f0 = (x0 + x7) + (x1 + x6) + (x2 + x5) + (x3 + x4);
+ * f1 = 0.9808*(x0 - x7) + 0.8315*(x1 - x6) + 0.5556*(x2 - x5) + 0.1951*(x3 - x4)
+ * f2 = 0.9239*(x0 + x7) + 0.3827*(x1 + x6) - 0.3827*(x2 + x5) - 0.9239*(x3 + x4)
+ * f3 = 0.8315*(x0 - x7) - 0.1951*(x1 - x6) - 0.9808*(x2 - x5) - 0.5556*(x3 - x4)
+ * f4 = 0.7071*((x0 + x7) - (x1 + x6) - (x2 + x5) + (x3 + x4))
+ * f5 = 0.5556*(x0 - x7) - 0.9808*(x1 - x6) + 0.1951*(x2 - x5) + 0.8315*(x3 - x4)
+ * f6 = 0.3827*(x0 + x7) - 0.9239*(x1 + x6) + 0.9239*(x2 + x5) - 0.3827*(x3 + x4)
+ * f7 = 0.1951*(x0 - x7) - 0.5556*(x1 - x6) + 0.8315*(x2 - x5) - 0.9808*(x3 - x4)
+ *
+ * The even vectors can be further simplified:
+ *
+ * f0 = ((x0 + x7) + (x3 + x4)) + ((x1 + x6) + (x2 + x5));
+ * f2 = 0.9239*((x0 + x7) - (x3 + x4)) + 0.3827*((x1 + x6) - (x2 + x5))
+ * f4 = 0.7071*((x0 + x7) + (x3 + x4)) - 0.7071*((x1 + x6) + (x2 + x5))
+ * f6 = 0.3827*((x0 + x7) - (x3 + x4)) - 0.9239*((x1 + x6) - (x2 + x5))
+ *
+ * Some implementations move some of the normalization to a later
+ * stage of processing, saving a few multiplies which get absorbed
+ * into later multiplies. However, this incurs a bit of error in
+ * the integer versions of this function. Also, if the CPU has a
+ * multiply-and-add function, you don't gain anything.
+ */
+
+static void
+fdct8_f64_fast(double *dest, double *src, int dstr, int sstr)
+{
+ double s07, s16, s25, s34;
+ double d07, d16, d25, d34;
+ double ss07s34, ds07s34, ss16s25, ds16s25;
+
+ s07 = OIL_GET(src,sstr*0,double) + OIL_GET(src,sstr*7,double);
+ d07 = OIL_GET(src,sstr*0,double) - OIL_GET(src,sstr*7,double);
+ s16 = OIL_GET(src,sstr*1,double) + OIL_GET(src,sstr*6,double);
+ d16 = OIL_GET(src,sstr*1,double) - OIL_GET(src,sstr*6,double);
+ s25 = OIL_GET(src,sstr*2,double) + OIL_GET(src,sstr*5,double);
+ d25 = OIL_GET(src,sstr*2,double) - OIL_GET(src,sstr*5,double);
+ s34 = OIL_GET(src,sstr*3,double) + OIL_GET(src,sstr*4,double);
+ d34 = OIL_GET(src,sstr*3,double) - OIL_GET(src,sstr*4,double);
+
+ ss07s34 = s07 + s34;
+ ds07s34 = s07 - s34;
+ ss16s25 = s16 + s25;
+ ds16s25 = s16 - s25;
+
+ OIL_GET(dest,dstr*0,double) = 0.5*C0_7071*(ss07s34 + ss16s25);
+ OIL_GET(dest,dstr*2,double) = 0.5*(C0_9239*ds07s34 + C0_3827*ds16s25);
+ OIL_GET(dest,dstr*4,double) = 0.5*C0_7071*(ss07s34 - ss16s25);
+ OIL_GET(dest,dstr*6,double) = 0.5*(C0_3827*ds07s34 - C0_9239*ds16s25);
+
+ OIL_GET(dest,dstr*1,double) = 0.5*(C0_9808*d07 + C0_8315*d16
+ + C0_5556*d25 + C0_1951*d34);
+ OIL_GET(dest,dstr*3,double) = 0.5*(C0_8315*d07 - C0_1951*d16
+ - C0_9808*d25 - C0_5556*d34);
+ OIL_GET(dest,dstr*5,double) = 0.5*(C0_5556*d07 - C0_9808*d16
+ + C0_1951*d25 + C0_8315*d34);
+ OIL_GET(dest,dstr*7,double) = 0.5*(C0_1951*d07 - C0_5556*d16
+ + C0_8315*d25 - C0_9808*d34);
+
+#if 0
+ z1 = (ds1 tmp12 + tmp13) * 0.707106781;
+ OIL_GET(dest,dstr*2,double) = (tmp13 + z1)*(0.25*M_SQRT2)*0.765366864;
+ OIL_GET(dest,dstr*6,double) = (tmp13 - z1)*(0.25*M_SQRT2)*1.847759066;
+
+ tmp10 = tmp4 + tmp5;
+ tmp11 = tmp5 + tmp6;
+ tmp12 = tmp6 + tmp7;
+
+ z5 = (tmp10 - tmp12) * 0.382683433;
+ z2 = 0.541196100 * tmp10 + z5;
+ z4 = 1.306562965 * tmp12 + z5;
+ z3 = tmp11 * 0.707106781;
+
+ z11 = tmp7 + z3;
+ z13 = tmp7 - z3;
+
+ OIL_GET(dest,dstr*5,double) = (z13 + z2)*(0.25*M_SQRT2)*1.2728;
+ OIL_GET(dest,dstr*3,double) = (z13 - z2)*(0.25*M_SQRT2)*0.8504;
+ OIL_GET(dest,dstr*1,double) = (z11 + z4)*(0.25*M_SQRT2)*0.7209;
+ OIL_GET(dest,dstr*7,double) = (z11 - z4)*(0.25*M_SQRT2)*3.6245;
+#endif
+}
+OIL_DEFINE_IMPL (fdct8_f64_fast, fdct8_f64_class);
+
+
+
+
+#ifdef TEST_fdct8_f64
+int TEST_fdct8_f64(void)
+{
+ int i;
+ int pass;
+ int failures = 0;
+ double *src, *dest_ref, *dest_test;
+ double sad;
+ double sad_sum;
+ double sad_max;
+ struct sl_profile_struct t;
+
+ src = sl_malloc_f64(8);
+ dest_ref = sl_malloc_f64(8);
+ dest_test = sl_malloc_f64(8);
+
+ sl_profile_init(t);
+ srand(20020306);
+
+ sad_sum = 0;
+ sad_max = 0;
+
+ printf("I: " sl_stringify(fdct8_f64_FUNC) "\n");
+
+ for(pass=0;pass<N_PASS;pass++){
+ for(i=0;i<8;i++)src[i] = sl_rand_f64_0_1();
+
+ //block8_dump(src);
+
+ fdct8_f64_ref(dest_test, src, 8, 8);
+ //block8_dump(dest_test);
+
+ sl_profile_start(t);
+ fdct8_f64_FUNC(dest_ref, src, 8, 8);
+ sl_profile_stop(t);
+ //block8_dump(dest_ref);
+
+ sad = 0;
+ for(i=0;i<8;i++)sad += fabs(dest_test[i] - dest_ref[i]);
+ if(sad_max<sad)sad_max = sad;
+ sad_sum += sad;
+ if(sad >= 1.0){
+ failures++;
+ }
+ }
+ printf("sad average: %g\n",sad_sum/N_PASS);
+ printf("sad max: %g\n",sad_max);
+
+ sl_free(src);
+ sl_free(dest_ref);
+ sl_free(dest_test);
+
+ if(failures){
+ printf("E: %d failures\n",failures);
+ }
+
+ sl_profile_print(t);
+
+ return failures;
+}
+#endif
+
diff --git a/liboil/dct/fdct8x8_f64.c b/liboil/dct/fdct8x8_f64.c
new file mode 100644
index 0000000..e59b455
--- /dev/null
+++ b/liboil/dct/fdct8x8_f64.c
@@ -0,0 +1,192 @@
+/* liboil - Library of Optimized Inner Loops
+ * Copyright (C) 2003 David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/liboilfuncs.h>
+#include <liboil/dct/dct.h>
+#include <math.h>
+
+
+OIL_DEFINE_CLASS(fdct8x8_f64, NULL);
+
+
+static void fdct8x8_f64_ref(double *dest, int dstr, double *src, int sstr)
+{
+ static double fdct_coeff[8][8];
+ static int fdct_coeff_init = 0;
+ int i,j,k,l;
+ double tmp1,tmp2;
+
+ if(!fdct_coeff_init){
+ double scale;
+
+ for(i=0;i<8;i++){
+ scale = (i==0) ? sqrt(0.125) : 0.5;
+ for(j=0;j<8;j++){
+ fdct_coeff[j][i] = scale *
+ cos((M_PI/8)*i*(j+0.5));
+ }
+ }
+ fdct_coeff_init = 1;
+ }
+
+ for(i=0;i<8;i++){
+ for(j=0;j<8;j++){
+ tmp1 = 0;
+ for(k=0;k<8;k++){
+ tmp2 = 0;
+ for(l=0;l<8;l++){
+ tmp2 += fdct_coeff[l][j] *
+ OIL_GET (src, sstr*k + l, double);
+ }
+ tmp1 += fdct_coeff[k][i] * tmp2;
+ }
+ OIL_GET (dest, dstr*i + j, double) = tmp1;
+ }
+ }
+}
+
+OIL_DEFINE_IMPL_REF (fdct8x8_f64_ref, fdct8x8_f64_class);
+
+static void
+fdct8x8_f64_ref2(double *dest, int dstr, double *src, int sstr)
+{
+ static double fdct_coeff[8][8];
+ static int fdct_coeff_init = 0;
+ int i,j,k;
+ double x;
+ double tmp[64];
+
+ if(!fdct_coeff_init){
+ double scale;
+
+ for(i=0;i<8;i++){
+ scale = (i==0) ? sqrt(0.125) : 0.5;
+ for(j=0;j<8;j++){
+ fdct_coeff[j][i] = scale *
+ cos((M_PI/8)*i*(j+0.5));
+ }
+ }
+ fdct_coeff_init = 1;
+ }
+
+ for(i=0;i<8;i++){
+ for(j=0;j<8;j++){
+ x = 0;
+ for(k=0;k<8;k++){
+ x += fdct_coeff[k][j] *
+ OIL_GET (src, sstr*i + k, double);
+ }
+ tmp[8*i+j] = x;
+ }
+ }
+
+ for(j=0;j<8;j++){
+ for(i=0;i<8;i++){
+ x = 0;
+ for(k=0;k<8;k++){
+ x += fdct_coeff[k][i] * tmp[8*k + j];
+ }
+ OIL_GET (dest,dstr*i+j, double) = x;
+ }
+ }
+}
+
+OIL_DEFINE_IMPL (fdct8x8_f64_ref2, fdct8x8_f64_class);
+
+
+static void
+fdct8x8_f64_1d (double *dest, int dstr, double *src, int sstr)
+{
+ int i;
+ double tmp[64];
+
+ for(i=0;i<8;i++){
+ fdct8_f64(tmp + i*8, sizeof(double), OIL_OFFSET(src,sstr*i),
+ sizeof(double));
+ }
+
+ for(i=0;i<8;i++){
+ fdct8_f64(dest + i, dstr, tmp + i, 8*sizeof(double));
+ }
+}
+
+OIL_DEFINE_IMPL (fdct8x8_f64_1d, fdct8x8_f64_class);
+
+
+#ifdef TEST_fdct8x8_f64
+int TEST_fdct8x8_f64(void)
+{
+ int i;
+ int pass;
+ int failures = 0;
+ double *src, *dest_ref, *dest_test;
+ double sad;
+ double sad_sum;
+ double sad_max;
+ struct sl_profile_struct t;
+
+ src = sl_malloc_f64(64);
+ dest_ref = sl_malloc_f64(64);
+ dest_test = sl_malloc_f64(64);
+
+ sl_profile_init(t);
+ srand(20020306);
+
+ sad_sum = 0;
+ sad_max = 0;
+
+ printf("I: " sl_stringify(fdct8x8_f64_FUNC) "\n");
+
+ for(pass=0;pass<N_PASS;pass++){
+ for(i=0;i<64;i++)src[i] = sl_rand_f64_0_1();
+
+ fdct8x8_f64_ref(dest_test, src, 8, 8);
+ sl_profile_start(t);
+ fdct8x8_f64_FUNC(dest_ref, src, 8, 8);
+ sl_profile_stop(t);
+
+ sad = 0;
+ for(i=0;i<64;i++)sad += fabs(dest_test[i] - dest_ref[i]);
+ if(sad_max<sad)sad_max = sad;
+ sad_sum += sad;
+ if(sad >= 1.0){
+ failures++;
+ }
+ }
+ printf("sad average: %g\n",sad_sum/N_PASS);
+ printf("sad max: %g\n",sad_max);
+
+ sl_free(src);
+ sl_free(dest_ref);
+ sl_free(dest_test);
+
+ if(failures){
+ printf("E: %d failures\n",failures);
+ }
+
+ sl_profile_print(t);
+
+ return failures;
+}
+#endif
+
diff --git a/liboil/dct/fdct8x8_s16.c b/liboil/dct/fdct8x8_s16.c
new file mode 100644
index 0000000..4b8f045
--- /dev/null
+++ b/liboil/dct/fdct8x8_s16.c
@@ -0,0 +1,126 @@
+/* forward discrete cosine transform on 8x8 block
+ * Copyright (C) 2001,2002 David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+Kernel: fdct8x8_s16
+Description: inverse discrete cosine transform on 8x8 block
+
+XXX
+*/
+
+#ifndef _fdct8x8_s16_h_
+#define _fdct8x8_s16_h_
+
+#include <math.h>
+
+#include <sl_types.h>
+#include <sl_block8x8.h>
+
+/* storage class */
+#ifndef SL_fdct8x8_s16_storage
+ #ifdef SL_storage
+ #define SL_fdct8x8_s16_storage SL_storage
+ #else
+ #define SL_fdct8x8_s16_storage static inline
+ #endif
+#endif
+
+
+
+#include <fdct8x8_f64.h>
+#include <conv8x8_f64_s16.h>
+/* IMPL fdct8x8_s16_ref */
+SL_fdct8x8_s16_storage
+void fdct8x8_s16_ref(s16 *dest, s16 *src, int dstr, int sstr)
+{
+ f64 s[64], d[64];
+ int i,j;
+
+ for(i=0;i<8;i++){
+ for(j=0;j<8;j++){
+ block8x8_f64(s,8*sizeof(f64),i,j) =
+ block8x8_s16(src,sstr,i,j);
+ }
+ }
+
+ fdct8x8_f64(d,s,8*sizeof(f64),8*sizeof(f64));
+ conv8x8_f64_s16(dest,d,dstr,8*sizeof(f64));
+}
+#endif
+
+#ifdef TEST_fdct8x8_s16
+int TEST_fdct8x8_s16(void)
+{
+ int i;
+ int pass;
+ int failures = 0;
+ s16 *src, *dest_ref, *dest_test;
+ u32 sad;
+ u32 sad_sum;
+ u32 sad_max;
+ struct sl_profile_struct t;
+
+ src = sl_malloc_s16(64);
+ dest_ref = sl_malloc_s16(64);
+ dest_test = sl_malloc_s16(64);
+
+ sl_profile_init(t);
+ srand(20020306);
+
+ sad_sum = 0;
+ sad_max = 0;
+
+ printf("I: " sl_stringify(fdct8x8_s16_FUNC) "\n");
+
+ for(pass=0;pass<N_PASS;pass++){
+ for(i=0;i<64;i++)src[i] = sl_rand_s16_l9();
+
+ fdct8x8_s16_ref(dest_ref, src, 8*sizeof(s16), 8*sizeof(s16));
+ sl_profile_start(t);
+ fdct8x8_s16_FUNC(dest_test, src, 8*sizeof(s16), 8*sizeof(s16));
+ sl_profile_stop(t);
+
+ sad = 0;
+ for(i=0;i<64;i++)sad += abs(dest_test[i] - dest_ref[i]);
+ if(sad_max<sad)sad_max = sad;
+ sad_sum += sad;
+ if(sad >= 64){
+ block8x8_dump_s16(src, 8*sizeof(s16));
+ block8x8_dump_s16(dest_test, 8*sizeof(s16));
+ block8x8_dump_s16(dest_ref, 8*sizeof(s16));
+ failures++;
+ }
+ }
+ printf("sad average: %g\n",((double)sad_sum)/N_PASS);
+ printf("sad max: %d\n",sad_max);
+
+ sl_free(src);
+ sl_free(dest_ref);
+ sl_free(dest_test);
+
+ if(failures){
+ printf("E: %d failures\n",failures);
+ }
+
+ sl_profile_print(t);
+
+ return failures;
+}
+#endif
+
diff --git a/liboil/dct/fdct8x8s_s16.c b/liboil/dct/fdct8x8s_s16.c
new file mode 100644
index 0000000..4176298
--- /dev/null
+++ b/liboil/dct/fdct8x8s_s16.c
@@ -0,0 +1,138 @@
+/* liboil - Library of Optimized Inner Loops
+ * Copyright (C) 2003 David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/dct/dct.h>
+
+
+OIL_DEFINE_CLASS(fdct8x8s_s16, NULL);
+
+#define C0_9808 0.980785280
+#define C0_9239 0.923879532
+#define C0_8315 0.831469612
+#define C0_7071 0.707106781
+#define C0_5556 0.555570233
+#define C0_3827 0.382683432
+#define C0_1951 0.195090322
+
+/*
+Alternate scaling used by RTjpeg.
+*/
+
+
+
+static void
+fdct8x8s_s16_ref (int16_t *dest, int dstr, int16_t *src, int sstr)
+{
+ double s[64], d[64];
+ double scale[8] = {
+ 4*C0_7071,
+ 4*C0_9808,
+ 4*C0_9239,
+ 4*C0_8315,
+ 4*C0_7071,
+ 4*C0_5556,
+ 4*C0_3827,
+ 4*C0_1951,
+ };
+ int i,j;
+
+ for(i=0;i<8;i++){
+ for(j=0;j<8;j++){
+ s[8*i+j] = OIL_GET (src,sstr*i+j, int16_t);
+ }
+ }
+
+ fdct8x8_f64(d,8*sizeof(double),s,8*sizeof(double));
+
+ for(i=0;i<8;i++){
+ for(j=0;j<8;j++){
+ d[8*i+j] *= scale[i] * scale[j];
+ }
+ }
+
+ conv8x8_s16_f64(dest,dstr,d,8*sizeof(double));
+}
+
+OIL_DEFINE_IMPL_REF (fdct8x8s_s16_ref, fdct8x8s_s16_class);
+
+#ifdef TEST_fdct8x8s_s16
+int TEST_fdct8x8s_s16(void)
+{
+ int i;
+ int pass;
+ int failures = 0;
+ int16_t *src, *dest_ref, *dest_test;
+ u32 sad;
+ u32 sad_sum;
+ u32 sad_max;
+ struct sl_profile_struct t;
+
+ src = sl_malloc_s16(64);
+ dest_ref = sl_malloc_s16(64);
+ dest_test = sl_malloc_s16(64);
+
+ sl_profile_init(t);
+ srand(20020306);
+
+ sad_sum = 0;
+ sad_max = 0;
+
+ printf("I: " sl_stringify(fdct8x8s_s16_FUNC) "\n");
+
+ for(pass=0;pass<N_PASS;pass++){
+ for(i=0;i<64;i++)src[i] = sl_rand_s16_l9();
+
+ fdct8x8s_s16_ref(dest_ref, src, 8*sizeof(s16), 8*sizeof(s16));
+ sl_profile_start(t);
+ fdct8x8s_s16_FUNC(dest_test, src, 8*sizeof(s16), 8*sizeof(s16));
+ sl_profile_stop(t);
+
+ sad = 0;
+ for(i=0;i<64;i++)sad += abs(dest_test[i] - dest_ref[i]);
+ if(sad_max<sad)sad_max = sad;
+ sad_sum += sad;
+ if(sad >= 128){
+ //block8x8_dump_s16(src, 8*sizeof(s16));
+ //block8x8_dump_s16(dest_ref, 8*sizeof(s16));
+ //block8x8_dump_s16(dest_test, 8*sizeof(s16));
+ //block8x8_dumpdiff_s16(dest_test, dest_ref, 8*sizeof(s16));
+ failures++;
+ }
+ }
+ printf("sad average: %g\n",((double)sad_sum)/N_PASS);
+ printf("sad max: %d\n",sad_max);
+
+ sl_free(src);
+ sl_free(dest_ref);
+ sl_free(dest_test);
+
+ if(failures){
+ printf("E: %d failures\n",failures);
+ }
+
+ sl_profile_print(t);
+
+ return failures;
+}
+#endif
+
diff --git a/liboil/dct/idct8_f64.c b/liboil/dct/idct8_f64.c
new file mode 100644
index 0000000..fa3a952
--- /dev/null
+++ b/liboil/dct/idct8_f64.c
@@ -0,0 +1,172 @@
+/* liboil - Library of Optimized Inner Loops
+ * Copyright (C) 2003 David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/dct/dct.h>
+#include <math.h>
+
+OIL_DEFINE_CLASS_X (idct8_f64, "double *dest, int dstr, double *src, int sstr");
+
+#define C0_9808 0.980785280
+#define C0_9239 0.923879532
+#define C0_8315 0.831469612
+#define C0_7071 0.707106781
+#define C0_5556 0.555570233
+#define C0_3827 0.382683432
+#define C0_1951 0.195090322
+
+static void idct8_f64_ref(double *dest, int dstr, double *src, int sstr)
+{
+ static double idct_coeff[8][8];
+ static int idct_coeff_init = 0;
+ int i,j;
+ double x;
+
+ if(!idct_coeff_init){
+ double scale;
+
+ for(i=0;i<8;i++){
+ scale = (i==0) ? sqrt(0.125) : 0.5;
+ for(j=0;j<8;j++){
+ idct_coeff[j][i] = scale *
+ cos((M_PI/8)*i*(j+0.5));
+ }
+ }
+ idct_coeff_init = 1;
+ }
+
+ for(i=0;i<8;i++){
+ x = 0;
+ for(j=0;j<8;j++){
+ x += idct_coeff[i][j] * OIL_GET (src, sstr*j, double);
+ }
+ OIL_GET (dest, dstr*i, double) = x;
+ }
+}
+
+OIL_DEFINE_IMPL_REF (idct8_f64_ref, idct8_f64_class);
+
+
+static void idct8_f64_fastx(double *dest, int dstr, double *src, int sstr)
+{
+ double s07, s16, s25, s34;
+ double d07, d16, d25, d34;
+ double ss07s34, ss16s25;
+ double ds07s34, ds16s25;
+
+ ss07s34 = C0_7071*(OIL_GET(src,sstr*0, double) + OIL_GET(src,sstr*4, double));
+ ss16s25 = C0_7071*(OIL_GET(src,sstr*0, double) - OIL_GET(src,sstr*4, double));
+
+ ds07s34 = C0_9239* OIL_GET(src,sstr*2, double) + C0_3827* OIL_GET(src,sstr*6, double);
+ ds16s25 = C0_3827* OIL_GET(src,sstr*2, double) - C0_9239* OIL_GET(src,sstr*6, double);
+
+ s07 = ss07s34 + ds07s34;
+ s34 = ss07s34 - ds07s34;
+
+ s16 = ss16s25 + ds16s25;
+ s25 = ss16s25 - ds16s25;
+
+ d07 = C0_9808* OIL_GET(src,sstr*1, double) + C0_8315* OIL_GET(src,sstr*3, double)
+ + C0_5556* OIL_GET(src,sstr*5, double) + C0_1951* OIL_GET(src,sstr*7, double);
+ d16 = C0_8315* OIL_GET(src,sstr*1, double) - C0_1951* OIL_GET(src,sstr*3, double)
+ - C0_9808* OIL_GET(src,sstr*5, double) - C0_5556* OIL_GET(src,sstr*7, double);
+ d25 = C0_5556* OIL_GET(src,sstr*1, double) - C0_9808* OIL_GET(src,sstr*3, double)
+ + C0_1951* OIL_GET(src,sstr*5, double) + C0_8315* OIL_GET(src,sstr*7, double);
+ d34 = C0_1951* OIL_GET(src,sstr*1, double) - C0_5556* OIL_GET(src,sstr*3, double)
+ + C0_8315* OIL_GET(src,sstr*5, double) - C0_9808* OIL_GET(src,sstr*7, double);
+
+ OIL_GET(dest,dstr*0, double) = 0.5 * (s07 + d07);
+ OIL_GET(dest,dstr*1, double) = 0.5 * (s16 + d16);
+ OIL_GET(dest,dstr*2, double) = 0.5 * (s25 + d25);
+ OIL_GET(dest,dstr*3, double) = 0.5 * (s34 + d34);
+ OIL_GET(dest,dstr*4, double) = 0.5 * (s34 - d34);
+ OIL_GET(dest,dstr*5, double) = 0.5 * (s25 - d25);
+ OIL_GET(dest,dstr*6, double) = 0.5 * (s16 - d16);
+ OIL_GET(dest,dstr*7, double) = 0.5 * (s07 - d07);
+
+}
+
+OIL_DEFINE_IMPL (idct8_f64_fastx, idct8_f64_class);
+
+
+#ifdef TEST_idct8_f64
+int TEST_idct8_f64(void)
+{
+ int i;
+ int pass;
+ int failures = 0;
+ double *src, *dest_ref, *dest_test;
+ double sad;
+ double sad_sum;
+ double sad_max;
+ struct sl_profile_struct t;
+
+ src = sl_malloc_f64(8);
+ dest_ref = sl_malloc_f64(8);
+ dest_test = sl_malloc_f64(8);
+
+ sl_profile_init(t);
+ srand(20020306);
+
+ sad_sum = 0;
+ sad_max = 0;
+
+ printf("I: " sl_stringify(idct8_f64_FUNC) "\n");
+
+ for(pass=0;pass<N_PASS;pass++){
+ for(i=0;i<8;i++)src[i] = sl_rand_f64_0_1();
+
+ //block8_dump(src);
+
+ idct8_f64_ref(dest_test, src, 8, 8);
+ //block8_dump(dest_test);
+
+ sl_profile_start(t);
+ idct8_f64_FUNC(dest_ref, src, 8, 8);
+ sl_profile_stop(t);
+ //block8_dump(dest_ref);
+
+ sad = 0;
+ for(i=0;i<8;i++)sad += fabs(dest_test[i] - dest_ref[i]);
+ if(sad_max<sad)sad_max = sad;
+ sad_sum += sad;
+ if(sad >= 1.0){
+ failures++;
+ }
+ }
+ printf("sad average: %g\n",sad_sum/N_PASS);
+ printf("sad max: %g\n",sad_max);
+
+ sl_free(src);
+ sl_free(dest_ref);
+ sl_free(dest_test);
+
+ if(failures){
+ printf("E: %d failures\n",failures);
+ }
+
+ sl_profile_print(t);
+
+ return failures;
+}
+#endif
+
diff --git a/liboil/dct/idct8x8_c.c b/liboil/dct/idct8x8_c.c
new file mode 100644
index 0000000..648065c
--- /dev/null
+++ b/liboil/dct/idct8x8_c.c
@@ -0,0 +1,118 @@
+/* liboil - Library of Optimized Inner Loops
+ * Copyright (C) 2001,2002,2003 David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboil.h>
+#include <liboil/dct/dct.h>
+#include <math.h>
+
+/* for bootstrapping */
+#ifndef idct8x8_f64
+extern OilFunctionClass _oil_function_idct8x8_f64_class;
+#define idct8x8_f64 ((void (*)(double *dest, int dstr, double *src, int sstr)) \
+ _oil_function_idct8x8_f64_class.func)
+#endif
+
+#define BLOCK8x8_F64(ptr, stride, row, column) \
+ (*((double *)((void *)ptr + stride*row) + column))
+
+#define BLOCK8x8_PTR_F64(ptr, stride, row, column) \
+ ((double *)((void *)ptr + stride*row) + column)
+
+#define BLOCK8x8_S16(ptr, stride, row, column) \
+ (*((int16_t *)((void *)ptr + stride*row) + column))
+
+OIL_DEFINE_CLASS_X (idct8x8_f64, "double *dest, int dstr, double *src, int sstr");
+OIL_DEFINE_CLASS_X (idct8x8_s16, "int16_t *dest, int dstr, int16_t *src, int sstr");
+
+static void
+idct8x8_f64_slow (double *dest, int dstr, double *src, int sstr)
+{
+ static double idct_coeff[8][8];
+ static int idct_coeff_init = 0;
+ int i,j,k,l;
+ double tmp1,tmp2;
+
+ if(!idct_coeff_init){
+ double scale;
+
+ for(i=0;i<8;i++){
+ scale = (i==0) ? sqrt(0.125) : 0.5;
+ for(j=0;j<8;j++){
+ idct_coeff[j][i] = scale *
+ cos((M_PI/8)*i*(j+0.5));
+ }
+ }
+ idct_coeff_init = 1;
+ }
+
+ for(i=0;i<8;i++){
+ for(j=0;j<8;j++){
+ tmp1 = 0;
+ for(k=0;k<8;k++){
+ tmp2 = 0;
+ for(l=0;l<8;l++){
+ tmp2 += idct_coeff[j][l] *
+ BLOCK8x8_F64(src,sstr,k,l);
+ }
+ tmp1 += idct_coeff[i][k] * tmp2;
+ }
+ BLOCK8x8_F64(dest,dstr,i,j) = tmp1;
+ }
+ }
+}
+
+OIL_DEFINE_IMPL (idct8x8_f64_slow, idct8x8_f64_class);
+
+static void
+idct8x8_f64_c (double *dest, int dstr, double *src, int sstr)
+{
+ int i;
+ double tmp[64];
+ int tmpstr = 8*sizeof(double);
+
+ for(i=0;i<8;i++){
+ idct8_f64(
+ BLOCK8x8_PTR_F64(tmp,tmpstr,i,0), sizeof(double),
+ BLOCK8x8_PTR_F64(src,sstr,i,0), sizeof(double));
+ }
+ for(i=0;i<8;i++){
+ idct8_f64(
+ BLOCK8x8_PTR_F64(dest,dstr,0,i), dstr,
+ BLOCK8x8_PTR_F64(tmp,tmpstr,0,i), tmpstr);
+ }
+}
+
+OIL_DEFINE_IMPL_DEPENDS (idct8x8_f64_c, idct8x8_f64_class, idct8_f64_class);
+
+static void
+idct8x8_s16_slow (int16_t *dest, int dstr, int16_t *src, int sstr)
+{
+ double s[64], d[64];
+
+ conv8x8_f64_s16 (s,8*sizeof(double),src,sstr);
+ idct8x8_f64 (d,8*sizeof(double),s,8*sizeof(double));
+ conv8x8_s16_f64 (dest,dstr,d,8*sizeof(double));
+}
+
+OIL_DEFINE_IMPL_DEPENDS (idct8x8_s16_slow, idct8x8_s16_class,
+ conv8x8_f64_s16_class, idct8x8_f64_class, conv8x8_s16_f64);
+
diff --git a/liboil/dct/idct8x8_f64.c b/liboil/dct/idct8x8_f64.c
new file mode 100644
index 0000000..9e46523
--- /dev/null
+++ b/liboil/dct/idct8x8_f64.c
@@ -0,0 +1,206 @@
+/* inverse discrete cosine transform on 8x8 block
+ * Copyright (C) 2001,2002 David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+Kernel: idct8x8_f64
+Description: inverse discrete cosine transform on 8x8 block
+
+XXX
+*/
+
+#ifndef _idct8x8_f64_h_
+#define _idct8x8_f64_h_
+
+#include <math.h>
+
+#include <sl_types.h>
+#include <sl_block8x8.h>
+
+/* storage class */
+#ifndef SL_idct8x8_f64_storage
+ #ifdef SL_storage
+ #define SL_idct8x8_f64_storage SL_storage
+ #else
+ #define SL_idct8x8_f64_storage static inline
+ #endif
+#endif
+
+
+/* IMPL idct8x8_f64_ref */
+SL_idct8x8_f64_storage
+void idct8x8_f64_ref(f64 *dest, f64 *src, int dstr, int sstr)
+{
+ static f64 idct_coeff[8][8];
+ static int idct_coeff_init = 0;
+ int i,j,k,l;
+ f64 tmp1,tmp2;
+
+ if(!idct_coeff_init){
+ f64 scale;
+
+ for(i=0;i<8;i++){
+ scale = (i==0) ? sqrt(0.125) : 0.5;
+ for(j=0;j<8;j++){
+ idct_coeff[j][i] = scale *
+ cos((M_PI/8)*i*(j+0.5));
+ }
+ }
+ idct_coeff_init = 1;
+ }
+
+ for(i=0;i<8;i++){
+ for(j=0;j<8;j++){
+ tmp1 = 0;
+ for(k=0;k<8;k++){
+ tmp2 = 0;
+ for(l=0;l<8;l++){
+ tmp2 += idct_coeff[j][l] *
+ block8x8_f64(src,sstr,k,l);
+ }
+ tmp1 += idct_coeff[i][k] * tmp2;
+ }
+ block8x8_f64(dest,dstr,i,j) = tmp1;
+ }
+ }
+}
+
+/* IMPL idct8x8_f64_ref2 */
+SL_idct8x8_f64_storage
+void idct8x8_f64_ref2(f64 *dest, f64 *src, int dstr, int sstr)
+{
+ static f64 idct_coeff[8][8];
+ static int idct_coeff_init = 0;
+ int i,j,k;
+ f64 x;
+ f64 tmp[64];
+
+ if(!idct_coeff_init){
+ f64 scale;
+
+ for(i=0;i<8;i++){
+ scale = (i==0) ? sqrt(0.125) : 0.5;
+ for(j=0;j<8;j++){
+ idct_coeff[j][i] = scale *
+ cos((M_PI/8)*i*(j+0.5));
+ }
+ }
+ idct_coeff_init = 1;
+ }
+
+ for(i=0;i<8;i++){
+ for(j=0;j<8;j++){
+ x = 0;
+ for(k=0;k<8;k++){
+ x += idct_coeff[j][k] *
+ block8x8_f64(src,sstr,i,k);
+ }
+ tmp[8*i+j] = x;
+ }
+ }
+
+ for(j=0;j<8;j++){
+ for(i=0;i<8;i++){
+ x = 0;
+ for(k=0;k<8;k++){
+ x += idct_coeff[i][k] * tmp[8*k + j];
+ }
+ block8x8_f64(dest,dstr,i,j) = x;
+ }
+ }
+}
+
+#define f64_addr(base,str,i) ((f64 *)((void *)(base) + (str)*(i)))
+
+#include <idct8_f64.h>
+/* IMPL idct8x8_f64_1d */
+SL_idct8x8_f64_storage
+void idct8x8_f64_1d(f64 *dest, f64 *src, int dstr, int sstr)
+{
+ int i;
+ f64 tmp[64];
+
+ for(i=0;i<8;i++){
+ idct8_f64_fast(tmp + i*8, f64_addr(src,sstr,i), sizeof(f64), sizeof(f64));
+ }
+ for(i=0;i<8;i++){
+ idct8_f64_fast(dest + i, tmp + i, dstr, 8*sizeof(f64));
+ }
+}
+
+
+#endif
+
+
+#ifdef TEST_idct8x8_f64
+int TEST_idct8x8_f64(void)
+{
+ int i;
+ int pass;
+ int failures = 0;
+ f64 *src, *dest_ref, *dest_test;
+ f64 sad;
+ f64 sad_sum;
+ f64 sad_max;
+ struct sl_profile_struct t;
+
+ src = sl_malloc_f64(64);
+ dest_ref = sl_malloc_f64(64);
+ dest_test = sl_malloc_f64(64);
+
+ sl_profile_init(t);
+ srand(20020306);
+
+ sad_sum = 0;
+ sad_max = 0;
+
+ printf("I: " sl_stringify(idct8x8_f64_FUNC) "\n");
+
+ for(pass=0;pass<N_PASS;pass++){
+ for(i=0;i<64;i++)src[i] = sl_rand_f64_0_1();
+
+ idct8x8_f64_ref(dest_test, src, 8, 8);
+ sl_profile_start(t);
+ idct8x8_f64_FUNC(dest_ref, src, 8, 8);
+ sl_profile_stop(t);
+
+ sad = 0;
+ for(i=0;i<64;i++)sad += fabs(dest_test[i] - dest_ref[i]);
+ if(sad_max<sad)sad_max = sad;
+ sad_sum += sad;
+ if(sad >= 1.0){
+ failures++;
+ }
+ }
+ printf("sad average: %g\n",sad_sum/N_PASS);
+ printf("sad max: %g\n",sad_max);
+
+ sl_free(src);
+ sl_free(dest_ref);
+ sl_free(dest_test);
+
+ if(failures){
+ printf("E: %d failures\n",failures);
+ }
+
+ sl_profile_print(t);
+
+ return failures;
+}
+#endif
+
diff --git a/liboil/dct/idct8x8_s16.c b/liboil/dct/idct8x8_s16.c
new file mode 100644
index 0000000..1e58026
--- /dev/null
+++ b/liboil/dct/idct8x8_s16.c
@@ -0,0 +1,144 @@
+/* forward discrete cosine transform on 8x8 block
+ * Copyright (C) 2001,2002 David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+Kernel: idct8x8_s16
+Description: inverse discrete cosine transform on 8x8 block
+
+XXX
+*/
+
+#ifndef _idct8x8_s16_h_
+#define _idct8x8_s16_h_
+
+#include <math.h>
+
+#include <sl_types.h>
+#include <sl_block8x8.h>
+
+/* storage class */
+#ifndef SL_idct8x8_s16_storage
+ #ifdef SL_storage
+ #define SL_idct8x8_s16_storage SL_storage
+ #else
+ #define SL_idct8x8_s16_storage static inline
+ #endif
+#endif
+
+
+
+#include <idct8x8_f64.h>
+#include <conv8x8_f64_s16.h>
+/* IMPL idct8x8_s16_ref */
+SL_idct8x8_s16_storage
+void idct8x8_s16_ref(s16 *dest, s16 *src, int dstr, int sstr)
+{
+ f64 s[64], d[64];
+ int i,j;
+
+ for(i=0;i<8;i++){
+ for(j=0;j<8;j++){
+ block8x8_f64(s,8*sizeof(f64),i,j) =
+ block8x8_s16(src,sstr,i,j);
+ }
+ }
+
+ idct8x8_f64_ref(d,s,8*sizeof(f64),8*sizeof(f64));
+ conv8x8_f64_s16_ref(dest,d,dstr,8*sizeof(f64));
+}
+
+/* IMPL idct8x8_s16_fast */
+SL_idct8x8_s16_storage
+void idct8x8_s16_fast(s16 *dest, s16 *src, int dstr, int sstr)
+{
+ f64 s[64], d[64];
+ int i,j;
+
+ for(i=0;i<8;i++){
+ for(j=0;j<8;j++){
+ block8x8_f64(s,8*sizeof(f64),i,j) =
+ block8x8_s16(src,sstr,i,j);
+ }
+ }
+
+ idct8x8_f64(d,s,8*sizeof(f64),8*sizeof(f64));
+ conv8x8_f64_s16(dest,d,dstr,8*sizeof(f64));
+}
+#endif
+
+#ifdef TEST_idct8x8_s16
+int TEST_idct8x8_s16(void)
+{
+ int i;
+ int pass;
+ int failures = 0;
+ s16 *src, *dest_ref, *dest_test;
+ u32 sad;
+ u32 sad_sum;
+ u32 sad_max;
+ struct sl_profile_struct t;
+
+ src = sl_malloc_s16(64);
+ dest_ref = sl_malloc_s16(64);
+ dest_test = sl_malloc_s16(64);
+
+ sl_profile_init(t);
+ srand(20020306);
+
+ sad_sum = 0;
+ sad_max = 0;
+
+ printf("I: " sl_stringify(idct8x8_s16_FUNC) "\n");
+
+ for(pass=0;pass<N_PASS;pass++){
+ for(i=0;i<64;i++)src[i] = sl_rand_s16_l9();
+
+ idct8x8_s16_ref(dest_ref, src, 8*sizeof(s16), 8*sizeof(s16));
+ sl_profile_start(t);
+ idct8x8_s16_FUNC(dest_test, src, 8*sizeof(s16), 8*sizeof(s16));
+ sl_profile_stop(t);
+
+ sad = 0;
+ for(i=0;i<64;i++)sad += abs(dest_test[i] - dest_ref[i]);
+ if(sad_max<sad)sad_max = sad;
+ sad_sum += sad;
+ if(sad >= 64){
+ block8x8_dump_s16(src, 8*sizeof(s16));
+ block8x8_dump_s16(dest_test, 8*sizeof(s16));
+ block8x8_dump_s16(dest_ref, 8*sizeof(s16));
+ failures++;
+ }
+ }
+ printf("sad average: %g\n",((double)sad_sum)/N_PASS);
+ printf("sad max: %d\n",sad_max);
+
+ sl_free(src);
+ sl_free(dest_ref);
+ sl_free(dest_test);
+
+ if(failures){
+ printf("E: %d failures\n",failures);
+ }
+
+ sl_profile_print(t);
+
+ return failures;
+}
+#endif
+
diff --git a/liboil/dct/idct8x8s_s16.c b/liboil/dct/idct8x8s_s16.c
new file mode 100644
index 0000000..59e9709
--- /dev/null
+++ b/liboil/dct/idct8x8s_s16.c
@@ -0,0 +1,148 @@
+/* inverse discrete cosine transform on 8x8 block
+ * Copyright (C) 2001,2002 David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+Kernel: idct8x8s_s16
+Description: inverse discrete cosine transform on 8x8 block
+
+Alternate scaling used by RTjpeg.
+*/
+
+#ifndef _idct8x8s_s16_h_
+#define _idct8x8s_s16_h_
+
+#include <math.h>
+
+#include <sl_types.h>
+#include <sl_block8x8.h>
+
+/* storage class */
+#ifndef SL_idct8x8s_s16_storage
+ #ifdef SL_storage
+ #define SL_idct8x8s_s16_storage SL_storage
+ #else
+ #define SL_idct8x8s_s16_storage static inline
+ #endif
+#endif
+
+
+/* extras */
+#include <rtjpeg/idct8x8s_s16.h>
+
+
+#include <idct8x8_f64.h>
+#include <conv8x8_f64_s16.h>
+/* IMPL idct8x8s_s16_ref */
+SL_idct8x8s_s16_storage
+void idct8x8s_s16_ref(s16 *dest, s16 *src, int dstr, int sstr)
+{
+ f64 s[64], d[64];
+ const f64 scale[8] = {
+ 2.0/C0_7071,
+ 2.0/C0_9808,
+ 2.0/C0_9239,
+ 2.0/C0_8315,
+ 2.0/C0_7071,
+ 2.0/C0_5556,
+ 2.0/C0_3827,
+ 2.0/C0_1951,
+ };
+ int i,j;
+
+ for(i=0;i<8;i++){
+ for(j=0;j<8;j++){
+ block8x8_f64(s,8*sizeof(f64),i,j) =
+ block8x8_s16(src,sstr,i,j);
+ }
+ }
+
+ for(i=0;i<8;i++){
+ for(j=0;j<8;j++){
+ block8x8_f64(s,8*sizeof(f64),i,j) *= scale[i] * scale[j];
+ }
+ }
+
+ idct8x8_f64(d,s,8*sizeof(f64),8*sizeof(f64));
+
+ conv8x8_f64_s16(dest,d,dstr,8*sizeof(f64));
+}
+#endif
+
+#ifdef TEST_idct8x8s_s16
+int TEST_idct8x8s_s16(void)
+{
+ int i;
+ int pass;
+ int failures = 0;
+ s16 *src, *dest_ref, *dest_test;
+ u32 sad;
+ u32 sad_sum;
+ u32 sad_max;
+ struct sl_profile_struct t;
+
+ src = sl_malloc_s16(64);
+ dest_ref = sl_malloc_s16(64);
+ dest_test = sl_malloc_s16(64);
+
+ sl_profile_init(t);
+ srand(20020306);
+
+ sad_sum = 0;
+ sad_max = 0;
+
+ printf("I: " sl_stringify(idct8x8s_s16_FUNC) "\n");
+
+ for(pass=0;pass<N_PASS;pass++){
+ //for(i=0;i<64;i++)src[i] = sl_rand_s16_l9();
+ for(i=0;i<64;i++)src[i] = (i==pass)*1000;
+
+ idct8x8s_s16_ref(dest_ref, src, 8*sizeof(s16), 8*sizeof(s16));
+ sl_profile_start(t);
+ idct8x8s_s16_FUNC(dest_test, src, 8*sizeof(s16), 8*sizeof(s16));
+ sl_profile_stop(t);
+
+ sad = 0;
+ for(i=0;i<64;i++)sad += abs(dest_test[i] - dest_ref[i]);
+ if(sad_max<sad)sad_max = sad;
+ sad_sum += sad;
+ if(sad >= 128){
+ block8x8_dump_s16(src, 8*sizeof(s16));
+ block8x8_dump_s16(dest_ref, 8*sizeof(s16));
+ block8x8_dump_s16(dest_test, 8*sizeof(s16));
+ block8x8_dumpratio_s16(dest_test, dest_ref, 8*sizeof(s16));
+ failures++;
+ }
+ }
+ printf("sad average: %g\n",((double)sad_sum)/N_PASS);
+ printf("sad max: %d\n",sad_max);
+
+ sl_free(src);
+ sl_free(dest_ref);
+ sl_free(dest_test);
+
+ if(failures){
+ printf("E: %d failures\n",failures);
+ }
+
+ sl_profile_print(t);
+
+ return failures;
+}
+#endif
+
diff --git a/liboil/dct/imdct32_f32.c b/liboil/dct/imdct32_f32.c
new file mode 100644
index 0000000..0a39950
--- /dev/null
+++ b/liboil/dct/imdct32_f32.c
@@ -0,0 +1,438 @@
+/* liboil - Library of Optimized Inner Loops
+ * Copyright (C) 2003 David A. Schleef <ds@schleef.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of version 2.1 of the GNU Lesser General
+ * Public License as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/dct/dct.h>
+#include <math.h>
+
+OIL_DEFINE_CLASS_X (imdct32_f32, "float *dest, float *src");
+
+static void imdct32_f32_ref (float *dest, float *src)
+{
+ double x;
+ int i,j;
+ double coeff;
+
+ for(i=0;i<32;i++){
+ x = 0;
+ for(j=0;j<32;j++){
+ coeff = cos((M_PI/32)*i*(j+0.5));
+ x += coeff * src[j];
+ }
+ dest[i] = x;
+ }
+}
+
+OIL_DEFINE_IMPL_REF (imdct32_f32_ref, imdct32_f32_class);
+
+/* from mpglib */
+/*
+ * Discrete Cosine Tansform (DCT) for subband synthesis
+ * optimized for machines with no auto-increment.
+ * The performance is highly compiler dependend. Maybe
+ * the dct64.c version for 'normal' processor may be faster
+ * even for Intel processors.
+ */
+
+static void imdct32_f32_mpglib(float *dest,float *samples)
+{
+ static float cos64[16];
+ static float cos32[16];
+ static float cos16[16];
+ static float cos8[16];
+ static float cos4[16];
+ float b1[32];
+ float b2[32];
+ static int done = 0;
+
+ if(!done){
+ int k;
+
+ done = 1;
+ for(k=0;k<16;k++){
+ cos64[k] = 1.0 / (2.0 * cos(M_PI * (k * 2.0 + 1.0) / 64.0));
+ cos32[k] = 1.0 / (2.0 * cos(M_PI * (k * 2.0 + 1.0) / 32.0));
+ cos16[k] = 1.0 / (2.0 * cos(M_PI * (k * 2.0 + 1.0) / 16.0));
+ cos8[k] = 1.0 / (2.0 * cos(M_PI * (k * 2.0 + 1.0) / 8.0));
+ cos4[k] = 1.0 / (2.0 * cos(M_PI * (k * 2.0 + 1.0) / 4.0));
+ }
+ }
+
+ {
+ float *costab = cos64;
+
+ b1[0x00] = samples[0x00] + samples[0x1F];
+ b1[0x1F] = (samples[0x00] - samples[0x1F]) * costab[0x0];
+
+ b1[0x01] = samples[0x01] + samples[0x1E];
+ b1[0x1E] = (samples[0x01] - samples[0x1E]) * costab[0x1];
+
+ b1[0x02] = samples[0x02] + samples[0x1D];
+ b1[0x1D] = (samples[0x02] - samples[0x1D]) * costab[0x2];
+
+ b1[0x03] = samples[0x03] + samples[0x1C];
+ b1[0x1C] = (samples[0x03] - samples[0x1C]) * costab[0x3];
+
+ b1[0x04] = samples[0x04] + samples[0x1B];
+ b1[0x1B] = (samples[0x04] - samples[0x1B]) * costab[0x4];
+
+ b1[0x05] = samples[0x05] + samples[0x1A];
+ b1[0x1A] = (samples[0x05] - samples[0x1A]) * costab[0x5];
+
+ b1[0x06] = samples[0x06] + samples[0x19];
+ b1[0x19] = (samples[0x06] - samples[0x19]) * costab[0x6];
+
+ b1[0x07] = samples[0x07] + samples[0x18];
+ b1[0x18] = (samples[0x07] - samples[0x18]) * costab[0x7];
+
+ b1[0x08] = samples[0x08] + samples[0x17];
+ b1[0x17] = (samples[0x08] - samples[0x17]) * costab[0x8];
+
+ b1[0x09] = samples[0x09] + samples[0x16];
+ b1[0x16] = (samples[0x09] - samples[0x16]) * costab[0x9];
+
+ b1[0x0A] = samples[0x0A] + samples[0x15];
+ b1[0x15] = (samples[0x0A] - samples[0x15]) * costab[0xA];
+
+ b1[0x0B] = samples[0x0B] + samples[0x14];
+ b1[0x14] = (samples[0x0B] - samples[0x14]) * costab[0xB];
+
+ b1[0x0C] = samples[0x0C] + samples[0x13];
+ b1[0x13] = (samples[0x0C] - samples[0x13]) * costab[0xC];
+
+ b1[0x0D] = samples[0x0D] + samples[0x12];
+ b1[0x12] = (samples[0x0D] - samples[0x12]) * costab[0xD];
+
+ b1[0x0E] = samples[0x0E] + samples[0x11];
+ b1[0x11] = (samples[0x0E] - samples[0x11]) * costab[0xE];
+
+ b1[0x0F] = samples[0x0F] + samples[0x10];
+ b1[0x10] = (samples[0x0F] - samples[0x10]) * costab[0xF];
+ }
+
+
+ {
+ float *costab = cos32;
+
+ b2[0x00] = b1[0x00] + b1[0x0F];
+ b2[0x0F] = (b1[0x00] - b1[0x0F]) * costab[0];
+ b2[0x01] = b1[0x01] + b1[0x0E];
+ b2[0x0E] = (b1[0x01] - b1[0x0E]) * costab[1];
+ b2[0x02] = b1[0x02] + b1[0x0D];
+ b2[0x0D] = (b1[0x02] - b1[0x0D]) * costab[2];
+ b2[0x03] = b1[0x03] + b1[0x0C];
+ b2[0x0C] = (b1[0x03] - b1[0x0C]) * costab[3];
+ b2[0x04] = b1[0x04] + b1[0x0B];
+ b2[0x0B] = (b1[0x04] - b1[0x0B]) * costab[4];
+ b2[0x05] = b1[0x05] + b1[0x0A];
+ b2[0x0A] = (b1[0x05] - b1[0x0A]) * costab[5];
+ b2[0x06] = b1[0x06] + b1[0x09];
+ b2[0x09] = (b1[0x06] - b1[0x09]) * costab[6];
+ b2[0x07] = b1[0x07] + b1[0x08];
+ b2[0x08] = (b1[0x07] - b1[0x08]) * costab[7];
+
+ b2[0x10] = b1[0x10] + b1[0x1F];
+ b2[0x1F] = (b1[0x1F] - b1[0x10]) * costab[0];
+ b2[0x11] = b1[0x11] + b1[0x1E];
+ b2[0x1E] = (b1[0x1E] - b1[0x11]) * costab[1];
+ b2[0x12] = b1[0x12] + b1[0x1D];
+ b2[0x1D] = (b1[0x1D] - b1[0x12]) * costab[2];
+ b2[0x13] = b1[0x13] + b1[0x1C];
+ b2[0x1C] = (b1[0x1C] - b1[0x13]) * costab[3];
+ b2[0x14] = b1[0x14] + b1[0x1B];
+ b2[0x1B] = (b1[0x1B] - b1[0x14]) * costab[4];
+ b2[0x15] = b1[0x15] + b1[0x1A];
+ b2[0x1A] = (b1[0x1A] - b1[0x15]) * costab[5];
+ b2[0x16] = b1[0x16] + b1[0x19];
+ b2[0x19] = (b1[0x19] - b1[0x16]) * costab[6];
+ b2[0x17] = b1[0x17] + b1[0x18];
+ b2[0x18] = (b1[0x18] - b1[0x17]) * costab[7];
+ }
+
+ {
+ float *costab = cos16;
+
+ b1[0x00] = b2[0x00] + b2[0x07];
+ b1[0x07] = (b2[0x00] - b2[0x07]) * costab[0];
+ b1[0x01] = b2[0x01] + b2[0x06];
+ b1[0x06] = (b2[0x01] - b2[0x06]) * costab[1];
+ b1[0x02] = b2[0x02] + b2[0x05];
+ b1[0x05] = (b2[0x02] - b2[0x05]) * costab[2];
+ b1[0x03] = b2[0x03] + b2[0x04];
+ b1[0x04] = (b2[0x03] - b2[0x04]) * costab[3];
+
+ b1[0x08] = b2[0x08] + b2[0x0F];
+ b1[0x0F] = (b2[0x0F] - b2[0x08]) * costab[0];
+ b1[0x09] = b2[0x09] + b2[0x0E];
+ b1[0x0E] = (b2[0x0E] - b2[0x09]) * costab[1];
+ b1[0x0A] = b2[0x0A] + b2[0x0D];
+ b1[0x0D] = (b2[0x0D] - b2[0x0A]) * costab[2];
+ b1[0x0B] = b2[0x0B] + b2[0x0C];
+ b1[0x0C] = (b2[0x0C] - b2[0x0B]) * costab[3];
+
+ b1[0x10] = b2[0x10] + b2[0x17];
+ b1[0x17] = (b2[0x10] - b2[0x17]) * costab[0];
+ b1[0x11] = b2[0x11] + b2[0x16];
+ b1[0x16] = (b2[0x11] - b2[0x16]) * costab[1];
+ b1[0x12] = b2[0x12] + b2[0x15];
+ b1[0x15] = (b2[0x12] - b2[0x15]) * costab[2];
+ b1[0x13] = b2[0x13] + b2[0x14];
+ b1[0x14] = (b2[0x13] - b2[0x14]) * costab[3];
+
+ b1[0x18] = b2[0x18] + b2[0x1F];
+ b1[0x1F] = (b2[0x1F] - b2[0x18]) * costab[0];
+ b1[0x19] = b2[0x19] + b2[0x1E];
+ b1[0x1E] = (b2[0x1E] - b2[0x19]) * costab[1];
+ b1[0x1A] = b2[0x1A] + b2[0x1D];
+ b1[0x1D] = (b2[0x1D] - b2[0x1A]) * costab[2];
+ b1[0x1B] = b2[0x1B] + b2[0x1C];
+ b1[0x1C] = (b2[0x1C] - b2[0x1B]) * costab[3];
+ }
+
+ {
+ float cos0 = cos8[0];
+ float cos1 = cos8[1];
+
+ b2[0x00] = b1[0x00] + b1[0x03];
+ b2[0x03] = (b1[0x00] - b1[0x03]) * cos0;
+ b2[0x01] = b1[0x01] + b1[0x02];
+ b2[0x02] = (b1[0x01] - b1[0x02]) * cos1;
+
+ b2[0x04] = b1[0x04] + b1[0x07];
+ b2[0x07] = (b1[0x07] - b1[0x04]) * cos0;
+ b2[0x05] = b1[0x05] + b1[0x06];
+ b2[0x06] = (b1[0x06] - b1[0x05]) * cos1;
+
+ b2[0x08] = b1[0x08] + b1[0x0B];
+ b2[0x0B] = (b1[0x08] - b1[0x0B]) * cos0;
+ b2[0x09] = b1[0x09] + b1[0x0A];
+ b2[0x0A] = (b1[0x09] - b1[0x0A]) * cos1;
+
+ b2[0x0C] = b1[0x0C] + b1[0x0F];
+ b2[0x0F] = (b1[0x0F] - b1[0x0C]) * cos0;
+ b2[0x0D] = b1[0x0D] + b1[0x0E];
+ b2[0x0E] = (b1[0x0E] - b1[0x0D]) * cos1;
+
+ b2[0x10] = b1[0x10] + b1[0x13];
+ b2[0x13] = (b1[0x10] - b1[0x13]) * cos0;
+ b2[0x11] = b1[0x11] + b1[0x12];
+ b2[0x12] = (b1[0x11] - b1[0x12]) * cos1;
+
+ b2[0x14] = b1[0x14] + b1[0x17];
+ b2[0x17] = (b1[0x17] - b1[0x14]) * cos0;
+ b2[0x15] = b1[0x15] + b1[0x16];
+ b2[0x16] = (b1[0x16] - b1[0x15]) * cos1;
+
+ b2[0x18] = b1[0x18] + b1[0x1B];
+ b2[0x1B] = (b1[0x18] - b1[0x1B]) * cos0;
+ b2[0x19] = b1[0x19] + b1[0x1A];
+ b2[0x1A] = (b1[0x19] - b1[0x1A]) * cos1;
+
+ b2[0x1C] = b1[0x1C] + b1[0x1F];
+ b2[0x1F] = (b1[0x1F] - b1[0x1C]) * cos0;
+ b2[0x1D] = b1[0x1D] + b1[0x1E];
+ b2[0x1E] = (b1[0x1E] - b1[0x1D]) * cos1;
+ }
+
+ {
+ float cos0 = cos4[0];
+
+ b1[0x00] = b2[0x00] + b2[0x01];
+ b1[0x01] = (b2[0x00] - b2[0x01]) * cos0;
+ b1[0x02] = b2[0x02] + b2[0x03];
+ b1[0x03] = (b2[0x03] - b2[0x02]) * cos0;
+ b1[0x02] += b1[0x03];
+
+ b1[0x04] = b2[0x04] + b2[0x05];
+ b1[0x05] = (b2[0x04] - b2[0x05]) * cos0;
+ b1[0x06] = b2[0x06] + b2[0x07];
+ b1[0x07] = (b2[0x07] - b2[0x06]) * cos0;
+ b1[0x06] += b1[0x07];
+ b1[0x04] += b1[0x06];
+ b1[0x06] += b1[0x05];
+ b1[0x05] += b1[0x07];
+
+ b1[0x08] = b2[0x08] + b2[0x09];
+ b1[0x09] = (b2[0x08] - b2[0x09]) * cos0;
+ b1[0x0A] = b2[0x0A] + b2[0x0B];
+ b1[0x0B] = (b2[0x0B] - b2[0x0A]) * cos0;
+ b1[0x0A] += b1[0x0B];
+
+ b1[0x0C] = b2[0x0C] + b2[0x0D];
+ b1[0x0D] = (b2[0x0C] - b2[0x0D]) * cos0;
+ b1[0x0E] = b2[0x0E] + b2[0x0F];
+ b1[0x0F] = (b2[0x0F] - b2[0x0E]) * cos0;
+ b1[0x0E] += b1[0x0F];
+ b1[0x0C] += b1[0x0E];
+ b1[0x0E] += b1[0x0D];
+ b1[0x0D] += b1[0x0F];
+
+ b1[0x10] = b2[0x10] + b2[0x11];
+ b1[0x11] = (b2[0x10] - b2[0x11]) * cos0;
+ b1[0x12] = b2[0x12] + b2[0x13];
+ b1[0x13] = (b2[0x13] - b2[0x12]) * cos0;
+ b1[0x12] += b1[0x13];
+
+ b1[0x14] = b2[0x14] + b2[0x15];
+ b1[0x15] = (b2[0x14] - b2[0x15]) * cos0;
+ b1[0x16] = b2[0x16] + b2[0x17];
+ b1[0x17] = (b2[0x17] - b2[0x16]) * cos0;
+ b1[0x16] += b1[0x17];
+ b1[0x14] += b1[0x16];
+ b1[0x16] += b1[0x15];
+ b1[0x15] += b1[0x17];
+
+ b1[0x18] = b2[0x18] + b2[0x19];
+ b1[0x19] = (b2[0x18] - b2[0x19]) * cos0;
+ b1[0x1A] = b2[0x1A] + b2[0x1B];
+ b1[0x1B] = (b2[0x1B] - b2[0x1A]) * cos0;
+ b1[0x1A] += b1[0x1B];
+
+ b1[0x1C] = b2[0x1C] + b2[0x1D];
+ b1[0x1D] = (b2[0x1C] - b2[0x1D]) * cos0;
+ b1[0x1E] = b2[0x1E] + b2[0x1F];
+ b1[0x1F] = (b2[0x1F] - b2[0x1E]) * cos0;
+ b1[0x1E] += b1[0x1F];
+ b1[0x1C] += b1[0x1E];
+ b1[0x1E] += b1[0x1D];
+ b1[0x1D] += b1[0x1F];
+ }
+
+ dest[ 0] = b1[0x00];
+ dest[ 4] = b1[0x04];
+ dest[ 8] = b1[0x02];
+ dest[12] = b1[0x06];
+// dest[0x10* 0] = b1[0x01]; /* I think this is wrong */
+ dest[16] = b1[0x01];
+ dest[20] = b1[0x05];
+ dest[24] = b1[0x03];
+ dest[28] = b1[0x07];
+
+ b1[0x08] += b1[0x0C];
+ dest[2] = b1[0x08];
+ b1[0x0C] += b1[0x0a];
+ dest[6] = b1[0x0C];
+ b1[0x0A] += b1[0x0E];
+ dest[10] = b1[0x0A];
+ b1[0x0E] += b1[0x09];
+ dest[14] = b1[0x0E];
+ b1[0x09] += b1[0x0D];
+ dest[18] = b1[0x09];
+ b1[0x0D] += b1[0x0B];
+ dest[22] = b1[0x0D];
+ b1[0x0B] += b1[0x0F];
+ dest[26] = b1[0x0B];
+ dest[30] = b1[0x0F];
+
+ b1[0x18] += b1[0x1C];
+ dest[1] = b1[0x10] + b1[0x18];
+ dest[3] = b1[0x18] + b1[0x14];
+ b1[0x1C] += b1[0x1a];
+ dest[5] = b1[0x14] + b1[0x1C];
+ dest[7] = b1[0x1C] + b1[0x12];
+ b1[0x1A] += b1[0x1E];
+ dest[9] = b1[0x12] + b1[0x1A];
+ dest[11] = b1[0x1A] + b1[0x16];
+ b1[0x1E] += b1[0x19];
+ dest[13] = b1[0x16] + b1[0x1E];
+ dest[15] = b1[0x1E] + b1[0x11];
+ b1[0x19] += b1[0x1D];
+ dest[17] = b1[0x11] + b1[0x19];
+ dest[19] = b1[0x19] + b1[0x15];
+ b1[0x1D] += b1[0x1B];
+ dest[21] = b1[0x15] + b1[0x1D];
+ dest[23] = b1[0x1D] + b1[0x13];
+ b1[0x1B] += b1[0x1F];
+ dest[25] = b1[0x13] + b1[0x1B];
+ dest[27] = b1[0x1B] + b1[0x17];
+ dest[29] = b1[0x17] + b1[0x1F];
+ dest[31] = b1[0x1F];
+}
+
+OIL_DEFINE_IMPL (imdct32_f32_mpglib, imdct32_f32_class);
+
+
+
+#ifdef TEST_imdct32_f32
+int TEST_imdct32_f32(void)
+{
+ int i;
+ int pass;
+ int failures = 0;
+ f32 *src, *dest_ref, *dest_test;
+ struct sl_profile_struct t;
+ double sad;
+ double sad_max = 0;
+ double sad_sum = 0;
+
+ src = sl_malloc_f32(32);
+ dest_ref = sl_malloc_f32(32);
+ dest_test = sl_malloc_f32(32);
+
+ sl_profile_init(t);
+ srand(20021001);
+
+ printf("I: " sl_stringify(imdct32_f32_FUNC) "\n");
+
+ for(pass=0;pass<N_PASS;pass++){
+ for(i=0;i<32;i++)src[i]=sl_rand_f32_0_1();
+
+ imdct32_f32_ref(dest_ref,src);
+ sl_profile_start(t);
+ imdct32_f32_FUNC(dest_test,src);
+ sl_profile_stop(t);
+
+ sad = 0;
+ for(i=0;i<32;i++){
+ sad += fabs(dest_test[i] - dest_ref[i]);
+ }
+ if(sad>sad_max)sad_max = sad;
+ sad_sum += sad;
+#if 0
+ if(sad>0){
+ printf("sad = %g\n",sad);
+ }
+#endif
+#if 0
+ if(dest_test[i] != dest_ref[i]){
+ printf("%d %g %g\n",i,dest_ref[i], dest_test[i]);
+ }
+#endif
+ }
+
+ printf("sad ave = %g\n",sad_sum/N_PASS);
+ printf("sad max = %g\n",sad_max);
+
+ sl_free(src);
+ sl_free(dest_ref);
+ sl_free(dest_test);
+
+ if(failures){
+ printf("E: %d failures\n",failures);
+ }
+
+ sl_profile_print(t);
+
+ return failures;
+}
+#endif
+