summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Schleef <ds@schleef.org>2005-04-30 06:00:56 +0000
committerDavid Schleef <ds@schleef.org>2005-04-30 06:00:56 +0000
commit81e196203fe61e8dedc5f10e03b9ec7c9d2a7eff (patch)
treeed85b753dc0cc8ef5c1083c2f704e150cd52b504
parenteb6ae36041277cde75b484d3e0fc35277f83a52a (diff)
downloadliboil-81e196203fe61e8dedc5f10e03b9ec7c9d2a7eff.tar.gz
* examples/Makefile.am: add oil-test
* examples/oil-test.c: A copy of work.c modified for displaying test results for any class. * liboil/dct/Makefile.am: * liboil/dct/idct8x8_i386.c: (idct8x8_s16_mmx), (fdct8x8s_s16_mmx): Add mmx code for idct * liboil/dct/idct8x8theora_ref.c: Add some classes for idct8x8 to the theora spec. * liboil/liboilfuncs.h: update
-rw-r--r--ChangeLog12
-rw-r--r--examples/Makefile.am6
-rw-r--r--examples/oil-test.c201
-rw-r--r--liboil/dct/Makefile.am3
-rw-r--r--liboil/dct/idct8x8_i386.c356
-rw-r--r--liboil/dct/idct8x8theora_ref.c200
-rw-r--r--liboil/liboilfuncs.h6
7 files changed, 769 insertions, 15 deletions
diff --git a/ChangeLog b/ChangeLog
index 45790e3..2b23927 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+2005-04-29 David Schleef <ds@schleef.org>
+
+ * examples/Makefile.am: add oil-test
+ * examples/oil-test.c: A copy of work.c modified for displaying
+ test results for any class.
+ * liboil/dct/Makefile.am:
+ * liboil/dct/idct8x8_i386.c: (idct8x8_s16_mmx), (fdct8x8s_s16_mmx):
+ Add mmx code for idct
+ * liboil/dct/idct8x8theora_ref.c: Add some classes for idct8x8
+ to the theora spec.
+ * liboil/liboilfuncs.h: update
+
2005-04-28 David Schleef <ds@schleef.org>
Add an example huffman (variable code length) decoder
diff --git a/examples/Makefile.am b/examples/Makefile.am
index cb2eae1..9a1bda7 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -1,7 +1,7 @@
SUBDIRS = jpeg md5 uberopt work huffman
-noinst_PROGRAMS = example1 oil-inspect
+noinst_PROGRAMS = example1 oil-inspect oil-test
example1_SOURCES = example1.c
@@ -12,3 +12,7 @@ oil_inspect_SOURCES = oil-inspect.c
oil_inspect_CFLAGS = $(LIBOIL_CFLAGS)
oil_inspect_LDADD = $(LIBOIL_LIBS)
+oil_test_SOURCES = oil-test.c
+oil_test_CFLAGS = $(LIBOIL_CFLAGS)
+oil_test_LDADD = $(LIBOIL_LIBS)
+
diff --git a/examples/oil-test.c b/examples/oil-test.c
new file mode 100644
index 0000000..c7f369c
--- /dev/null
+++ b/examples/oil-test.c
@@ -0,0 +1,201 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboil.h>
+#include <liboil/liboilfunction.h>
+#include <liboil/liboiltest.h>
+#include <liboil/liboilrandom.h>
+#include <liboil/liboilcpu.h>
+#include <string.h>
+#include <math.h>
+#include <stdio.h>
+
+void register_impls(void);
+
+/* Feeds 100 random bytes, each masked to 7 bits, to oil_utf8_validate()
+ * and then prints the first (and only) element of the destination
+ * buffer. */
+void test(void)
+{
+  enum { N_BYTES = 100 };
+  uint8_t bytes[N_BYTES];
+  int32_t result[1] = { 0 };
+  int k = 0;
+
+  /* 0x7f clears the top bit of every generated byte */
+  while (k < N_BYTES) {
+    bytes[k] = oil_rand_u8() & 0x7f;
+    k++;
+  }
+
+  oil_utf8_validate (result, bytes, N_BYTES);
+
+  /* only result[0] is reported */
+  printf("%d\n", result[0]);
+}
+
+/*
+ * Print a strided 2-D array of test data, one row per output line.
+ *
+ * data     - array to print
+ * ref_data - reference array laid out the same way as data
+ * type     - element type; only the OIL_TYPE_*p cases below are printed
+ * pre_n    - number of elements printed per row
+ * stride   - byte offset between consecutive rows
+ * post_n   - number of rows
+ *
+ * An element whose value differs from the corresponding ref_data element
+ * by 1.0 or more is printed inside [brackets].
+ */
+void
+dump_array (void *data, void *ref_data, OilType type, int pre_n, int stride,
+    int post_n)
+{
+  int i, j;
+  int s2 = oil_type_sizeof (type);
+  double x;
+
+  /* The difference must be taken against ref_data: comparing data with
+   * itself always yields 0 and no mismatch would ever be marked. */
+#define DUMP(type, format) do { \
+  for(i=0;i<post_n;i++){ \
+    printf(" "); \
+    for(j=0;j<pre_n;j++){ \
+      x = fabs(OIL_GET(data, i*stride + j*s2, type) - \
+          OIL_GET(ref_data, i*stride + j*s2, type)); \
+      if (x >= 1.0) { \
+        printf("[" format "] ", OIL_GET(data, i*stride + j*s2, type)); \
+      } else { \
+        printf(format " ", OIL_GET(data, i*stride + j*s2, type)); \
+      } \
+    } \
+    printf("\n"); \
+  } \
+} while(0)
+
+  switch(type) {
+    case OIL_TYPE_s8p: DUMP(int8_t, "%d"); break;
+    case OIL_TYPE_u8p: DUMP(uint8_t, "%d"); break;
+    case OIL_TYPE_s16p: DUMP(int16_t, "%d"); break;
+    case OIL_TYPE_u16p: DUMP(uint16_t, "%d"); break;
+    case OIL_TYPE_s32p: DUMP(int32_t, "%d"); break;
+    case OIL_TYPE_u32p: DUMP(uint32_t, "%u"); break;
+    case OIL_TYPE_f32p: DUMP(float, "%g"); break;
+    case OIL_TYPE_f64p: DUMP(double, "%g"); break;
+    default:
+      /* any other type prints nothing */
+      break;
+  }
+}
+
+/* Dump every pointer parameter whose direction is 'i' or 'd', handing
+ * both its test_data and its ref_data buffer to dump_array(). */
+void
+dump_test (OilTest *test)
+{
+  int idx;
+  for (idx = 0; idx < OIL_ARG_LAST; idx++) {
+    OilParameter *param = &test->params[idx];
+    if (!param->is_pointer) continue;
+    if (param->direction != 'i' && param->direction != 'd') continue;
+    printf (" %s:\n", param->parameter_name);
+    dump_array (param->test_data + OIL_TEST_HEADER,
+        param->ref_data + OIL_TEST_HEADER,
+        param->type, param->pre_n, param->stride, param->post_n);
+  }
+}
+
+/* Dump the src_data buffer of every pointer parameter whose direction is
+ * 'i' or 's'; the same buffer is passed as both arrays of dump_array(). */
+void
+dump_source (OilTest *test)
+{
+  int idx;
+  for (idx = 0; idx < OIL_ARG_LAST; idx++) {
+    OilParameter *param = &test->params[idx];
+    if (!param->is_pointer) continue;
+    if (param->direction != 'i' && param->direction != 's') continue;
+    printf (" %s:\n", param->parameter_name);
+    dump_array (param->src_data + OIL_TEST_HEADER,
+        param->src_data + OIL_TEST_HEADER,
+        param->type, param->pre_n, param->stride, param->post_n);
+  }
+}
+
+/* Usage: oil-test <class_name>.  Checks the reference impl of the class on
+ * a small test and dumps its arrays, then repeats for every other runnable
+ * impl.  Returns 0 on success, 1 if the class name is missing or unknown. */
+int main (int argc, char *argv[])
+{
+  OilFunctionClass *klass;
+  OilFunctionImpl *impl;
+  OilTest *test;
+  double ave, std;
+
+  oil_init ();
+  /* return rather than exit(): this file does not include <stdlib.h>, and
+   * a usage or lookup failure should not leave with status 0 */
+  if (argc < 2) {
+    printf("oil-test <class_name>\n");
+    return 1;
+  }
+  klass = oil_class_get (argv[1]);
+  if (klass == NULL) {
+    printf("class not found: %s\n", argv[1]);
+    return 1;
+  }
+  oil_class_optimize (klass);
+  test = oil_test_new(klass);
+  oil_test_set_iterations(test, 1);
+  test->n = 10;
+  test->m = 10;
+  impl = klass->reference_impl;
+  ave = impl->profile_ave;
+  std = impl->profile_std;
+  oil_test_check_impl (test, impl);
+  printf ("source array\n");
+  dump_source(test);
+  printf ("reference impl %s\n", impl->name);
+  printf(" ave=%g std=%g\n", ave, std);
+  dump_test(test);
+
+  for (impl = klass->first_impl; impl; impl = impl->next) {
+    if (impl == klass->reference_impl) continue;
+    printf ("impl %s\n", impl->name);
+    if (oil_impl_is_runnable (impl)) {
+      printf(" ave=%g std=%g\n", impl->profile_ave, impl->profile_std);
+      oil_test_check_impl (test, impl);
+      dump_test(test);
+    }
+  }
+  return 0;
+}
+
diff --git a/liboil/dct/Makefile.am b/liboil/dct/Makefile.am
index 9744e53..a407a45 100644
--- a/liboil/dct/Makefile.am
+++ b/liboil/dct/Makefile.am
@@ -25,7 +25,8 @@ c_sources = \
fdct8x8s_s16.c \
idct8_f64.c \
idct8x8_c.c \
- imdct32_f32.c
+ imdct32_f32.c \
+ idct8x8theora_ref.c
libdct_la_SOURCES = \
$(c_sources) \
diff --git a/liboil/dct/idct8x8_i386.c b/liboil/dct/idct8x8_i386.c
index 0f68c72..c1d79e9 100644
--- a/liboil/dct/idct8x8_i386.c
+++ b/liboil/dct/idct8x8_i386.c
@@ -37,20 +37,357 @@
OIL_DECLARE_CLASS (idct8x8_s16);
OIL_DECLARE_CLASS (dct8x8_s16);
-#if 0
+#define CONST(x) (32768.0*(x) + 0.5) // x scaled by 2^15; the +0.5 rounds (for x >= 0) when converted to int16_t
+
+#define C1_0000 (32767) // largest int16_t value, standing in for 1.0
+#define C0_9808 CONST(0.980785280)
+#define C0_9239 CONST(0.923879532)
+#define C0_8315 CONST(0.831469612)
+#define C0_7071 CONST(0.707106781)
+#define C0_5556 CONST(0.555570233)
+#define C0_3827 CONST(0.382683432)
+#define C0_1951 CONST(0.195090322)
+
+#define FOUR(x) { x, x, x, x } // one table row holding x in all four lanes
+#define MMX_CONST(x) {32768.0*(x) + 0.5,32768.0*(x) + 0.5,32768.0*(x) + 0.5,32768.0*(x) + 0.5} // NOTE(review): not referenced in the visible code
+
+static const int16_t
+dct_mmx_constants [][4] = { // each row is 4 x int16_t = 8 bytes; the asm addresses rows by byte offset (noted per row)
+ FOUR(0), // 0
+ FOUR(C0_9808), // 8
+ FOUR(C0_9239), // 16
+ FOUR(C0_8315), // 24
+ FOUR(C0_7071), // 32
+ FOUR(C0_5556), // 40
+ FOUR(C0_3827), // 48
+ FOUR(C0_1951), // 56
+ { 1, 1, -1, -1 }, // 64
+ { 1, -1, 1, -1 }, // 72
+ { C1_0000, C0_9239, C0_7071, C0_3827 }, // 80
+ { C1_0000, C0_3827, C0_7071, C0_9239 }, // 88
+ { C0_9808, C0_8315, C0_5556, C0_1951 }, // 96
+ { C0_8315, C0_1951, C0_9808, C0_5556 }, // 104
+ { 1, -1, -1, -1 }, // 112
+ { C0_5556, C0_9808, C0_1951, C0_8315 }, // 120
+ { 1, -1, 1, 1 }, // 128
+ { C0_1951, C0_5556, C0_8315, C0_9808 }, // 136
+ { 1, -1, 1, -1 }, // 144
+ FOUR(CONST(0.5)), //152
+ { C0_7071, C0_9239, C0_7071, C0_3827 }, // 160
+ { C0_7071, C0_3827, C0_7071, C0_9239 }, // 168
+};
+
static void
idct8x8_s16_mmx (int16_t *dest, int dstr, int16_t *src, int sstr)
{
+ int32_t tmp[32]; /* scratch for the asm: four 8-byte values are kept at 0..24(%4) */
+
asm volatile ( /* operands: %0 dest, %1 src, %2 dstr, %3 sstr, %4 tmp, %5 dct_mmx_constants */
- ""
+ /* left half: bytes 0-7 of each src row, combined lane-wise across the 8 rows */
+ " movl %1, %%eax \n" // src
+ " movl %3, %%ebx \n" // sstr
+ " leal (%%eax,%%ebx,4),%%ecx \n" // src + sstr * 4
+
+ " movq (%%eax), %%mm0 \n"
+ " movq (%%eax), %%mm1 \n"
+ " paddsw (%%ecx), %%mm0 \n" // ss07s34
+ " psubsw (%%ecx), %%mm1 \n" // ss16s25
+ " pmulhw 32(%5), %%mm0 \n" // .7071
+ " pmulhw 32(%5), %%mm1 \n" // .7071
+
+ " movq (%%eax,%%ebx,2), %%mm2 \n"
+ " movq (%%eax,%%ebx,2), %%mm3 \n"
+ " movq (%%ecx,%%ebx,2), %%mm4 \n"
+ " movq (%%ecx,%%ebx,2), %%mm5 \n"
+ " pmulhw 16(%5), %%mm2 \n" // .9239
+ " pmulhw 48(%5), %%mm3 \n" // .3827
+ " pmulhw 48(%5), %%mm4 \n" // .3827
+ " pmulhw 16(%5), %%mm5 \n" // .9239
+ " paddsw %%mm4, %%mm2 \n" // ds07s34
+ " psubsw %%mm5, %%mm3 \n" // ds16s25
+
+ " movq %%mm0, %%mm4 \n"
+ " movq %%mm1, %%mm5 \n"
+ " paddsw %%mm2, %%mm0 \n" // s07
+ " psubsw %%mm2, %%mm4 \n" // s34
+ " paddsw %%mm3, %%mm1 \n" // s16
+ " psubsw %%mm3, %%mm5 \n" // s25
+
+ " movq %%mm0, 0(%4) \n"
+ " movq %%mm1, 8(%4) \n"
+ " movq %%mm5, 16(%4) \n"
+ " movq %%mm4, 24(%4) \n"
+
+ " addl %3, %%eax \n"
+ " addl %3, %%ecx \n"
+
+ " movq (%%eax), %%mm0 \n"
+ " pmulhw 8(%5), %%mm0 \n"
+ " movq (%%eax,%%ebx,2), %%mm1 \n"
+ " pmulhw 24(%5), %%mm1 \n"
+ " paddsw %%mm1, %%mm0 \n"
+ " movq (%%ecx), %%mm1 \n"
+ " pmulhw 40(%5), %%mm1 \n"
+ " paddsw %%mm1, %%mm0 \n"
+ " movq (%%ecx,%%ebx,2), %%mm1 \n"
+ " pmulhw 56(%5), %%mm1 \n"
+ " paddsw %%mm1, %%mm0 \n" // d07
+
+ " movq (%%eax), %%mm2 \n"
+ " pmulhw 24(%5), %%mm2 \n"
+ " movq (%%eax,%%ebx,2), %%mm1 \n"
+ " pmulhw 56(%5), %%mm1 \n"
+ " psubsw %%mm1, %%mm2 \n"
+ " movq (%%ecx), %%mm1 \n"
+ " pmulhw 8(%5), %%mm1 \n"
+ " psubsw %%mm1, %%mm2 \n"
+ " movq (%%ecx,%%ebx,2), %%mm1 \n"
+ " pmulhw 40(%5), %%mm1 \n"
+ " psubsw %%mm1, %%mm2 \n" // d16
+
+ " movq (%%eax), %%mm3 \n"
+ " pmulhw 40(%5), %%mm3 \n"
+ " movq (%%eax,%%ebx,2), %%mm1 \n"
+ " pmulhw 8(%5), %%mm1 \n"
+ " psubsw %%mm1, %%mm3 \n"
+ " movq (%%ecx), %%mm1 \n"
+ " pmulhw 56(%5), %%mm1 \n"
+ " paddsw %%mm1, %%mm3 \n"
+ " movq (%%ecx,%%ebx,2), %%mm1 \n"
+ " pmulhw 24(%5), %%mm1 \n"
+ " paddsw %%mm1, %%mm3 \n" // d25
+
+ " movq (%%eax), %%mm4 \n"
+ " pmulhw 56(%5), %%mm4 \n"
+ " movq (%%eax,%%ebx,2), %%mm1 \n"
+ " pmulhw 40(%5), %%mm1 \n"
+ " psubsw %%mm1, %%mm4 \n"
+ " movq (%%ecx), %%mm1 \n"
+ " pmulhw 24(%5), %%mm1 \n"
+ " paddsw %%mm1, %%mm4 \n"
+ " movq (%%ecx,%%ebx,2), %%mm1 \n"
+ " pmulhw 8(%5), %%mm1 \n"
+ " psubsw %%mm1, %%mm4 \n" // d34
+
+ " movl %0, %%eax \n" // dest
+ " movl %2, %%ebx \n" // dstr
+ " leal (%%ebx, %%ebx, 2), %%edx \n" // dstr*3
+
+ " movq %%mm0, %%mm1 \n"
+ " paddsw 0(%4), %%mm1 \n"
+ " movq %%mm1, (%%eax) \n"
+
+ " movq %%mm2, %%mm1 \n"
+ " paddsw 8(%4), %%mm1 \n"
+ " movq %%mm1, (%%eax, %%ebx, 1) \n"
+
+ " movq %%mm3, %%mm1 \n"
+ " paddsw 16(%4), %%mm1 \n"
+ " movq %%mm1, (%%eax, %%ebx, 2) \n" // s25 + d25
+
+ " movq %%mm4, %%mm1 \n"
+ " paddsw 24(%4), %%mm1 \n"
+ " movq %%mm1, (%%eax, %%edx, 1) \n"
- : "+r" (dest), "+r" (src), "+r" (dstr), "+r" (sstr)
+ " leal (%%eax, %%ebx, 4), %%eax \n"
+ " movq 24(%4), %%mm1 \n"
+ " psubsw %%mm4, %%mm1 \n"
+ " movq %%mm1, (%%eax) \n"
+
+ " movq 16(%4), %%mm1 \n"
+ " psubsw %%mm3, %%mm1 \n"
+ " movq %%mm1, (%%eax, %%ebx, 1) \n"
+
+ " movq 8(%4), %%mm1 \n"
+ " psubsw %%mm2, %%mm1 \n"
+ " movq %%mm1, (%%eax, %%ebx, 2) \n"
+
+ " movq 0(%4), %%mm1 \n"
+ " psubsw %%mm0, %%mm1 \n"
+ " movq %%mm1, (%%eax, %%edx, 1) \n"
+
+ /* right half: the same sequence applied to bytes 8-15 of each row */
+ " movl %1, %%eax \n" // src
+ " movl %3, %%ebx \n" // sstr
+ " leal (%%eax,%%ebx,4),%%ecx \n" // src + sstr * 4
+
+ " movq 8(%%eax), %%mm0 \n"
+ " movq 8(%%eax), %%mm1 \n"
+ " paddsw 8(%%ecx), %%mm0 \n" // ss07s34
+ " psubsw 8(%%ecx), %%mm1 \n" // ss16s25
+ " pmulhw 32(%5), %%mm0 \n" // .7071
+ " pmulhw 32(%5), %%mm1 \n" // .7071
+
+ " movq 8(%%eax,%%ebx,2), %%mm2 \n"
+ " movq 8(%%eax,%%ebx,2), %%mm3 \n"
+ " movq 8(%%ecx,%%ebx,2), %%mm4 \n"
+ " movq 8(%%ecx,%%ebx,2), %%mm5 \n"
+ " pmulhw 16(%5), %%mm2 \n" // .9239
+ " pmulhw 48(%5), %%mm3 \n" // .3827
+ " pmulhw 48(%5), %%mm4 \n" // .3827
+ " pmulhw 16(%5), %%mm5 \n" // .9239
+ " paddsw %%mm4, %%mm2 \n" // ds07s34
+ " psubsw %%mm5, %%mm3 \n" // ds16s25
+
+ " movq %%mm0, %%mm4 \n"
+ " movq %%mm1, %%mm5 \n"
+ " paddsw %%mm2, %%mm0 \n" // s07
+ " psubsw %%mm2, %%mm4 \n" // s34
+ " paddsw %%mm3, %%mm1 \n" // s16
+ " psubsw %%mm3, %%mm5 \n" // s25
+
+ " movq %%mm0, 0(%4) \n"
+ " movq %%mm1, 8(%4) \n"
+ " movq %%mm5, 16(%4) \n"
+ " movq %%mm4, 24(%4) \n"
+
+ " addl %3, %%eax \n"
+ " addl %3, %%ecx \n"
+
+ " movq 8(%%eax), %%mm0 \n"
+ " pmulhw 8(%5), %%mm0 \n"
+ " movq 8(%%eax,%%ebx,2), %%mm1 \n"
+ " pmulhw 24(%5), %%mm1 \n"
+ " paddsw %%mm1, %%mm0 \n"
+ " movq 8(%%ecx), %%mm1 \n"
+ " pmulhw 40(%5), %%mm1 \n"
+ " paddsw %%mm1, %%mm0 \n"
+ " movq 8(%%ecx,%%ebx,2), %%mm1 \n"
+ " pmulhw 56(%5), %%mm1 \n"
+ " paddsw %%mm1, %%mm0 \n" // d07
+
+ " movq 8(%%eax), %%mm2 \n"
+ " pmulhw 24(%5), %%mm2 \n"
+ " movq 8(%%eax,%%ebx,2), %%mm1 \n"
+ " pmulhw 56(%5), %%mm1 \n"
+ " psubsw %%mm1, %%mm2 \n"
+ " movq 8(%%ecx), %%mm1 \n"
+ " pmulhw 8(%5), %%mm1 \n"
+ " psubsw %%mm1, %%mm2 \n"
+ " movq 8(%%ecx,%%ebx,2), %%mm1 \n"
+ " pmulhw 40(%5), %%mm1 \n"
+ " psubsw %%mm1, %%mm2 \n" // d16
+
+ " movq 8(%%eax), %%mm3 \n"
+ " pmulhw 40(%5), %%mm3 \n"
+ " movq 8(%%eax,%%ebx,2), %%mm1 \n"
+ " pmulhw 8(%5), %%mm1 \n"
+ " psubsw %%mm1, %%mm3 \n"
+ " movq 8(%%ecx), %%mm1 \n"
+ " pmulhw 56(%5), %%mm1 \n"
+ " paddsw %%mm1, %%mm3 \n"
+ " movq 8(%%ecx,%%ebx,2), %%mm1 \n"
+ " pmulhw 24(%5), %%mm1 \n"
+ " paddsw %%mm1, %%mm3 \n" // d25
+
+ " movq 8(%%eax), %%mm4 \n"
+ " pmulhw 56(%5), %%mm4 \n"
+ " movq 8(%%eax,%%ebx,2), %%mm1 \n"
+ " pmulhw 40(%5), %%mm1 \n"
+ " psubsw %%mm1, %%mm4 \n"
+ " movq 8(%%ecx), %%mm1 \n"
+ " pmulhw 24(%5), %%mm1 \n"
+ " paddsw %%mm1, %%mm4 \n"
+ " movq 8(%%ecx,%%ebx,2), %%mm1 \n"
+ " pmulhw 8(%5), %%mm1 \n"
+ " psubsw %%mm1, %%mm4 \n" // d34
+
+ " movl %0, %%eax \n" // dest
+ " movl %2, %%ebx \n" // dstr
+ " leal (%%ebx, %%ebx, 2), %%edx \n" // dstr*3
+
+ " movq %%mm0, %%mm1 \n"
+ " paddsw 0(%4), %%mm1 \n"
+ " movq %%mm1, 8(%%eax) \n"
+
+ " movq %%mm2, %%mm1 \n"
+ " paddsw 8(%4), %%mm1 \n"
+ " movq %%mm1, 8(%%eax, %%ebx, 1) \n"
+
+ " movq %%mm3, %%mm1 \n"
+ " paddsw 16(%4), %%mm1 \n"
+ " movq %%mm1, 8(%%eax, %%ebx, 2) \n" // s25 + d25
+
+ " movq %%mm4, %%mm1 \n"
+ " paddsw 24(%4), %%mm1 \n"
+ " movq %%mm1, 8(%%eax, %%edx, 1) \n"
+
+ " leal (%%eax, %%ebx, 4), %%eax \n"
+ " movq 24(%4), %%mm1 \n"
+ " psubsw %%mm4, %%mm1 \n"
+ " movq %%mm1, 8(%%eax) \n"
+
+ " movq 16(%4), %%mm1 \n"
+ " psubsw %%mm3, %%mm1 \n"
+ " movq %%mm1, 8(%%eax, %%ebx, 1) \n"
+
+ " movq 8(%4), %%mm1 \n"
+ " psubsw %%mm2, %%mm1 \n"
+ " movq %%mm1, 8(%%eax, %%ebx, 2) \n"
+
+ " movq 0(%4), %%mm1 \n"
+ " psubsw %%mm0, %%mm1 \n"
+ " movq %%mm1, 8(%%eax, %%edx, 1) \n"
+
+
+ /* rows: in-place pass over the 16 bytes of each dest row; LOOP is expanded 8 times below */
+ " movl %0, %%eax \n" /* dest */
+#define LOOP \
+ " pshufw $0x88, 0(%%eax), %%mm0 \n" /* x0 x2 x0 x2 */ \
+ " pshufw $0x88, 8(%%eax), %%mm1 \n" /* x4 x6 x4 x6 */ \
+ " pmulhw 160(%5), %%mm0 \n" /* 0.707 0.9239 0.707 0.3827 */ \
+ " pmulhw 168(%5), %%mm1 \n" /* 0.707 0.3827 0.707 0.9239 */ \
+ " pmullw 64(%5), %%mm1 \n" /* 1 1 -1 -1 */ \
+ " paddsw %%mm1, %%mm0 \n" /* ss07s34 ds07s34 ss16s25 ds16s25 */ \
+ \
+ " pshufw $0xa0, %%mm0, %%mm1 \n" /* ss07s34 ss07s34 ss16s25 ss16s25 */ \
+ " pshufw $0xf5, %%mm0, %%mm2 \n" /* ds07s34 ds07s34 ds16s25 ds16s25 */ \
+ " pmullw 72(%5), %%mm2 \n" /* 1 -1 1 -1 */ \
+ " paddsw %%mm2, %%mm1 \n" /* s07 s34 s16 s25 */ \
+ " pshufw $0x78, %%mm1, %%mm2 \n" /* s07 s16 s25 s34 */ \
+ \
+ " pshufw $0x55, 0(%%eax), %%mm0 \n" \
+ " pmulhw 96(%5), %%mm0 \n" \
+ " pshufw $0xff, 0(%%eax), %%mm1 \n" \
+ " pmulhw 104(%5), %%mm1 \n" \
+ " pmullw 112(%5), %%mm1 \n" \
+ " paddsw %%mm1, %%mm0 \n" \
+ " pshufw $0x55, 8(%%eax), %%mm1 \n" \
+ " pmulhw 120(%5), %%mm1 \n" \
+ " pmullw 128(%5), %%mm1 \n" \
+ " paddsw %%mm1, %%mm0 \n" \
+ " pshufw $0xff, 8(%%eax), %%mm1 \n" \
+ " pmulhw 136(%5), %%mm1 \n" \
+ " pmullw 144(%5), %%mm1 \n" \
+ " paddsw %%mm1, %%mm0 \n" \
+ \
+ " movq %%mm2, %%mm1 \n" \
+ " paddsw %%mm0, %%mm1 \n" \
+ " psubsw %%mm0, %%mm2 \n" \
+ " pshufw $0x1b, %%mm2, %%mm2 \n" \
+ \
+ " movq %%mm1, 0(%%eax) \n" \
+ " movq %%mm2, 8(%%eax) \n" \
+ " addl %3, %%eax \n" /* NOTE(review): steps the dest pointer by sstr (%3) rather than dstr (%2) - confirm */
+
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+#undef LOOP
+
+ " emms \n" /* clear the MMX state before returning to C code */
:
- : "ebx");
+ : "m" (dest), "m" (src), "m" (dstr), "m" (sstr), "r" (tmp), "r" (dct_mmx_constants)
+ : "eax", "ebx", "ecx", "edx"); /* NOTE(review): no "memory" clobber although dest and tmp are written - confirm */
}
OIL_DEFINE_IMPL_FULL (idct8x8_s16_mmx, idct8x8_s16, OIL_IMPL_FLAG_MMX);
-#endif
+#if 0
#define CONST(x) (32768.0*(x) + 0.5)
#define C1_0000 (32767)
@@ -87,6 +424,7 @@ dct_mmx_constants [][4] = {
{ C0_1951, C0_5556, C0_8315, C0_9808 }, // 136
{ 1, -1, 1, -1 },
};
+#endif
/* a 3dnow version can use pmulhrw instead of pmulhw for increased
* accuracy */
@@ -98,7 +436,6 @@ fdct8x8s_s16_mmx (uint16_t *dest, int dstr, uint16_t *src, int sstr)
asm volatile (
/* Note: this asm is unclean with %ebx, but it's not an issue
* in this particular case. */
-#if 1
/* first half */
" movl %1, %%eax \n" // src
" movl %3, %%ebx \n" // sstr
@@ -333,12 +670,8 @@ fdct8x8s_s16_mmx (uint16_t *dest, int dstr, uint16_t *src, int sstr)
" pmulhw 8(%5), %%mm1 \n"
" psubsw %%mm1, %%mm0 \n"
" movq %%mm0, (%%eax,%%edx) \n"
-#endif
-// " movl %1, %%eax \n" // src
" movl %0, %%ecx \n" // dest
-// " movl $8, %%edx \n"
-// "1: \n"
#define LOOP \
" movq (%%ecx), %%mm0 \n" \
@@ -397,9 +730,6 @@ fdct8x8s_s16_mmx (uint16_t *dest, int dstr, uint16_t *src, int sstr)
LOOP
LOOP
-// " decl %%edx \n"
-// " jne 1b\n"
-
" emms \n"
:
: "m" (dest), "m" (src), "m" (dstr), "m" (sstr), "r" (tmp), "r" (dct_mmx_constants)
diff --git a/liboil/dct/idct8x8theora_ref.c b/liboil/dct/idct8x8theora_ref.c
new file mode 100644
index 0000000..8b0c50b
--- /dev/null
+++ b/liboil/dct/idct8x8theora_ref.c
@@ -0,0 +1,200 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2001,2002,2003,2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboil.h>
+#include <liboil/liboiltest.h>
+#include <liboil/liboilrandom.h>
+#include <liboil/dct/dct.h>
+#include <math.h>
+
+/* Test setup hook passed to OIL_DEFINE_CLASS_FULL for idct8theora_s16:
+ * overwrites the 8 strided int16_t source elements with random values
+ * shifted right by 3. */
+static void
+idct8theora_s16_test (OilTest *test)
+{
+  uint16_t *src = (uint16_t *)(test->params[OIL_ARG_SRC1].src_data +
+      OIL_TEST_HEADER);
+  int sstr = test->params[OIL_ARG_SSTR1].value;
+  int k = 0;
+  while (k < 8) {
+    OIL_GET(src, k*sstr, int16_t) = oil_rand_s16() >> 3;
+    k++;
+  }
+}
+
+/* Test setup hook passed to OIL_DEFINE_CLASS_FULL for idct8x8theora_s16:
+ * fills the 8x8 int16_t source block, row by row, with random values
+ * shifted right by 3. */
+static void
+idct8x8theora_s16_test (OilTest *test)
+{
+  uint16_t *src = (uint16_t *)(test->params[OIL_ARG_SRC1].src_data +
+      OIL_TEST_HEADER);
+  int sstr = test->params[OIL_ARG_SSTR1].value;
+  int row, col;
+  for (row = 0; row < 8; row++) {
+    for (col = 0; col < 8; col++) {
+      OIL_GET(src, row*sstr + col*2, int16_t) = oil_rand_s16() >> 3;
+    }
+  }
+}
+
+OIL_DEFINE_CLASS_FULL (idct8theora_s16, "int16_t *d_8, int dstr, int16_t *s_8, int sstr", idct8theora_s16_test);
+OIL_DEFINE_CLASS_FULL (idct8x8theora_s16, "int16_t *d_8x8, int dstr, int16_t *s_8x8, int sstr", idct8x8theora_s16_test);
+
+
+
+#define C1 64277 // Ck = cos(k*pi/16) scaled by 65536 and rounded
+#define C2 60547
+#define C3 54491
+#define C4 46341
+#define C5 36410
+#define C6 25080
+#define C7 12785
+
+#define S7 64277 // Sk = sin(k*pi/16) scaled by 65536, so Sk has the same value as C(8-k)
+#define S6 60547
+#define S5 54491
+#define S4 46341
+#define S3 36410
+#define S2 25080
+#define S1 12785
+
+#define TRUNC(x) ((int16_t)x) // NOTE(review): x is not parenthesized - the cast binds only to the first operand of a compound argument
+#define MULT(a,b) (((a)*(b))>>16) // multiply by a constant scaled by 65536, then shift the scale back out
+
+/*
+ * Reference 8-point inverse DCT on int16_t data; the step ordering follows
+ * the Theora specification.
+ *
+ * dest, dstr - output samples and the byte stride between them
+ * src, sstr  - input coefficients and the byte stride between them
+ */
+static void
+idct8theora_s16_ref (int16_t *dest, int dstr, int16_t *src, int sstr)
+{
+  int32_t t[10];
+  int32_t r;
+
+#define Y(i) OIL_GET(src,sstr*(i),int16_t)
+  /* outputs go through dstr: stepping dest by sstr would scatter the
+   * results whenever the two strides differ */
+#define X(i) OIL_GET(dest,dstr*(i),int16_t)
+
+  /* the ordering here corresponds closely to the theora spec */
+  t[0] = MULT(C4, Y(0) + Y(4));
+  t[0] = TRUNC(t[0]);
+  t[1] = MULT(C4, Y(0) - Y(4));
+  t[1] = TRUNC(t[1]);
+  t[2] = MULT(C6, Y(2)) - MULT(S6, Y(6));
+  t[3] = MULT(S6, Y(2)) + MULT(C6, Y(6));
+  t[4] = MULT(C7, Y(1)) - MULT(S7, Y(7));
+  t[5] = MULT(C3, Y(5)) - MULT(S3, Y(3));
+  t[6] = MULT(S3, Y(5)) + MULT(C3, Y(3));
+  t[7] = MULT(S7, Y(1)) + MULT(C7, Y(7));
+
+  r = t[4] + t[5];
+  t[5] = MULT(C4, t[4] - t[5]);
+  t[5] = TRUNC(t[5]);
+  t[4] = r;
+  r = t[7] + t[6];
+  t[6] = MULT(C4, t[7] - t[6]);
+  t[6] = TRUNC(t[6]);
+  t[7] = r;
+
+  r = t[0] + t[3];
+  t[3] = t[0] - t[3];
+  t[0] = r;
+  r = t[1] + t[2];
+  t[2] = t[1] - t[2];
+  t[1] = r;
+
+  r = t[6] + t[5];
+  t[5] = t[6] - t[5];
+  t[6] = r;
+
+  /* TRUNC() does not parenthesize its argument, hence the doubled parens */
+  X(0) = TRUNC((t[0] + t[7]));
+  X(1) = TRUNC((t[1] + t[6]));
+  X(2) = TRUNC((t[2] + t[5]));
+  X(3) = TRUNC((t[3] + t[4]));
+  X(4) = TRUNC((t[3] - t[4]));
+  X(5) = TRUNC((t[2] - t[5]));
+  X(6) = TRUNC((t[1] - t[6]));
+  X(7) = TRUNC((t[0] - t[7]));
+#undef X
+#undef Y
+}
+OIL_DEFINE_IMPL_REF (idct8theora_s16_ref, idct8theora_s16);
+
+
+#if defined(oil_idct8theora_s16)
+/* 2-D reference: oil_idct8theora_s16 over each source row into tmp, then
+ * over each column of tmp into the matching column of dest. */
+static void
+idct8x8theora_s16_ref (int16_t *dest, int dstr, int16_t *src, int sstr)
+{
+  int i;
+  int16_t tmp[64];
+  for(i=0;i<8;i++){
+    oil_idct8theora_s16(OIL_OFFSET(tmp, 8*sizeof(int16_t) * i), sizeof(int16_t),
+        OIL_OFFSET(src, sstr * i), sizeof(int16_t));
+  }
+  for(i=0;i<8;i++){
+    /* a column of tmp is read with a stride of one 8-element row */
+    oil_idct8theora_s16(OIL_OFFSET(dest, sizeof(int16_t) * i), dstr,
+        OIL_OFFSET(tmp, sizeof(int16_t) * i), 8*sizeof(int16_t));
+  }
+}
+OIL_DEFINE_IMPL_REF (idct8x8theora_s16_ref, idct8x8theora_s16);
+#endif
+
+
+#if defined(oil_idct8_f64)
+/* Floating-point variant: converts src to double, runs oil_idct8_f64,
+ * doubles each result and converts back into dest, honouring sstr/dstr.
+ * NOTE(review): also registered with OIL_DEFINE_IMPL_REF - confirm intent. */
+static void
+idct8theora_s16_float (int16_t *dest, int dstr, int16_t *src, int sstr)
+{
+  int i;
+  double tmp1[8], tmp2[8];
+
+  oil_conv_f64_s16 (tmp1, sizeof(double), src, sstr, 8);
+  oil_idct8_f64 (tmp2, sizeof(double), tmp1, sizeof(double));
+  for(i=0;i<8;i++){ tmp2[i] *= 2.0; }
+  oil_conv_s16_f64 (dest, dstr, tmp2, sizeof(double), 8);
+}
+OIL_DEFINE_IMPL_REF (idct8theora_s16_float, idct8theora_s16);
+#endif
+
+
diff --git a/liboil/liboilfuncs.h b/liboil/liboilfuncs.h
index 35d47a9..1913923 100644
--- a/liboil/liboilfuncs.h
+++ b/liboil/liboilfuncs.h
@@ -375,6 +375,9 @@ typedef void (*_oil_type_fdct8x8s_s16)(int16_t * d_8x8, int ds, const int16_t *
extern OilFunctionClass *oil_function_class_ptr_idct8_f64;
typedef void (*_oil_type_idct8_f64)(double * d_8, int dstr, const double * s_8, int sstr);
#define oil_idct8_f64 ((_oil_type_idct8_f64)(*(void **)oil_function_class_ptr_idct8_f64))
+extern OilFunctionClass *oil_function_class_ptr_idct8theora_s16;
+typedef void (*_oil_type_idct8theora_s16)(int16_t * d_8x8, int dstr, const int16_t * s_8x8, int sstr);
+#define oil_idct8theora_s16 ((_oil_type_idct8theora_s16)(*(void **)oil_function_class_ptr_idct8theora_s16))
extern OilFunctionClass *oil_function_class_ptr_idct8x8_f64;
typedef void (*_oil_type_idct8x8_f64)(double * d_8x8, int dstr, const double * s_8x8, int sstr);
#define oil_idct8x8_f64 ((_oil_type_idct8x8_f64)(*(void **)oil_function_class_ptr_idct8x8_f64))
@@ -387,6 +390,9 @@ typedef void (*_oil_type_idct8x8lim10_f64)(double * d_8x8, int dstr, const doubl
extern OilFunctionClass *oil_function_class_ptr_idct8x8lim10_s16;
typedef void (*_oil_type_idct8x8lim10_s16)(int16_t * d_8x8, int dstr, const int16_t * s_8x8, int sstr);
#define oil_idct8x8lim10_s16 ((_oil_type_idct8x8lim10_s16)(*(void **)oil_function_class_ptr_idct8x8lim10_s16))
+extern OilFunctionClass *oil_function_class_ptr_idct8x8theora_s16;
+typedef void (*_oil_type_idct8x8theora_s16)(int16_t * d_8x8, int dstr, const int16_t * s_8x8, int sstr);
+#define oil_idct8x8theora_s16 ((_oil_type_idct8x8theora_s16)(*(void **)oil_function_class_ptr_idct8x8theora_s16))
extern OilFunctionClass *oil_function_class_ptr_imdct12_f64;
typedef void (*_oil_type_imdct12_f64)(double * d_12, const double * s_6);
#define oil_imdct12_f64 ((_oil_type_imdct12_f64)(*(void **)oil_function_class_ptr_imdct12_f64))