diff options
author | David Schleef <ds@schleef.org> | 2005-04-30 06:00:56 +0000 |
---|---|---|
committer | David Schleef <ds@schleef.org> | 2005-04-30 06:00:56 +0000 |
commit | 81e196203fe61e8dedc5f10e03b9ec7c9d2a7eff (patch) | |
tree | ed85b753dc0cc8ef5c1083c2f704e150cd52b504 | |
parent | eb6ae36041277cde75b484d3e0fc35277f83a52a (diff) | |
download | liboil-81e196203fe61e8dedc5f10e03b9ec7c9d2a7eff.tar.gz |
* examples/Makefile.am: add oil-test
* examples/oil-test.c: A copy of work.c modified for displaying
test results for any class.
* liboil/dct/Makefile.am:
* liboil/dct/idct8x8_i386.c: (idct8x8_s16_mmx), (fdct8x8s_s16_mmx):
Add mmx code for idct
* liboil/dct/idct8x8theora_ref.c: Add some classes for idct8x8
to the theora spec.
* liboil/liboilfuncs.h: update
-rw-r--r-- | ChangeLog | 12 | ||||
-rw-r--r-- | examples/Makefile.am | 6 | ||||
-rw-r--r-- | examples/oil-test.c | 201 | ||||
-rw-r--r-- | liboil/dct/Makefile.am | 3 | ||||
-rw-r--r-- | liboil/dct/idct8x8_i386.c | 356 | ||||
-rw-r--r-- | liboil/dct/idct8x8theora_ref.c | 200 | ||||
-rw-r--r-- | liboil/liboilfuncs.h | 6 |
7 files changed, 769 insertions, 15 deletions
@@ -1,3 +1,15 @@ +2005-04-29 David Schleef <ds@schleef.org> + + * examples/Makefile.am: add oil-test + * examples/oil-test.c: A copy of work.c modified for displaying + test results for any class. + * liboil/dct/Makefile.am: + * liboil/dct/idct8x8_i386.c: (idct8x8_s16_mmx), (fdct8x8s_s16_mmx): + Add mmx code for idct + * liboil/dct/idct8x8theora_ref.c: Add some classes for idct8x8 + to the theora spec. + * liboil/liboilfuncs.h: update + 2005-04-28 David Schleef <ds@schleef.org> Add an example huffman (variable code length) decoder diff --git a/examples/Makefile.am b/examples/Makefile.am index cb2eae1..9a1bda7 100644 --- a/examples/Makefile.am +++ b/examples/Makefile.am @@ -1,7 +1,7 @@ SUBDIRS = jpeg md5 uberopt work huffman -noinst_PROGRAMS = example1 oil-inspect +noinst_PROGRAMS = example1 oil-inspect oil-test example1_SOURCES = example1.c @@ -12,3 +12,7 @@ oil_inspect_SOURCES = oil-inspect.c oil_inspect_CFLAGS = $(LIBOIL_CFLAGS) oil_inspect_LDADD = $(LIBOIL_LIBS) +oil_test_SOURCES = oil-test.c +oil_test_CFLAGS = $(LIBOIL_CFLAGS) +oil_test_LDADD = $(LIBOIL_LIBS) + diff --git a/examples/oil-test.c b/examples/oil-test.c new file mode 100644 index 0000000..c7f369c --- /dev/null +++ b/examples/oil-test.c @@ -0,0 +1,201 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2004 David A. Schleef <ds@schleef.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboil.h>
+#include <liboil/liboilfunction.h>
+#include <liboil/liboiltest.h>
+#include <liboil/liboilrandom.h>
+#include <liboil/liboilcpu.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <stdio.h>
+
+void register_impls(void);
+
+/* Scratch check left over from work.c: fills 100 bytes with random values
+ * masked to 7 bits, runs oil_utf8_validate() over them and prints dest[0].
+ * Not called from main(). */
+void test(void)
+{
+  int32_t dest[1];
+  uint8_t src[100];
+  int i;
+
+  for(i=0;i<100;i++){
+    src[i] = oil_rand_u8() & 0x7f;
+  }
+  dest[0] = 0;
+
+  oil_utf8_validate (dest, src, 100);
+
+#if 0
+  for(i=0;i<100;i++){
+    printf("%d %d\n",dest[i],src[i]);
+  }
+#endif
+  printf("%d\n", dest[0]);
+
+}
+
+/*
+ * Print a post_n x pre_n array of elements of the given OilType.
+ *
+ *   data     - array to print
+ *   ref_data - array with the same layout to compare against
+ *   type     - element type; selects the C type and printf format
+ *   pre_n    - number of elements printed per output line
+ *   stride   - byte offset between consecutive output lines
+ *   post_n   - number of output lines
+ *
+ * An element is printed inside [brackets] when it differs from the
+ * corresponding ref_data element by 1.0 or more.  Types not listed in the
+ * switch print nothing.
+ */
+void
+dump_array (void *data, void *ref_data, OilType type, int pre_n, int stride,
+    int post_n)
+{
+  int i, j;
+  int s2 = oil_type_sizeof (type);
+  double x;
+
+/* The difference must be taken against ref_data; comparing data with
+ * itself always yields 0 and no mismatch would ever be flagged. */
+#define DUMP(type, format) do { \
+  for(i=0;i<post_n;i++){ \
+    printf(" "); \
+    for(j=0;j<pre_n;j++){ \
+      x = fabs(OIL_GET(data, i*stride + j*s2, type) - \
+          OIL_GET(ref_data, i*stride + j*s2, type)); \
+      if (x >= 1.0) { \
+        printf("[" format "] ", OIL_GET(data, i*stride + j*s2, type)); \
+      } else { \
+        printf(format " ", OIL_GET(data, i*stride + j*s2, type)); \
+      } \
+    } \
+    printf("\n"); \
+  } \
+} while(0)
+
+  switch(type) {
+    case OIL_TYPE_s8p:
+      DUMP(int8_t, "%d");
+      break;
+    case OIL_TYPE_u8p:
+      DUMP(uint8_t, "%d");
+      break;
+    case OIL_TYPE_s16p:
+      DUMP(int16_t, "%d");
+      break;
+    case OIL_TYPE_u16p:
+      DUMP(uint16_t, "%d");
+      break;
+    case OIL_TYPE_s32p:
+      DUMP(int32_t, "%d");
+      break;
+    case OIL_TYPE_u32p:
+      DUMP(uint32_t, "%u");
+      break;
+    case OIL_TYPE_f32p:
+      DUMP(float, "%g");
+      break;
+    case OIL_TYPE_f64p:
+      DUMP(double, "%g");
+      break;
+    default:
+      break;
+  }
+}
+
+/* Print every pointer parameter of the test whose direction is 'i' or 'd',
+ * showing test_data and marking elements that differ from ref_data. */
+void
+dump_test (OilTest *test)
+{
+  int i;
+  for(i=0;i<OIL_ARG_LAST;i++){
+    OilParameter *p = &test->params[i];
+    if (p->is_pointer) {
+      if (p->direction == 'i' || p->direction == 'd') {
+        printf (" %s:\n", p->parameter_name);
+        dump_array (p->test_data + OIL_TEST_HEADER,
+            p->ref_data + OIL_TEST_HEADER,
+            p->type, p->pre_n, p->stride, p->post_n);
+      }
+    }
+  }
+}
+
+/* Print every pointer parameter of the test whose direction is 'i' or 's'.
+ * src_data is passed as both arrays, so nothing is ever bracketed. */
+void
+dump_source (OilTest *test)
+{
+  int i;
+  for(i=0;i<OIL_ARG_LAST;i++){
+    OilParameter *p = &test->params[i];
+    if (p->is_pointer) {
+      if (p->direction == 'i' || p->direction == 's') {
+        printf (" %s:\n", p->parameter_name);
+        dump_array (p->src_data + OIL_TEST_HEADER,
+            p->src_data + OIL_TEST_HEADER,
+            p->type, p->pre_n, p->stride, p->post_n);
+      }
+    }
+  }
+}
+
+/* Usage: oil-test <class_name>
+ * Looks up the class, runs its reference implementation once on a test with
+ * n = m = 10 and dumps the source and result arrays, then does the same for
+ * each of the other implementations of the class. */
+int main (int argc, char *argv[])
+{
+  OilFunctionClass *klass;
+  OilFunctionImpl *impl;
+  OilTest *test;
+  double ave, std;
+
+  oil_init ();
+
+  if (argc < 2) {
+    printf("oil-test <class_name>\n");
+    exit(0);
+  }
+
+  klass = oil_class_get (argv[1]);
+  if (klass == NULL) {
+    printf("class not found: %s\n", argv[1]);
+    exit(0);
+  }
+  oil_class_optimize (klass);
+
+  test = oil_test_new(klass);
+  oil_test_set_iterations(test, 1);
+  test->n = 10;
+  test->m = 10;
+
+  impl = klass->reference_impl;
+  /* read the profile numbers before oil_test_check_impl() runs the impl */
+  ave = impl->profile_ave;
+  std = impl->profile_std;
+  oil_test_check_impl (test, impl);
+  printf ("source array\n");
+  dump_source(test);
+  printf ("reference impl %s\n", impl->name);
+  printf(" ave=%g std=%g\n", ave, std);
+  dump_test(test);
+
+  for (impl = klass->first_impl; impl; impl = impl->next) {
+    if (impl == klass->reference_impl)
+      continue;
+    printf ("impl %s\n", impl->name);
+    if (oil_impl_is_runnable (impl)) {
+      printf(" ave=%g std=%g\n", impl->profile_ave, impl->profile_std);
+      oil_test_check_impl (test, impl);
+      dump_test(test);
+    }
+  }
+
+  return 0;
+}
+
diff --git a/liboil/dct/Makefile.am b/liboil/dct/Makefile.am
index 9744e53..a407a45 100644
--- a/liboil/dct/Makefile.am
+++ b/liboil/dct/Makefile.am
@@ -25,7 +25,8 @@ c_sources = \
 	fdct8x8s_s16.c \
 	idct8_f64.c \
 	idct8x8_c.c \
-	imdct32_f32.c
+	imdct32_f32.c \
+	idct8x8theora_ref.c
 
 libdct_la_SOURCES = \
 	$(c_sources) \
diff --git a/liboil/dct/idct8x8_i386.c b/liboil/dct/idct8x8_i386.c
index 0f68c72..c1d79e9 100644
--- a/liboil/dct/idct8x8_i386.c
+++ b/liboil/dct/idct8x8_i386.c
@@ -37,20 +37,357 @@
 OIL_DECLARE_CLASS (idct8x8_s16);
 OIL_DECLARE_CLASS (dct8x8_s16);
 
-#if 0
+#define CONST(x) (32768.0*(x) + 0.5)
+
+#define C1_0000 (32767)
+#define C0_9808 CONST(0.980785280)
+#define C0_9239 CONST(0.923879532)
+#define C0_8315 CONST(0.831469612)
+#define C0_7071 CONST(0.707106781)
+#define C0_5556 CONST(0.555570233)
+#define C0_3827 CONST(0.382683432)
+#define C0_1951 CONST(0.195090322)
+
+#define FOUR(x) { x, x, x, x }
+#define MMX_CONST(x) {32768.0*(x) + 0.5,32768.0*(x) + 0.5,32768.0*(x) + 0.5,32768.0*(x) + 0.5}
+
+/* Constant table used by the MMX code below.  Each row is four int16_t
+ * (8 bytes); the asm addresses rows by byte offset from the table base,
+ * which is what the trailing "// NN" comments record.  Do not reorder or
+ * insert rows without updating the offsets in the asm. */
+static const int16_t
+dct_mmx_constants [][4] = {
+  FOUR(0),
+  FOUR(C0_9808),
+  FOUR(C0_9239),
+  FOUR(C0_8315),
+  FOUR(C0_7071),
+  FOUR(C0_5556),
+  FOUR(C0_3827),
+  FOUR(C0_1951),
+  { 1, 1, -1, -1 }, // 64
+  { 1, -1, 1, -1 },
+  { C1_0000, C0_9239, C0_7071, C0_3827 }, // 80
+  { C1_0000, C0_3827, C0_7071, C0_9239 }, // 88
+  { C0_9808, C0_8315, C0_5556, C0_1951 }, // 96
+  { C0_8315, C0_1951, C0_9808, C0_5556 }, // 104
+  { 1, -1, -1, -1 },
+  { C0_5556, C0_9808, C0_1951, C0_8315 }, // 120
+  { 1, -1, 1, 1 },
+  { C0_1951, C0_5556, C0_8315, C0_9808 }, // 136
+  { 1, -1, 1, -1 },
+  FOUR(CONST(0.5)), //152
+  { C0_7071, C0_9239, C0_7071, C0_3827 }, // 160
+  { C0_7071, C0_3827, C0_7071, C0_9239 }, // 168
+};
+
+/*
+ * 8x8 inverse DCT on int16_t data using MMX.
+ *
+ *   dest/dstr - output block and its row stride in bytes
+ *   src/sstr  - input block and its row stride in bytes
+ *
+ * The column pass is done in two halves of four columns each (one 8-byte
+ * movq per row): the even-row terms s07/s16/s25/s34 are parked in tmp, the
+ * odd-row terms d07/d16/d25/d34 are kept in mm0/mm2/mm3/mm4, and their sums
+ * and differences are written to dest.  The row pass (LOOP, expanded eight
+ * times) then transforms each row of dest in place.
+ *
+ * %eax, %ebx, %ecx and %edx are used as scratch and listed as clobbers.
+ * The asm stores to tmp and *dest, hence the "memory" clobber.
+ */
 static void
 idct8x8_s16_mmx (int16_t *dest, int dstr, int16_t *src, int sstr)
 {
+  int32_t tmp[32];
+
   asm volatile (
-      ""
+      /* left half */
+      " movl %1, %%eax \n" // src
+      " movl %3, %%ebx \n" // sstr
+      " leal (%%eax,%%ebx,4),%%ecx \n" // src + sstr * 4
+
+      " movq (%%eax), %%mm0 \n"
+      " movq (%%eax), %%mm1 \n"
+      " paddsw (%%ecx), %%mm0 \n" // ss07s34
+      " psubsw (%%ecx), %%mm1 \n" // ss16s25
+      " pmulhw 32(%5), %%mm0 \n" // .7071
+      " pmulhw 32(%5), %%mm1 \n" // .7071
+
+      " movq (%%eax,%%ebx,2), %%mm2 \n"
+      " movq (%%eax,%%ebx,2), %%mm3 \n"
+      " movq (%%ecx,%%ebx,2), %%mm4 \n"
+      " movq (%%ecx,%%ebx,2), %%mm5 \n"
+      " pmulhw 16(%5), %%mm2 \n" // .9239
+      " pmulhw 48(%5), %%mm3 \n" // .3827
+      " pmulhw 48(%5), %%mm4 \n" // .3827
+      " pmulhw 16(%5), %%mm5 \n" // .9239
+      " paddsw %%mm4, %%mm2 \n" // ds07s34
+      " psubsw %%mm5, %%mm3 \n" // ds16s25
+
+      " movq %%mm0, %%mm4 \n"
+      " movq %%mm1, %%mm5 \n"
+      " paddsw %%mm2, %%mm0 \n" // s07
+      " psubsw %%mm2, %%mm4 \n" // s34
+      " paddsw %%mm3, %%mm1 \n" // s16
+      " psubsw %%mm3, %%mm5 \n" // s25
+
+      " movq %%mm0, 0(%4) \n"
+      " movq %%mm1, 8(%4) \n"
+      " movq %%mm5, 16(%4) \n"
+      " movq %%mm4, 24(%4) \n"
+
+      " addl %3, %%eax \n"
+      " addl %3, %%ecx \n"
+
+      " movq (%%eax), %%mm0 \n"
+      " pmulhw 8(%5), %%mm0 \n"
+      " movq (%%eax,%%ebx,2), %%mm1 \n"
+      " pmulhw 24(%5), %%mm1 \n"
+      " paddsw %%mm1, %%mm0 \n"
+      " movq (%%ecx), %%mm1 \n"
+      " pmulhw 40(%5), %%mm1 \n"
+      " paddsw %%mm1, %%mm0 \n"
+      " movq (%%ecx,%%ebx,2), %%mm1 \n"
+      " pmulhw 56(%5), %%mm1 \n"
+      " paddsw %%mm1, %%mm0 \n" // d07
+
+      " movq (%%eax), %%mm2 \n"
+      " pmulhw 24(%5), %%mm2 \n"
+      " movq (%%eax,%%ebx,2), %%mm1 \n"
+      " pmulhw 56(%5), %%mm1 \n"
+      " psubsw %%mm1, %%mm2 \n"
+      " movq (%%ecx), %%mm1 \n"
+      " pmulhw 8(%5), %%mm1 \n"
+      " psubsw %%mm1, %%mm2 \n"
+      " movq (%%ecx,%%ebx,2), %%mm1 \n"
+      " pmulhw 40(%5), %%mm1 \n"
+      " psubsw %%mm1, %%mm2 \n" // d16
+
+      " movq (%%eax), %%mm3 \n"
+      " pmulhw 40(%5), %%mm3 \n"
+      " movq (%%eax,%%ebx,2), %%mm1 \n"
+      " pmulhw 8(%5), %%mm1 \n"
+      " psubsw %%mm1, %%mm3 \n"
+      " movq (%%ecx), %%mm1 \n"
+      " pmulhw 56(%5), %%mm1 \n"
+      " paddsw %%mm1, %%mm3 \n"
+      " movq (%%ecx,%%ebx,2), %%mm1 \n"
+      " pmulhw 24(%5), %%mm1 \n"
+      " paddsw %%mm1, %%mm3 \n" // d25
+
+      " movq (%%eax), %%mm4 \n"
+      " pmulhw 56(%5), %%mm4 \n"
+      " movq (%%eax,%%ebx,2), %%mm1 \n"
+      " pmulhw 40(%5), %%mm1 \n"
+      " psubsw %%mm1, %%mm4 \n"
+      " movq (%%ecx), %%mm1 \n"
+      " pmulhw 24(%5), %%mm1 \n"
+      " paddsw %%mm1, %%mm4 \n"
+      " movq (%%ecx,%%ebx,2), %%mm1 \n"
+      " pmulhw 8(%5), %%mm1 \n"
+      " psubsw %%mm1, %%mm4 \n" // d34
+
+      " movl %0, %%eax \n" // dest
+      " movl %2, %%ebx \n" // dstr
+      " leal (%%ebx, %%ebx, 2), %%edx \n" // dstr*3
+
+      " movq %%mm0, %%mm1 \n"
+      " paddsw 0(%4), %%mm1 \n"
+      " movq %%mm1, (%%eax) \n"
+
+      " movq %%mm2, %%mm1 \n"
+      " paddsw 8(%4), %%mm1 \n"
+      " movq %%mm1, (%%eax, %%ebx, 1) \n"
+
+      " movq %%mm3, %%mm1 \n"
+      " paddsw 16(%4), %%mm1 \n"
+      " movq %%mm1, (%%eax, %%ebx, 2) \n" // s25 + d25
+
+      " movq %%mm4, %%mm1 \n"
+      " paddsw 24(%4), %%mm1 \n"
+      " movq %%mm1, (%%eax, %%edx, 1) \n"
 
-      : "+r" (dest), "+r" (src), "+r" (dstr), "+r" (sstr)
+      " leal (%%eax, %%ebx, 4), %%eax \n"
+      " movq 24(%4), %%mm1 \n"
+      " psubsw %%mm4, %%mm1 \n"
+      " movq %%mm1, (%%eax) \n"
+
+      " movq 16(%4), %%mm1 \n"
+      " psubsw %%mm3, %%mm1 \n"
+      " movq %%mm1, (%%eax, %%ebx, 1) \n"
+
+      " movq 8(%4), %%mm1 \n"
+      " psubsw %%mm2, %%mm1 \n"
+      " movq %%mm1, (%%eax, %%ebx, 2) \n"
+
+      " movq 0(%4), %%mm1 \n"
+      " psubsw %%mm0, %%mm1 \n"
+      " movq %%mm1, (%%eax, %%edx, 1) \n"
+
+      /* right half */
+      " movl %1, %%eax \n" // src
+      " movl %3, %%ebx \n" // sstr
+      " leal (%%eax,%%ebx,4),%%ecx \n" // src + sstr * 4
+
+      " movq 8(%%eax), %%mm0 \n"
+      " movq 8(%%eax), %%mm1 \n"
+      " paddsw 8(%%ecx), %%mm0 \n" // ss07s34
+      " psubsw 8(%%ecx), %%mm1 \n" // ss16s25
+      " pmulhw 32(%5), %%mm0 \n" // .7071
+      " pmulhw 32(%5), %%mm1 \n" // .7071
+
+      " movq 8(%%eax,%%ebx,2), %%mm2 \n"
+      " movq 8(%%eax,%%ebx,2), %%mm3 \n"
+      " movq 8(%%ecx,%%ebx,2), %%mm4 \n"
+      " movq 8(%%ecx,%%ebx,2), %%mm5 \n"
+      " pmulhw 16(%5), %%mm2 \n" // .9239
+      " pmulhw 48(%5), %%mm3 \n" // .3827
+      " pmulhw 48(%5), %%mm4 \n" // .3827
+      " pmulhw 16(%5), %%mm5 \n" // .9239
+      " paddsw %%mm4, %%mm2 \n" // ds07s34
+      " psubsw %%mm5, %%mm3 \n" // ds16s25
+
+      " movq %%mm0, %%mm4 \n"
+      " movq %%mm1, %%mm5 \n"
+      " paddsw %%mm2, %%mm0 \n" // s07
+      " psubsw %%mm2, %%mm4 \n" // s34
+      " paddsw %%mm3, %%mm1 \n" // s16
+      " psubsw %%mm3, %%mm5 \n" // s25
+
+      " movq %%mm0, 0(%4) \n"
+      " movq %%mm1, 8(%4) \n"
+      " movq %%mm5, 16(%4) \n"
+      " movq %%mm4, 24(%4) \n"
+
+      " addl %3, %%eax \n"
+      " addl %3, %%ecx \n"
+
+      " movq 8(%%eax), %%mm0 \n"
+      " pmulhw 8(%5), %%mm0 \n"
+      " movq 8(%%eax,%%ebx,2), %%mm1 \n"
+      " pmulhw 24(%5), %%mm1 \n"
+      " paddsw %%mm1, %%mm0 \n"
+      " movq 8(%%ecx), %%mm1 \n"
+      " pmulhw 40(%5), %%mm1 \n"
+      " paddsw %%mm1, %%mm0 \n"
+      " movq 8(%%ecx,%%ebx,2), %%mm1 \n"
+      " pmulhw 56(%5), %%mm1 \n"
+      " paddsw %%mm1, %%mm0 \n" // d07
+
+      " movq 8(%%eax), %%mm2 \n"
+      " pmulhw 24(%5), %%mm2 \n"
+      " movq 8(%%eax,%%ebx,2), %%mm1 \n"
+      " pmulhw 56(%5), %%mm1 \n"
+      " psubsw %%mm1, %%mm2 \n"
+      " movq 8(%%ecx), %%mm1 \n"
+      " pmulhw 8(%5), %%mm1 \n"
+      " psubsw %%mm1, %%mm2 \n"
+      " movq 8(%%ecx,%%ebx,2), %%mm1 \n"
+      " pmulhw 40(%5), %%mm1 \n"
+      " psubsw %%mm1, %%mm2 \n" // d16
+
+      " movq 8(%%eax), %%mm3 \n"
+      " pmulhw 40(%5), %%mm3 \n"
+      " movq 8(%%eax,%%ebx,2), %%mm1 \n"
+      " pmulhw 8(%5), %%mm1 \n"
+      " psubsw %%mm1, %%mm3 \n"
+      " movq 8(%%ecx), %%mm1 \n"
+      " pmulhw 56(%5), %%mm1 \n"
+      " paddsw %%mm1, %%mm3 \n"
+      " movq 8(%%ecx,%%ebx,2), %%mm1 \n"
+      " pmulhw 24(%5), %%mm1 \n"
+      " paddsw %%mm1, %%mm3 \n" // d25
+
+      " movq 8(%%eax), %%mm4 \n"
+      " pmulhw 56(%5), %%mm4 \n"
+      " movq 8(%%eax,%%ebx,2), %%mm1 \n"
+      " pmulhw 40(%5), %%mm1 \n"
+      " psubsw %%mm1, %%mm4 \n"
+      " movq 8(%%ecx), %%mm1 \n"
+      " pmulhw 24(%5), %%mm1 \n"
+      " paddsw %%mm1, %%mm4 \n"
+      " movq 8(%%ecx,%%ebx,2), %%mm1 \n"
+      " pmulhw 8(%5), %%mm1 \n"
+      " psubsw %%mm1, %%mm4 \n" // d34
+
+      " movl %0, %%eax \n" // dest
+      " movl %2, %%ebx \n" // dstr
+      " leal (%%ebx, %%ebx, 2), %%edx \n" // dstr*3
+
+      " movq %%mm0, %%mm1 \n"
+      " paddsw 0(%4), %%mm1 \n"
+      " movq %%mm1, 8(%%eax) \n"
+
+      " movq %%mm2, %%mm1 \n"
+      " paddsw 8(%4), %%mm1 \n"
+      " movq %%mm1, 8(%%eax, %%ebx, 1) \n"
+
+      " movq %%mm3, %%mm1 \n"
+      " paddsw 16(%4), %%mm1 \n"
+      " movq %%mm1, 8(%%eax, %%ebx, 2) \n" // s25 + d25
+
+      " movq %%mm4, %%mm1 \n"
+      " paddsw 24(%4), %%mm1 \n"
+      " movq %%mm1, 8(%%eax, %%edx, 1) \n"
+
+      " leal (%%eax, %%ebx, 4), %%eax \n"
+      " movq 24(%4), %%mm1 \n"
+      " psubsw %%mm4, %%mm1 \n"
+      " movq %%mm1, 8(%%eax) \n"
+
+      " movq 16(%4), %%mm1 \n"
+      " psubsw %%mm3, %%mm1 \n"
+      " movq %%mm1, 8(%%eax, %%ebx, 1) \n"
+
+      " movq 8(%4), %%mm1 \n"
+      " psubsw %%mm2, %%mm1 \n"
+      " movq %%mm1, 8(%%eax, %%ebx, 2) \n"
+
+      " movq 0(%4), %%mm1 \n"
+      " psubsw %%mm0, %%mm1 \n"
+      " movq %%mm1, 8(%%eax, %%edx, 1) \n"
+
+
+      /* rows */
+      " movl %0, %%eax \n" /* dest */
+#define LOOP \
+      " pshufw $0x88, 0(%%eax), %%mm0 \n" /* x0 x2 x0 x2 */ \
+      " pshufw $0x88, 8(%%eax), %%mm1 \n" /* x4 x6 x4 x6 */ \
+      " pmulhw 160(%5), %%mm0 \n" /* 0.707 0.9239 0.707 0.3827 */ \
+      " pmulhw 168(%5), %%mm1 \n" /* 0.707 0.3827 0.707 0.9239 */ \
+      " pmullw 64(%5), %%mm1 \n" /* 1 1 -1 -1 */ \
+      " paddsw %%mm1, %%mm0 \n" /* ss07s34 ds07s34 ss16s25 ds16s25 */ \
+      \
+      " pshufw $0xa0, %%mm0, %%mm1 \n" /* ss07s34 ss07s34 ss16s25 ss16s25 */ \
+      " pshufw $0xf5, %%mm0, %%mm2 \n" /* ds07s34 ds07s34 ds16s25 ds16s25 */ \
+      " pmullw 72(%5), %%mm2 \n" /* 1 -1 1 -1 */ \
+      " paddsw %%mm2, %%mm1 \n" /* s07 s34 s16 s25 */ \
+      " pshufw $0x78, %%mm1, %%mm2 \n" /* s07 s16 s25 s34 */ \
+      \
+      " pshufw $0x55, 0(%%eax), %%mm0 \n" \
+      " pmulhw 96(%5), %%mm0 \n" \
+      " pshufw $0xff, 0(%%eax), %%mm1 \n" \
+      " pmulhw 104(%5), %%mm1 \n" \
+      " pmullw 112(%5), %%mm1 \n" \
+      " paddsw %%mm1, %%mm0 \n" \
+      " pshufw $0x55, 8(%%eax), %%mm1 \n" \
+      " pmulhw 120(%5), %%mm1 \n" \
+      " pmullw 128(%5), %%mm1 \n" \
+      " paddsw %%mm1, %%mm0 \n" \
+      " pshufw $0xff, 8(%%eax), %%mm1 \n" \
+      " pmulhw 136(%5), %%mm1 \n" \
+      " pmullw 144(%5), %%mm1 \n" \
+      " paddsw %%mm1, %%mm0 \n" \
+      \
+      " movq %%mm2, %%mm1 \n" \
+      " paddsw %%mm0, %%mm1 \n" \
+      " psubsw %%mm0, %%mm2 \n" \
+      " pshufw $0x1b, %%mm2, %%mm2 \n" \
+      \
+      " movq %%mm1, 0(%%eax) \n" \
+      " movq %%mm2, 8(%%eax) \n" \
+      " addl %2, %%eax \n" /* next row of dest: step by dstr, not sstr */
+
+      LOOP
+      LOOP
+      LOOP
+      LOOP
+      LOOP
+      LOOP
+      LOOP
+      LOOP
+#undef LOOP
+
+      " emms \n"
       :
-      : "ebx");
+      : "m" (dest), "m" (src), "m" (dstr), "m" (sstr), "r" (tmp), "r" (dct_mmx_constants)
+      : "eax", "ebx", "ecx", "edx", "memory");
 }
 
+/* NOTE(review): the row pass uses pshufw, which is not part of the original
+ * MMX instruction set (it arrived with SSE / the AMD MMX extensions).  If
+ * this liboil revision has a flag for that, it probably belongs here next
+ * to OIL_IMPL_FLAG_MMX - confirm against liboilfunction.h. */
 OIL_DEFINE_IMPL_FULL (idct8x8_s16_mmx, idct8x8_s16, OIL_IMPL_FLAG_MMX);
 
-#endif
+#if 0
 #define CONST(x) (32768.0*(x) + 0.5)
 
 #define C1_0000 (32767)
@@ -87,6 +424,7 @@ dct_mmx_constants [][4] = {
   { C0_1951, C0_5556, C0_8315, C0_9808 }, // 136
   { 1, -1, 1, -1 },
 };
+#endif
 
 /* a 3dnow version can use pmulhrw instead of pmulhw for increased
  * accuracy */
@@ -98,7 +436,6 @@ fdct8x8s_s16_mmx (uint16_t *dest, int dstr, uint16_t *src, int sstr)
   asm volatile (
       /* Note: this asm is unclean with %ebx, but it's not an issue
        * in this particular case. */
-#if 1
       /* first half */
       " movl %1, %%eax \n" // src
       " movl %3, %%ebx \n" // sstr
@@ -333,12 +670,8 @@ fdct8x8s_s16_mmx (uint16_t *dest, int dstr, uint16_t *src, int sstr)
       " pmulhw 8(%5), %%mm1 \n"
       " psubsw %%mm1, %%mm0 \n"
       " movq %%mm0, (%%eax,%%edx) \n"
-#endif
 
-// " movl %1, %%eax \n" // src
       " movl %0, %%ecx \n" // dest
-// " movl $8, %%edx \n"
-// "1: \n"
 
 #define LOOP \
       " movq (%%ecx), %%mm0 \n" \
@@ -397,9 +730,6 @@ fdct8x8s_s16_mmx (uint16_t *dest, int dstr, uint16_t *src, int sstr)
       LOOP
       LOOP
 
-// " decl %%edx \n"
-// " jne 1b\n"
-
       " emms \n"
       :
       : "m" (dest), "m" (src), "m" (dstr), "m" (sstr), "r" (tmp), "r" (dct_mmx_constants)
diff --git a/liboil/dct/idct8x8theora_ref.c b/liboil/dct/idct8x8theora_ref.c
new file mode 100644
index 0000000..8b0c50b
--- /dev/null
+++ b/liboil/dct/idct8x8theora_ref.c
@@ -0,0 +1,200 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2001,2002,2003,2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboil.h>
+#include <liboil/liboiltest.h>
+#include <liboil/liboilrandom.h>
+#include <liboil/dct/dct.h>
+#include <math.h>
+
+/* Test-data initialiser for idct8theora_s16: overwrites the 8 source
+ * elements (spaced by the SSTR1 stride) with random int16 values shifted
+ * right by 3. */
+static void
+idct8theora_s16_test (OilTest *test)
+{
+  int i;
+  int stride = test->params[OIL_ARG_SSTR1].value;
+  uint16_t *ptr = (uint16_t *)(test->params[OIL_ARG_SRC1].src_data +
+      OIL_TEST_HEADER);
+
+  for(i=0;i<8;i++){
+    OIL_GET(ptr, i*stride, int16_t) = oil_rand_s16() >> 3;
+    //OIL_GET(ptr, i*stride, int16_t) = 0;
+  }
+  //OIL_GET(ptr, 0*stride, int16_t) = 100;
+
+}
+
+/* Test-data initialiser for idct8x8theora_s16: overwrites the 8x8 source
+ * block (rows spaced by the SSTR1 stride, elements 2 bytes apart) with
+ * random int16 values shifted right by 3. */
+static void
+idct8x8theora_s16_test (OilTest *test)
+{
+  int i;
+  int j;
+  int stride = test->params[OIL_ARG_SSTR1].value;
+  uint16_t *ptr = (uint16_t *)(test->params[OIL_ARG_SRC1].src_data +
+      OIL_TEST_HEADER);
+
+  for(i=0;i<8;i++){
+    for(j=0;j<8;j++){
+      OIL_GET(ptr, i*stride + j*2, int16_t) = oil_rand_s16() >> 3;
+    }
+  }
+
+}
+
+OIL_DEFINE_CLASS_FULL (idct8theora_s16, "int16_t *d_8, int dstr, int16_t *s_8, int sstr", idct8theora_s16_test);
+OIL_DEFINE_CLASS_FULL (idct8x8theora_s16, "int16_t *d_8x8, int dstr, int16_t *s_8x8, int sstr", idct8x8theora_s16_test);
+
+
+
+/* Ck = round(65536 * cos(k*pi/16)) and Sk = round(65536 * sin(k*pi/16)),
+ * so Sk has the same value as C(8-k).  MULT() drops the 16 fraction bits. */
+#define C1 64277
+#define C2 60547
+#define C3 54491
+#define C4 46341
+#define C5 36410
+#define C6 25080
+#define C7 12785
+
+#define S7 64277
+#define S6 60547
+#define S5 54491
+#define S4 46341
+#define S3 36410
+#define S2 25080
+#define S1 12785
+
+/* TRUNC keeps the low 16 bits as a signed value; the argument is
+ * parenthesised so that the cast applies to the whole expression. */
+#define TRUNC(x) ((int16_t)(x))
+/* NOTE(review): the product is computed in int; this assumes the inputs are
+ * small enough that it does not overflow (the class test feeds values that
+ * were shifted right by 3) - confirm the intended input range. */
+#define MULT(a,b) (((a)*(b))>>16)
+
+/*
+ * Reference 1-D 8-point inverse DCT for the idct8theora_s16 class.
+ *
+ *   dest/dstr - 8 output elements, dstr bytes apart
+ *   src/sstr  - 8 input elements, sstr bytes apart
+ */
+static void
+idct8theora_s16_ref (int16_t *dest, int dstr, int16_t *src, int sstr)
+{
+  int32_t t[10];
+  int32_t r;
+
+/* Y(i) reads input element i; X(i) addresses output element i.  The output
+ * must be indexed with the destination stride, not the source stride. */
+#define Y(i) OIL_GET(src,sstr*(i),int16_t)
+#define X(i) OIL_GET(dest,dstr*(i),int16_t)
+
+  /* the ordering here corresponds closely to the theora spec */
+  t[0] = MULT(C4, Y(0) + Y(4));
+  t[0] = TRUNC(t[0]);
+  t[1] = MULT(C4, Y(0) - Y(4));
+  t[1] = TRUNC(t[1]);
+  t[2] = MULT(C6, Y(2)) - MULT(S6, Y(6));
+  t[3] = MULT(S6, Y(2)) + MULT(C6, Y(6));
+  t[4] = MULT(C7, Y(1)) - MULT(S7, Y(7));
+  t[5] = MULT(C3, Y(5)) - MULT(S3, Y(3));
+  t[6] = MULT(S3, Y(5)) + MULT(C3, Y(3));
+  t[7] = MULT(S7, Y(1)) + MULT(C7, Y(7));
+  r = t[4] + t[5];
+  t[5] = MULT(C4, t[4] - t[5]);
+  t[5] = TRUNC(t[5]);
+  t[4] = r;
+  r = t[7] + t[6];
+  t[6] = MULT(C4, t[7] - t[6]);
+  t[6] = TRUNC(t[6]);
+  t[7] = r;
+  r = t[0] + t[3];
+  t[3] = t[0] - t[3];
+  t[0] = r;
+  r = t[1] + t[2];
+  t[2] = t[1] - t[2];
+  t[1] = r;
+  r = t[6] + t[5];
+  t[5] = t[6] - t[5];
+  t[6] = r;
+  r = t[0] + t[7];
+  r = TRUNC(r);
+  X(0) = r;
+  r = t[1] + t[6];
+  r = TRUNC(r);
+  X(1) = r;
+  r = t[2] + t[5];
+  r = TRUNC(r);
+  X(2) = r;
+  r = t[3] + t[4];
+  r = TRUNC(r);
+  X(3) = r;
+  r = t[3] - t[4];
+  r = TRUNC(r);
+  X(4) = r;
+  r = t[2] - t[5];
+  r = TRUNC(r);
+  X(5) = r;
+  r = t[1] - t[6];
+  r = TRUNC(r);
+  X(6) = r;
+  r = t[0] - t[7];
+  r = TRUNC(r);
+  X(7) = r;
+}
+OIL_DEFINE_IMPL_REF (idct8theora_s16_ref, idct8theora_s16);
+
+
+#if defined(oil_idct8theora_s16)
+/*
+ * Reference 8x8 inverse DCT built from two passes of oil_idct8theora_s16:
+ * first each source row into a packed 8x8 tmp block, then each tmp column
+ * into the matching dest column.
+ */
+static void
+idct8x8theora_s16_ref (int16_t *dest, int dstr, int16_t *src, int sstr)
+{
+  int i;
+  int16_t tmp[64];
+
+  /* rows: src row i -> tmp row i (elements 2 bytes apart on both sides) */
+  for(i=0;i<8;i++){
+    oil_idct8theora_s16(
+        OIL_OFFSET(tmp, 8*sizeof(int16_t) * i), sizeof(int16_t),
+        OIL_OFFSET(src, sstr * i), sizeof(int16_t));
+  }
+  /* columns: tmp column i -> dest column i.  Consecutive elements of a tmp
+   * column are one packed row (8 int16_t) apart. */
+  for(i=0;i<8;i++){
+    oil_idct8theora_s16(
+        OIL_OFFSET(dest, sizeof(int16_t) * i), dstr,
+        OIL_OFFSET(tmp, sizeof(int16_t) * i), 8 * sizeof(int16_t));
+  }
+}
+OIL_DEFINE_IMPL_REF (idct8x8theora_s16_ref, idct8x8theora_s16);
+#endif
+
+
+#if defined(oil_idct8_f64)
+/* Floating-point variant: converts the 8 inputs to double, runs
+ * oil_idct8_f64, doubles each result and converts back to int16_t.
+ * The dstr and sstr arguments are not used; both arrays are accessed with
+ * a stride of sizeof(int16_t). */
+static void
+idct8theora_s16_float (int16_t *dest, int dstr, int16_t *src, int sstr)
+{
+  int i;
+  double tmp1[8];
+  double tmp2[8];
+
+  oil_conv_f64_s16 (tmp1, sizeof(double), src, sizeof(int16_t), 8);
+  oil_idct8_f64 (tmp2, sizeof(double), tmp1, sizeof(double));
+  for(i=0;i<8;i++){
+    tmp2[i] *= 2.0;
+  }
+  oil_conv_s16_f64 (dest, sizeof(int16_t), tmp2, sizeof(double), 8);
+}
+/* NOTE(review): this registers a second reference implementation for
+ * idct8theora_s16 (idct8theora_s16_ref above is also a REF) - presumably a
+ * plain, non-reference registration was intended; confirm. */
+OIL_DEFINE_IMPL_REF (idct8theora_s16_float, idct8theora_s16);
+#endif
+
+
diff --git a/liboil/liboilfuncs.h b/liboil/liboilfuncs.h
index 35d47a9..1913923
100644 --- a/liboil/liboilfuncs.h +++ b/liboil/liboilfuncs.h @@ -375,6 +375,9 @@ typedef void (*_oil_type_fdct8x8s_s16)(int16_t * d_8x8, int ds, const int16_t * extern OilFunctionClass *oil_function_class_ptr_idct8_f64; typedef void (*_oil_type_idct8_f64)(double * d_8, int dstr, const double * s_8, int sstr); #define oil_idct8_f64 ((_oil_type_idct8_f64)(*(void **)oil_function_class_ptr_idct8_f64)) +extern OilFunctionClass *oil_function_class_ptr_idct8theora_s16; +typedef void (*_oil_type_idct8theora_s16)(int16_t * d_8x8, int dstr, const int16_t * s_8x8, int sstr); +#define oil_idct8theora_s16 ((_oil_type_idct8theora_s16)(*(void **)oil_function_class_ptr_idct8theora_s16)) extern OilFunctionClass *oil_function_class_ptr_idct8x8_f64; typedef void (*_oil_type_idct8x8_f64)(double * d_8x8, int dstr, const double * s_8x8, int sstr); #define oil_idct8x8_f64 ((_oil_type_idct8x8_f64)(*(void **)oil_function_class_ptr_idct8x8_f64)) @@ -387,6 +390,9 @@ typedef void (*_oil_type_idct8x8lim10_f64)(double * d_8x8, int dstr, const doubl extern OilFunctionClass *oil_function_class_ptr_idct8x8lim10_s16; typedef void (*_oil_type_idct8x8lim10_s16)(int16_t * d_8x8, int dstr, const int16_t * s_8x8, int sstr); #define oil_idct8x8lim10_s16 ((_oil_type_idct8x8lim10_s16)(*(void **)oil_function_class_ptr_idct8x8lim10_s16)) +extern OilFunctionClass *oil_function_class_ptr_idct8x8theora_s16; +typedef void (*_oil_type_idct8x8theora_s16)(int16_t * d_8x8, int dstr, const int16_t * s_8x8, int sstr); +#define oil_idct8x8theora_s16 ((_oil_type_idct8x8theora_s16)(*(void **)oil_function_class_ptr_idct8x8theora_s16)) extern OilFunctionClass *oil_function_class_ptr_imdct12_f64; typedef void (*_oil_type_imdct12_f64)(double * d_12, const double * s_6); #define oil_imdct12_f64 ((_oil_type_imdct12_f64)(*(void **)oil_function_class_ptr_imdct12_f64)) |