diff options
author | David Schleef <ds@schleef.org> | 2005-08-03 03:31:18 +0000 |
---|---|---|
committer | David Schleef <ds@schleef.org> | 2005-08-03 03:31:18 +0000 |
commit | c1f358f68635378f5bbdf54b066d9072e8c18a82 (patch) | |
tree | 682267b921428cdd052d2725c5082e77ad1099f3 | |
parent | 8e630a5dfd9e57732fa1c09772846a3fcdf0adf5 (diff) | |
download | liboil-c1f358f68635378f5bbdf54b066d9072e8c18a82.tar.gz |
Patch from Wim Taymans adding a bunch of classes and MMX
implementations for libtheora. Heavily modified by ds.
* Makefile.am:
* liboil-uninstalled.pc.in:
* liboil/copy/Makefile.am:
* liboil/copy/copy.c:
* liboil/copy/copy8x8.c:
* liboil/copy/copy8x8_i386.c:
* liboil/dct/Makefile.am:
* liboil/dct/fdct8x8theora.c:
* liboil/dct/fdct8x8theora_i386.c:
* liboil/i386/Makefile.am:
* liboil/i386/diff8x8_i386.c:
* liboil/i386/error8x8_i386.c:
* liboil/i386/recon8x8_i386.c:
* liboil/i386/rowcolsad8x8_i386.c:
* liboil/i386/sad8x8_i386.c:
* liboil/i386/sad8x8avg_i386.c:
* liboil/ref/Makefile.am:
* liboil/ref/diff8x8.c:
* liboil/ref/error8x8.c:
* liboil/ref/recon8x8.c:
* liboil/ref/rowcolsad8x8.c:
* liboil/ref/sad8x8avg.c:
-rw-r--r-- | ChangeLog | 27 | ||||
-rw-r--r-- | Makefile.am | 7 | ||||
-rw-r--r-- | liboil-uninstalled.pc.in | 10 | ||||
-rw-r--r-- | liboil/copy/Makefile.am | 2 | ||||
-rw-r--r-- | liboil/copy/copy.c | 1 | ||||
-rw-r--r-- | liboil/copy/copy8x8.c | 63 | ||||
-rw-r--r-- | liboil/copy/copy8x8_i386.c | 76 | ||||
-rw-r--r-- | liboil/dct/Makefile.am | 4 | ||||
-rw-r--r-- | liboil/dct/fdct8x8theora.c | 294 | ||||
-rw-r--r-- | liboil/dct/fdct8x8theora_i386.c | 357 | ||||
-rw-r--r-- | liboil/i386/Makefile.am | 27 | ||||
-rw-r--r-- | liboil/i386/diff8x8_i386.c | 169 | ||||
-rw-r--r-- | liboil/i386/error8x8_i386.c | 337 | ||||
-rw-r--r-- | liboil/i386/recon8x8_i386.c | 165 | ||||
-rw-r--r-- | liboil/i386/rowcolsad8x8_i386.c | 280 | ||||
-rw-r--r-- | liboil/i386/sad8x8_i386.c | 120 | ||||
-rw-r--r-- | liboil/i386/sad8x8avg_i386.c | 136 | ||||
-rw-r--r-- | liboil/ref/Makefile.am | 26 | ||||
-rw-r--r-- | liboil/ref/diff8x8.c | 117 | ||||
-rw-r--r-- | liboil/ref/error8x8.c | 181 | ||||
-rw-r--r-- | liboil/ref/recon8x8.c | 112 | ||||
-rw-r--r-- | liboil/ref/rowcolsad8x8.c | 110 | ||||
-rw-r--r-- | liboil/ref/sad8x8avg.c | 66 |
23 files changed, 2684 insertions, 3 deletions
@@ -1,5 +1,32 @@ 2005-08-02 David Schleef <ds@schleef.org> + Patch from Wim Taymans adding a bunch of classes and MMX + implementations for libtheora. Heavily modified by ds. + * Makefile.am: + * liboil-uninstalled.pc.in: + * liboil/copy/Makefile.am: + * liboil/copy/copy.c: + * liboil/copy/copy8x8.c: + * liboil/copy/copy8x8_i386.c: + * liboil/dct/Makefile.am: + * liboil/dct/fdct8x8theora.c: + * liboil/dct/fdct8x8theora_i386.c: + * liboil/i386/Makefile.am: + * liboil/i386/diff8x8_i386.c: + * liboil/i386/error8x8_i386.c: + * liboil/i386/recon8x8_i386.c: + * liboil/i386/rowcolsad8x8_i386.c: + * liboil/i386/sad8x8_i386.c: + * liboil/i386/sad8x8avg_i386.c: + * liboil/ref/Makefile.am: + * liboil/ref/diff8x8.c: + * liboil/ref/error8x8.c: + * liboil/ref/recon8x8.c: + * liboil/ref/rowcolsad8x8.c: + * liboil/ref/sad8x8avg.c: + +2005-08-02 David Schleef <ds@schleef.org> + * liboil/Makefile.am: add libcolorspace.h * liboil/build_marshal.c: (main): use oil_init_no_optimize() to save us from horrible build problems (like what happened today) diff --git a/Makefile.am b/Makefile.am index 3d35d50..57e5932 100644 --- a/Makefile.am +++ b/Makefile.am @@ -10,7 +10,12 @@ pkgconfig_DATA = liboil-$(LIBOIL_MAJORMINOR).pc liboil-$(LIBOIL_MAJORMINOR).pc: liboil.pc cp liboil.pc liboil-$(LIBOIL_MAJORMINOR).pc -CLEANFILES = liboil-$(LIBOIL_MAJORMINOR).pc +liboil-$(LIBOIL_MAJORMINOR)-uninstalled.pc: liboil-uninstalled.pc + cp liboil-uninstalled.pc liboil-$(LIBOIL_MAJORMINOR)-uninstalled.pc + +BUILT_SOURCES=liboil-$(LIBOIL_MAJORMINOR)-uninstalled.pc + +CLEANFILES = liboil-$(LIBOIL_MAJORMINOR).pc liboil-$(LIBOIL_MAJORMINOR)-uninstalled.pc ACLOCAL_FLAGS = -I m4 diff --git a/liboil-uninstalled.pc.in b/liboil-uninstalled.pc.in new file mode 100644 index 0000000..de72faf --- /dev/null +++ b/liboil-uninstalled.pc.in @@ -0,0 +1,10 @@ +prefix= +exec_prefix= +libdir=${pcfiledir}/liboil/ +includedir=${pcfiledir}/ + +Name: liboil-@LIBOIL_MAJORMINOR@ uninstalled +Description: Libaray of Optimized Inner Loops 
+Version: @VERSION@ +Libs: -L${libdir} -loil-@LIBOIL_MAJORMINOR@ -lm +Cflags: -I${includedir} diff --git a/liboil/copy/Makefile.am b/liboil/copy/Makefile.am index 73dd865..5905f98 100644 --- a/liboil/copy/Makefile.am +++ b/liboil/copy/Makefile.am @@ -18,6 +18,7 @@ endif if HAVE_CPU_I386 i386_sources = \ copy_i386.c \ + copy8x8_i386.c \ splat_i386.c \ trans8x8_i386.c else @@ -35,6 +36,7 @@ endif c_sources = \ copy.c \ + copy8x8.c \ permute.c \ splat_ref.c \ tablelookup_ref.c \ diff --git a/liboil/copy/copy.c b/liboil/copy/copy.c index 26aea18..84295da 100644 --- a/liboil/copy/copy.c +++ b/liboil/copy/copy.c @@ -77,4 +77,3 @@ copy_u8_ints (uint8_t *dest, uint8_t *src, int n) } OIL_DEFINE_IMPL (copy_u8_ints, copy_u8); - diff --git a/liboil/copy/copy8x8.c b/liboil/copy/copy8x8.c new file mode 100644 index 0000000..561132c --- /dev/null +++ b/liboil/copy/copy8x8.c @@ -0,0 +1,63 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2004 David A. Schleef <ds@schleef.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> + +OIL_DEFINE_CLASS (copy8x8_u8, "uint8_t *d_8x8, int ds, uint8_t *s_8x8, int ss"); + +static void +copy8x8_u8_ref (uint8_t *d1, int ds, uint8_t *s1, int ss) +{ + int i,j; + for (i=0;i<8;i++){ + for (j=0;j<8;j++){ + d1[j] = s1[j]; + } + d1 += ds; + s1 += ss; + } +} +OIL_DEFINE_IMPL_REF (copy8x8_u8_ref, copy8x8_u8); + +static void +copy8x8_u8_ints (uint8_t *d1, int ds, uint8_t *s1, int ss) +{ + int j; + for (j=0;j<8;j++){ + ((uint32_t*)d1)[0] = ((uint32_t*)s1)[0]; + ((uint32_t*)d1)[1] = ((uint32_t*)s1)[1]; + + d1+=ds; + s1+=ss; + } +} +OIL_DEFINE_IMPL (copy8x8_u8_ints, copy8x8_u8); + diff --git a/liboil/copy/copy8x8_i386.c b/liboil/copy/copy8x8_i386.c new file mode 100644 index 0000000..fd3dec9 --- /dev/null +++ b/liboil/copy/copy8x8_i386.c @@ -0,0 +1,76 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2004 David A. Schleef <ds@schleef.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> + +OIL_DECLARE_CLASS(copy8x8_u8); + +static void +copy8x8_u8_mmx (uint8_t *dest, int dstr, uint8_t *src, int sstr) +{ + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " lea (%2, %2, 2), %%edi \n\t" + + " movq (%1), %%mm0 \n\t" + " movq (%1, %2), %%mm1 \n\t" + " movq (%1, %2, 2), %%mm2 \n\t" + " movq (%1, %%edi), %%mm3 \n\t" + + " lea (%1, %2, 4), %1 \n\t" + + " movq %%mm0, (%0) \n\t" + " movq %%mm1, (%0, %2) \n\t" + " movq %%mm2, (%0, %2, 2) \n\t" + " movq %%mm3, (%0, %%edi) \n\t" + + " lea (%0, %2, 4), %0 \n\t" + + " movq (%1), %%mm0 \n\t" + " movq (%1, %2), %%mm1 \n\t" + " movq (%1, %2, 2), %%mm2 \n\t" + " movq (%1, %%edi), %%mm3 \n\t" + + " movq %%mm0, (%0) \n\t" + " movq %%mm1, (%0, %2) \n\t" + " movq %%mm2, (%0, %2, 2) \n\t" + " movq %%mm3, (%0, %%edi) \n\t" + " emms \n\t" + : "+a" (dest) + : "c" (src), + "r" (sstr), + "r" (dstr) + : "memory", "edi" + ); +} +OIL_DEFINE_IMPL_FULL (copy8x8_u8_mmx, copy8x8_u8, 
OIL_IMPL_FLAG_MMX); + diff --git a/liboil/dct/Makefile.am b/liboil/dct/Makefile.am index 9183724..af92bae 100644 --- a/liboil/dct/Makefile.am +++ b/liboil/dct/Makefile.am @@ -12,7 +12,8 @@ noinst_HEADERS = \ if HAVE_CPU_I386 i386_sources = \ - idct8x8_i386.c + idct8x8_i386.c \ + fdct8x8theora_i386.c else i386_sources = endif @@ -30,6 +31,7 @@ c_sources = \ fdct8_f64.c \ fdct8x8_f64.c \ fdct8x8s_s16.c \ + fdct8x8theora.c \ idct8_f64.c \ idct8x8_c.c \ imdct32_f32.c \ diff --git a/liboil/dct/fdct8x8theora.c b/liboil/dct/fdct8x8theora.c new file mode 100644 index 0000000..b485525 --- /dev/null +++ b/liboil/dct/fdct8x8theora.c @@ -0,0 +1,294 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: fdct8x8theora.c,v 1.1 2005-08-03 03:31:18 ds Exp $ + + ********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> +#include <liboil/liboilfuncs.h> +#include <liboil/dct/dct.h> +#include <math.h> + +static int32_t xC1S7 = 64277; +static int32_t xC2S6 = 60547; +static int32_t xC3S5 = 54491; +static int32_t xC4S4 = 46341; +static int32_t xC5S3 = 36410; +static int32_t xC6S2 = 25080; +static int32_t xC7S1 = 12785; + +#define SIGNBITDUPPED(X) ((signed )(((X) & 0x80000000)) >> 31) +#define DOROUND(X) ( (SIGNBITDUPPED(X) & (0xffff)) + (X) ) + +OIL_DEFINE_CLASS(fdct8x8theora, "int16_t *s_8x8, int16_t *d_8x8"); + +static void +fdct8x8theora_ref(int16_t *src, int16_t *dest) +{ + int loop; + + int32_t is07, is12, is34, is56; + int32_t is0734, is1256; + int32_t id07, id12, id34, id56; + + int32_t 
irot_input_x, irot_input_y; + int32_t icommon_product1; /* Re-used product (c4s4 * (s12 - s56)). */ + int32_t icommon_product2; /* Re-used product (c4s4 * (d12 + d56)). */ + + int32_t temp1, temp2; /* intermediate variable for computation */ + + int32_t InterData[64]; + int32_t *ip = InterData; + int16_t * op = dest; + for (loop = 0; loop < 8; loop++){ + /* Pre calculate some common sums and differences. */ + is07 = src[0] + src[7]; + is12 = src[1] + src[2]; + is34 = src[3] + src[4]; + is56 = src[5] + src[6]; + + id07 = src[0] - src[7]; + id12 = src[1] - src[2]; + id34 = src[3] - src[4]; + id56 = src[5] - src[6]; + + is0734 = is07 + is34; + is1256 = is12 + is56; + + /* Pre-Calculate some common product terms. */ + icommon_product1 = xC4S4*(is12 - is56); + icommon_product1 = DOROUND(icommon_product1); + icommon_product1>>=16; + + icommon_product2 = xC4S4*(id12 + id56); + icommon_product2 = DOROUND(icommon_product2); + icommon_product2>>=16; + + + ip[0] = (xC4S4*(is0734 + is1256)); + ip[0] = DOROUND(ip[0]); + ip[0] >>= 16; + + ip[4] = (xC4S4*(is0734 - is1256)); + ip[4] = DOROUND(ip[4]); + ip[4] >>= 16; + + /* Define inputs to rotation for outputs 2 and 6 */ + irot_input_x = id12 - id56; + irot_input_y = is07 - is34; + + /* Apply rotation for outputs 2 and 6. */ + temp1=xC6S2*irot_input_x; + temp1=DOROUND(temp1); + temp1>>=16; + temp2=xC2S6*irot_input_y; + temp2=DOROUND(temp2); + temp2>>=16; + ip[2] = temp1 + temp2; + + temp1=xC6S2*irot_input_y; + temp1=DOROUND(temp1); + temp1>>=16; + temp2=xC2S6*irot_input_x ; + temp2=DOROUND(temp2); + temp2>>=16; + ip[6] = temp1 -temp2 ; + + /* Define inputs to rotation for outputs 1 and 7 */ + irot_input_x = icommon_product1 + id07; + irot_input_y = -( id34 + icommon_product2 ); + + /* Apply rotation for outputs 1 and 7. 
*/ + + temp1=xC1S7*irot_input_x; + temp1=DOROUND(temp1); + temp1>>=16; + temp2=xC7S1*irot_input_y; + temp2=DOROUND(temp2); + temp2>>=16; + ip[1] = temp1 - temp2; + + temp1=xC7S1*irot_input_x; + temp1=DOROUND(temp1); + temp1>>=16; + temp2=xC1S7*irot_input_y ; + temp2=DOROUND(temp2); + temp2>>=16; + ip[7] = temp1 + temp2 ; + + /* Define inputs to rotation for outputs 3 and 5 */ + irot_input_x = id07 - icommon_product1; + irot_input_y = id34 - icommon_product2; + + /* Apply rotation for outputs 3 and 5. */ + temp1=xC3S5*irot_input_x; + temp1=DOROUND(temp1); + temp1>>=16; + temp2=xC5S3*irot_input_y ; + temp2=DOROUND(temp2); + temp2>>=16; + ip[3] = temp1 - temp2 ; + + temp1=xC5S3*irot_input_x; + temp1=DOROUND(temp1); + temp1>>=16; + temp2=xC3S5*irot_input_y; + temp2=DOROUND(temp2); + temp2>>=16; + ip[5] = temp1 + temp2; + + /* Increment data pointer for next row. */ + src += 8 ; + ip += 8; /* advance pointer to next row */ + + } + + + /* Performed DCT on rows, now transform the columns */ + ip = InterData; + for (loop = 0; loop < 8; loop++){ + /* Pre calculate some common sums and differences. */ + is07 = ip[0 * 8] + ip[7 * 8]; + is12 = ip[1 * 8] + ip[2 * 8]; + is34 = ip[3 * 8] + ip[4 * 8]; + is56 = ip[5 * 8] + ip[6 * 8]; + + id07 = ip[0 * 8] - ip[7 * 8]; + id12 = ip[1 * 8] - ip[2 * 8]; + id34 = ip[3 * 8] - ip[4 * 8]; + id56 = ip[5 * 8] - ip[6 * 8]; + + is0734 = is07 + is34; + is1256 = is12 + is56; + + /* Pre-Calculate some common product terms. 
*/ + icommon_product1 = xC4S4*(is12 - is56) ; + icommon_product2 = xC4S4*(id12 + id56) ; + icommon_product1 = DOROUND(icommon_product1); + icommon_product2 = DOROUND(icommon_product2); + icommon_product1>>=16; + icommon_product2>>=16; + + + temp1 = xC4S4*(is0734 + is1256) ; + temp2 = xC4S4*(is0734 - is1256) ; + temp1 = DOROUND(temp1); + temp2 = DOROUND(temp2); + temp1>>=16; + temp2>>=16; + op[0*8] = (int16_t) temp1; + op[4*8] = (int16_t) temp2; + + /* Define inputs to rotation for outputs 2 and 6 */ + irot_input_x = id12 - id56; + irot_input_y = is07 - is34; + + /* Apply rotation for outputs 2 and 6. */ + temp1=xC6S2*irot_input_x; + temp1=DOROUND(temp1); + temp1>>=16; + temp2=xC2S6*irot_input_y; + temp2=DOROUND(temp2); + temp2>>=16; + op[2*8] = (int16_t) (temp1 + temp2); + + temp1=xC6S2*irot_input_y; + temp1=DOROUND(temp1); + temp1>>=16; + temp2=xC2S6*irot_input_x ; + temp2=DOROUND(temp2); + temp2>>=16; + op[6*8] = (int16_t) (temp1 -temp2) ; + + /* Define inputs to rotation for outputs 1 and 7 */ + irot_input_x = icommon_product1 + id07; + irot_input_y = -( id34 + icommon_product2 ); + + /* Apply rotation for outputs 1 and 7. */ + temp1=xC1S7*irot_input_x; + temp1=DOROUND(temp1); + temp1>>=16; + temp2=xC7S1*irot_input_y; + temp2=DOROUND(temp2); + temp2>>=16; + op[1*8] = (int16_t) (temp1 - temp2); + + temp1=xC7S1*irot_input_x; + temp1=DOROUND(temp1); + temp1>>=16; + temp2=xC1S7*irot_input_y ; + temp2=DOROUND(temp2); + temp2>>=16; + op[7*8] = (int16_t) (temp1 + temp2); + + /* Define inputs to rotation for outputs 3 and 5 */ + irot_input_x = id07 - icommon_product1; + irot_input_y = id34 - icommon_product2; + + /* Apply rotation for outputs 3 and 5. 
*/ + temp1=xC3S5*irot_input_x; + temp1=DOROUND(temp1); + temp1>>=16; + temp2=xC5S3*irot_input_y ; + temp2=DOROUND(temp2); + temp2>>=16; + op[3*8] = (int16_t) (temp1 - temp2) ; + + temp1=xC5S3*irot_input_x; + temp1=DOROUND(temp1); + temp1>>=16; + temp2=xC3S5*irot_input_y; + temp2=DOROUND(temp2); + temp2>>=16; + op[5*8] = (int16_t) (temp1 + temp2); + + /* Increment data pointer for next column. */ + ip ++; + op ++; + } +} + +OIL_DEFINE_IMPL_REF (fdct8x8theora_ref, fdct8x8theora); + diff --git a/liboil/dct/fdct8x8theora_i386.c b/liboil/dct/fdct8x8theora_i386.c new file mode 100644 index 0000000..6126adb --- /dev/null +++ b/liboil/dct/fdct8x8theora_i386.c @@ -0,0 +1,357 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*========================================================================== + * + * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY + * KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR + * PURPOSE. + * + * Copyright (c) 1999 - 2001 On2 Technologies Inc. All Rights Reserved. + * + *--------------------------------------------------------------------------*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> +#include <liboil/liboilfuncs.h> +#include <liboil/dct/dct.h> +#include <math.h> + +static const __attribute__ ((aligned(8),used)) int64_t xC1S7 = 0x0fb15fb15fb15fb15LL; +static const __attribute__ ((aligned(8),used)) int64_t xC2S6 = 0x0ec83ec83ec83ec83LL; +static const __attribute__ ((aligned(8),used)) int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL; +static const __attribute__ ((aligned(8),used)) int64_t xC4S4 = 0x0b505b505b505b505LL; +static const __attribute__ ((aligned(8),used)) int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL; +static const __attribute__ ((aligned(8),used)) int64_t xC6S2 = 0x061f861f861f861f8LL; +static const __attribute__ ((aligned(8),used)) int64_t xC7S1 = 0x031f131f131f131f1LL; + +#if defined(__MINGW32__) || defined(__CYGWIN__) || \ + defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__)) +# define M(a) "_" #a +#else +# define M(a) #a +#endif + +OIL_DECLARE_CLASS(fdct8x8theora); + 
+/* execute stage 1 of forward DCT */ +#define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp) \ + " movq " #ip0 ", %%mm0 \n\t" \ + " movq " #ip1 ", %%mm1 \n\t" \ + " movq " #ip3 ", %%mm2 \n\t" \ + " movq " #ip5 ", %%mm3 \n\t" \ + " movq %%mm0, %%mm4 \n\t" \ + " movq %%mm1, %%mm5 \n\t" \ + " movq %%mm2, %%mm6 \n\t" \ + " movq %%mm3, %%mm7 \n\t" \ + \ + " paddsw " #ip7 ", %%mm0 \n\t" /* mm0 = ip0 + ip7 = is07 */ \ + " paddsw " #ip2 ", %%mm1 \n\t" /* mm1 = ip1 + ip2 = is12 */ \ + " paddsw " #ip4 ", %%mm2 \n\t" /* mm2 = ip3 + ip4 = is34 */ \ + " paddsw " #ip6 ", %%mm3 \n\t" /* mm3 = ip5 + ip6 = is56 */ \ + " psubsw " #ip7 ", %%mm4 \n\t" /* mm4 = ip0 - ip7 = id07 */ \ + " psubsw " #ip2 ", %%mm5 \n\t" /* mm5 = ip1 - ip2 = id12 */ \ + \ + " psubsw %%mm2, %%mm0 \n\t" /* mm0 = is07 - is34 */ \ + \ + " paddsw %%mm2, %%mm2 \n\t" \ + \ + " psubsw " #ip4 ", %%mm6 \n\t" /* mm6 = ip3 - ip4 = id34 */ \ + \ + " paddsw %%mm0, %%mm2 \n\t" /* mm2 = is07 + is34 = is0734 */ \ + " psubsw %%mm3, %%mm1 \n\t" /* mm1 = is12 - is56 */ \ + " movq %%mm0," #temp " \n\t" /* Save is07 - is34 to free mm0; */ \ + " paddsw %%mm3, %%mm3 \n\t" \ + " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + 1s56 = is1256 */ \ + \ + " psubsw " #ip6 ", %%mm7 \n\t" /* mm7 = ip5 - ip6 = id56 */ \ + /* ------------------------------------------------------------------- */ \ + " psubsw %%mm7, %%mm5 \n\t" /* mm5 = id12 - id56 */ \ + " paddsw %%mm7, %%mm7 \n\t" \ + " paddsw %%mm5, %%mm7 \n\t" /* mm7 = id12 + id56 */ \ + /* ------------------------------------------------------------------- */ \ + " psubsw %%mm3, %%mm2 \n\t" /* mm2 = is0734 - is1256 */ \ + " paddsw %%mm3, %%mm3 \n\t" \ + \ + " movq %%mm2, %%mm0 \n\t" /* make a copy */ \ + " paddsw %%mm2, %%mm3 \n\t" /* mm3 = is0734 + is1256 */ \ + \ + " pmulhw "M(xC4S4)", %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */ \ + " paddw %%mm2, %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) */ \ + " psrlw $15, %%mm2 \n\t" \ + " paddw %%mm2, %%mm0 \n\t" /* 
Truncate mm0, now it is op[4] */ \ + \ + " movq %%mm3, %%mm2 \n\t" \ + " movq %%mm0," #ip4 " \n\t" /* save ip4, now mm0,mm2 are free */ \ + \ + " movq %%mm3, %%mm0 \n\t" \ + " pmulhw "M(xC4S4)", %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \ + \ + " psrlw $15, %%mm2 \n\t" \ + " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \ + " paddw %%mm2, %%mm3 \n\t" /* Truncate mm3, now it is op[0] */ \ + \ + " movq %%mm3," #ip0 " \n\t" \ + /* ------------------------------------------------------------------- */ \ + " movq " #temp ", %%mm3 \n\t" /* mm3 = irot_input_y */ \ + " pmulhw "M(xC2S6)", %%mm3 \n\t" /* mm3 = xC2S6 * irot_input_y - irot_input_y */ \ + \ + " movq " #temp ", %%mm2 \n\t" \ + " movq %%mm2, %%mm0 \n\t" \ + \ + " psrlw $15, %%mm2 \n\t" /* mm3 = xC2S6 * irot_input_y */ \ + " paddw %%mm0, %%mm3 \n\t" \ + \ + " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \ + " movq %%mm5, %%mm0 \n\t" \ + \ + " movq %%mm5, %%mm2 \n\t" \ + " pmulhw "M(xC6S2)", %%mm0 \n\t" /* mm0 = xC6S2 * irot_input_x */ \ + \ + " psrlw $15, %%mm2 \n\t" \ + " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \ + \ + " paddsw %%mm0, %%mm3 \n\t" /* ip[2] */ \ + " movq %%mm3," #ip2 " \n\t" /* Save ip2 */ \ + \ + " movq %%mm5, %%mm0 \n\t" \ + " movq %%mm5, %%mm2 \n\t" \ + \ + " pmulhw "M(xC2S6)", %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x - irot_input_x */ \ + " psrlw $15, %%mm2 \n\t" \ + \ + " movq " #temp ", %%mm3 \n\t" \ + " paddw %%mm0, %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x */ \ + \ + " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \ + " movq %%mm3, %%mm2 \n\t" \ + \ + " pmulhw "M(xC6S2)", %%mm3 \n\t" /* mm3 = xC6S2 * irot_input_y */ \ + " psrlw $15, %%mm2 \n\t" \ + \ + " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \ + " psubsw %%mm5, %%mm3 \n\t" \ + \ + " movq %%mm3," #ip6 " \n\t" \ + /* ------------------------------------------------------------------- */ \ + " movq "M(xC4S4)", %%mm0 \n\t" \ + " movq %%mm1, %%mm2 \n\t" \ + " movq %%mm1, %%mm3 \n\t" \ + \ + " pmulhw 
%%mm0, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \ + " psrlw $15, %%mm2 \n\t" \ + \ + " paddw %%mm3, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) */ \ + " paddw %%mm2, %%mm1 \n\t" /* Truncate mm1, now it is icommon_product1 */ \ + \ + " movq %%mm7, %%mm2 \n\t" \ + " movq %%mm7, %%mm3 \n\t" \ + \ + " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \ + " psrlw $15, %%mm2 \n\t" \ + \ + " paddw %%mm3, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \ + " paddw %%mm2, %%mm7 \n\t" /* Truncate mm7, now it is icommon_product2 */ \ + /* ------------------------------------------------------------------- */ \ + " pxor %%mm0, %%mm0 \n\t" /* Clear mm0 */ \ + " psubsw %%mm6, %%mm0 \n\t" /* mm0 = - id34 */ \ + \ + " psubsw %%mm7, %%mm0 \n\t" /* mm0 = - ( id34 + idcommon_product2 ) */ \ + " paddsw %%mm6, %%mm6 \n\t" \ + " paddsw %%mm0, %%mm6 \n\t" /* mm6 = id34 - icommon_product2 */ \ + \ + " psubsw %%mm1, %%mm4 \n\t" /* mm4 = id07 - icommon_product1 */ \ + " paddsw %%mm1, %%mm1 \n\t" \ + " paddsw %%mm4, %%mm1 \n\t" /* mm1 = id07 + icommon_product1 */ \ + /* ------------------------------------------------------------------- */ \ + " movq "M(xC1S7)", %%mm7 \n\t" \ + " movq %%mm1, %%mm2 \n\t" \ + \ + " movq %%mm1, %%mm3 \n\t" \ + " pmulhw %%mm7, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - irot_input_x */ \ + \ + " movq "M(xC7S1)", %%mm7 \n\t" \ + " psrlw $15, %%mm2 \n\t" \ + \ + " paddw %%mm3, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x */ \ + " paddw %%mm2, %%mm1 \n\t" /* Trucated */ \ + \ + " pmulhw %%mm7, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x */ \ + " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \ + \ + " movq %%mm0, %%mm5 \n\t" \ + " movq %%mm0, %%mm2 \n\t" \ + \ + " movq "M(xC1S7)", %%mm7 \n\t" \ + " pmulhw %%mm7, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y - irot_input_y */ \ + \ + " movq "M(xC7S1)", %%mm7 \n\t" \ + " psrlw $15, %%mm2 \n\t" \ + \ + " paddw %%mm5, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y */ \ + " paddw 
%%mm2, %%mm0 \n\t" /* Truncated */ \ + \ + " pmulhw %%mm7, %%mm5 \n\t" /* mm5 = xC7S1 * irot_input_y */ \ + " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \ + \ + " psubsw %%mm5, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */ \ + " paddsw %%mm0, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = ip7 */ \ + \ + " movq %%mm1," #ip1 " \n\t" \ + " movq %%mm3," #ip7 " \n\t" \ + /* ------------------------------------------------------------------- */ \ + " movq "M(xC3S5)", %%mm0 \n\t" \ + " movq "M(xC5S3)", %%mm1 \n\t" \ + \ + " movq %%mm6, %%mm5 \n\t" \ + " movq %%mm6, %%mm7 \n\t" \ + \ + " movq %%mm4, %%mm2 \n\t" \ + " movq %%mm4, %%mm3 \n\t" \ + \ + " pmulhw %%mm0, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x - irot_input_x */ \ + " pmulhw %%mm1, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y - irot_input_y */ \ + \ + " psrlw $15, %%mm2 \n\t" \ + " psrlw $15, %%mm5 \n\t" \ + \ + " paddw %%mm3, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x */ \ + " paddw %%mm7, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y */ \ + \ + " paddw %%mm2, %%mm4 \n\t" /* Truncated */ \ + " paddw %%mm5, %%mm6 \n\t" /* Truncated */ \ + \ + " psubsw %%mm6, %%mm4 \n\t" /* ip3 */ \ + " movq %%mm4," #ip3 " \n\t" \ + \ + " movq %%mm3, %%mm4 \n\t" \ + " movq %%mm7, %%mm6 \n\t" \ + \ + " pmulhw %%mm1, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x - irot_input_x */ \ + " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y - irot_input_y */ \ + \ + " paddw %%mm2, %%mm4 \n\t" \ + " paddw %%mm5, %%mm6 \n\t" \ + \ + " paddw %%mm4, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x */ \ + " paddw %%mm6, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y */ \ + \ + " paddw %%mm7, %%mm3 \n\t" /* ip5 */ \ + " movq %%mm3," #ip5 " \n\t" + +#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7, \ + op0,op1,op2,op3,op4,op5,op6,op7) \ + " movq " #ip0 ", %%mm0 \n\t" /* mm0 = a0 a1 a2 a3 */ \ + " movq " #ip4 ", %%mm4 \n\t" /* mm4 = e4 e5 e6 e7 */ \ + " movq " #ip1 ", %%mm1 \n\t" /* mm1 = b0 b1 b2 b3 */ \ + " 
movq " #ip5 ", %%mm5 \n\t" /* mm5 = f4 f5 f6 f7 */ \ + " movq " #ip2 ", %%mm2 \n\t" /* mm2 = c0 c1 c2 c3 */ \ + " movq " #ip6 ", %%mm6 \n\t" /* mm6 = g4 g5 g6 g7 */ \ + " movq " #ip3 ", %%mm3 \n\t" /* mm3 = d0 d1 d2 d3 */ \ + " movq %%mm1," #op1 " \n\t" /* save b0 b1 b2 b3 */ \ + " movq " #ip7 ", %%mm7 \n\t" /* mm7 = h0 h1 h2 h3 */ \ + /* Transpose 2x8 block */ \ + " movq %%mm4, %%mm1 \n\t" /* mm1 = e3 e2 e1 e0 */ \ + " punpcklwd %%mm5, %%mm4 \n\t" /* mm4 = f1 e1 f0 e0 */ \ + " movq %%mm0," #op0 " \n\t" /* save a3 a2 a1 a0 */ \ + " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */ \ + " movq %%mm6, %%mm0 \n\t" /* mm0 = g3 g2 g1 g0 */ \ + " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */ \ + " movq %%mm4, %%mm5 \n\t" /* mm5 = f1 e1 f0 e0 */ \ + " punpckldq %%mm6, %%mm4 \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \ + " punpckhdq %%mm6, %%mm5 \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \ + " movq %%mm1, %%mm6 \n\t" /* mm6 = f3 e3 f2 e2 */ \ + " movq %%mm4," #op4 " \n\t" \ + " punpckhwd %%mm7, %%mm0 \n\t" /* mm0 = h3 g3 h2 g2 */ \ + " movq %%mm5," #op5 " \n\t" \ + " punpckhdq %%mm0, %%mm6 \n\t" /* mm6 = h3 g3 f3 e3 = MM7 */ \ + " movq " #op0 ", %%mm4 \n\t" /* mm4 = a3 a2 a1 a0 */ \ + " punpckldq %%mm0, %%mm1 \n\t" /* mm1 = h2 g2 f2 e2 = MM6 */ \ + " movq " #op1 ", %%mm5 \n\t" /* mm5 = b3 b2 b1 b0 */ \ + " movq %%mm4, %%mm0 \n\t" /* mm0 = a3 a2 a1 a0 */ \ + " movq %%mm6," #op7 " \n\t" \ + " punpcklwd %%mm5, %%mm0 \n\t" /* mm0 = b1 a1 b0 a0 */ \ + " movq %%mm1," #op6 " \n\t" \ + " punpckhwd %%mm5, %%mm4 \n\t" /* mm4 = b3 a3 b2 a2 */ \ + " movq %%mm2, %%mm5 \n\t" /* mm5 = c3 c2 c1 c0 */ \ + " punpcklwd %%mm3, %%mm2 \n\t" /* mm2 = d1 c1 d0 c0 */ \ + " movq %%mm0, %%mm1 \n\t" /* mm1 = b1 a1 b0 a0 */ \ + " punpckldq %%mm2, %%mm0 \n\t" /* mm0 = d0 c0 b0 a0 = MM0 */ \ + " punpckhdq %%mm2, %%mm1 \n\t" /* mm1 = d1 c1 b1 a1 = MM1 */ \ + " movq %%mm4, %%mm2 \n\t" /* mm2 = b3 a3 b2 a2 */ \ + " movq %%mm0," #op0 " \n\t" \ + " punpckhwd %%mm3, %%mm5 \n\t" /* mm5 = d3 c3 d2 c2 */ \ + " movq 
%%mm1," #op1 " \n\t" \ + " punpckhdq %%mm5, %%mm4 \n\t" /* mm4 = d3 c3 b3 a3 = MM3 */ \ + " punpckldq %%mm5, %%mm2 \n\t" /* mm2 = d2 c2 b2 a2 = MM2 */ \ + " movq %%mm4," #op3 " \n\t" \ + " movq %%mm2," #op2 " \n\t" + + +static void +fdct8x8theora_mmx(int16_t *src, int16_t *dest) +{ + int64_t __attribute__((aligned(8))) align_tmp[16]; + int16_t *const temp= (int16_t*)align_tmp; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + /* + * Input data is an 8x8 block. To make processing of the data more efficent + * we will transpose the block of data to two 4x8 blocks??? + */ + Transpose_mmx ( (%0), 16(%0), 32(%0), 48(%0), 8(%0), 24(%0), 40(%0), 56(%0), + (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1)) + Fdct_mmx ( (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1), (%2)) + + Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0), + 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1)) + Fdct_mmx (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2)) + + Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), + 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1)) + Fdct_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2)) + + Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), + 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1)) + Fdct_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2)) + + " emms \n\t" + + : "+r" (src), + "+r" (dest) + : "r" (temp) + : "memory" + ); +} + +OIL_DEFINE_IMPL_FULL (fdct8x8theora_mmx, fdct8x8theora, OIL_IMPL_FLAG_MMX); + diff --git a/liboil/i386/Makefile.am b/liboil/i386/Makefile.am new file mode 100644 index 0000000..1d66341 --- /dev/null +++ b/liboil/i386/Makefile.am @@ -0,0 +1,27 @@ + +noinst_LTLIBRARIES = libi386.la + +sources = \ + error8x8_i386.c \ + recon8x8_i386.c \ + rowcolsad8x8_i386.c \ + sad8x8_i386.c \ + sad8x8avg_i386.c \ + diff8x8_i386.c + +if 
HAVE_CPU_I386 +i386_sources = $(sources) +else +i386_sources = +endif + +if HAVE_CPU_AMD64 +amd64_sources = $(sources) +else +amd64_sources = +endif + +libi386_la_SOURCES = \ + $(i386_sources) +libi386_la_CFLAGS = $(LIBOIL_CFLAGS) + diff --git a/liboil/i386/diff8x8_i386.c b/liboil/i386/diff8x8_i386.c new file mode 100644 index 0000000..a0dc8ae --- /dev/null +++ b/liboil/i386/diff8x8_i386.c @@ -0,0 +1,169 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> + +OIL_DECLARE_CLASS (diff8x8_s16_u8); +OIL_DECLARE_CLASS (diff8x8_const128_s16_u8); +OIL_DECLARE_CLASS (diff8x8_average_s16_u8); + +static const __attribute__ ((aligned(8),used)) int64_t V128w = 0x0080008000800080LL; + +#ifdef HAVE_LD_UNDERSCORE +# define M(a) "_" #a +#else +# define M(a) #a +#endif + +static void +diff8x8_s16_u8_mmx (int16_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2) +{ + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm7, %%mm7 \n\t" + + ".rept 8 \n\t" + " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */ + " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */ + " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */ + " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */ + /* convert from UINT8 to INT16 */ + " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */ + " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr) */ + " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */ + " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr) */ + /* start calculation */ + " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr */ + " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr */ + " movq %%mm0, (%2) \n\t" /* write answer out */ + " movq %%mm2, 8(%2) \n\t" /* write answer out */ + /* Increment pointers */ + " add $16, %2 \n\t" + " add %3, %0 \n\t" + " add %4, %1 \n\t" + ".endr \n\t" + " emms \n\t" + + : "+r" (src1), + "+r" (src2), + "+r" (dest) + : "m" (ss1), + "m" (ss2) + : "memory" + ); +} +OIL_DEFINE_IMPL_FULL (diff8x8_s16_u8_mmx, diff8x8_s16_u8, OIL_IMPL_FLAG_MMX); + +static void +diff8x8_const128_s16_u8_mmx (int16_t *dest, uint8_t *src1, int ss1) +{ + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm7, %%mm7 \n\t" + " movq "M(V128w)", %%mm1 \n\t" + + ".rept 8 \n\t" + " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */ + " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */ + /* convert from UINT8 
to INT16 */ + " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */ + " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */ + /* start calculation */ + " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */ + " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */ + " movq %%mm0, (%1) \n\t" /* write answer out */ + " movq %%mm2, 8(%1) \n\t" /* write answer out */ + /* Increment pointers */ + " add $16, %1 \n\t" + " add %2, %0 \n\t" + ".endr \n\t" + " emms \n\t" + + : "+r" (src1), + "+r" (dest) + : "r" (ss1) + : "memory" + ); +} +OIL_DEFINE_IMPL_FULL (diff8x8_const128_s16_u8_mmx, diff8x8_const128_s16_u8, OIL_IMPL_FLAG_MMX); + +static void +diff8x8_average_s16_u8_mmx (int16_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2, uint8_t *src3) +{ + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm7, %%mm7 \n\t" + + ".rept 8 \n\t" + " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */ + " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr1 */ + " movq (%2), %%mm4 \n\t" /* mm1 = ReconPtr2 */ + " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */ + " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */ + " movq %%mm4, %%mm5 \n\t" /* dup to prepare for up conversion */ + /* convert from UINT8 to INT16 */ + " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */ + " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr1) */ + " punpcklbw %%mm7, %%mm4 \n\t" /* mm1 = INT16(ReconPtr2) */ + " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */ + " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr1) */ + " punpckhbw %%mm7, %%mm5 \n\t" /* mm3 = INT16(ReconPtr2) */ + /* average ReconPtr1 and ReconPtr2 */ + " paddw %%mm4, %%mm1 \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */ + " paddw %%mm5, %%mm3 \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */ + " psrlw $1, %%mm1 \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */ + " psrlw $1, %%mm3 \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */ + " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ + " psubw %%mm3, 
%%mm2 \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ + " movq %%mm0, (%3) \n\t" /* write answer out */ + " movq %%mm2, 8(%3) \n\t" /* write answer out */ + /* Increment pointers */ + " add $16, %3 \n\t" + " add %4, %0 \n\t" + " add %5, %1 \n\t" + " add %5, %2 \n\t" + ".endr \n\t" + " emms \n\t" + + : "+r" (src1), + "+r" (src2), + "+r" (src3), + "+r" (dest) + : "m" (ss1), + "m" (ss2) + : "memory" + ); +} +OIL_DEFINE_IMPL_FULL (diff8x8_average_s16_u8_mmx, diff8x8_average_s16_u8, OIL_IMPL_FLAG_MMX); + diff --git a/liboil/i386/error8x8_i386.c b/liboil/i386/error8x8_i386.c new file mode 100644 index 0000000..079604a --- /dev/null +++ b/liboil/i386/error8x8_i386.c @@ -0,0 +1,337 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> + +OIL_DECLARE_CLASS (err_intra8x8_u8); +OIL_DECLARE_CLASS (err_inter8x8_u8); +OIL_DECLARE_CLASS (err_inter8x8_u8_avg); + +static void +err_intra8x8_u8_mmx (uint32_t *dest, uint8_t *src1, int ss1) +{ + uint32_t xsum; + uint32_t xxsum; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm5, %%mm5 \n\t" + " pxor %%mm6, %%mm6 \n\t" + " pxor %%mm7, %%mm7 \n\t" + " mov $8, %%edi \n\t" + "1: \n\t" + " movq (%2), %%mm0 \n\t" /* take 8 bytes */ + " movq %%mm0, %%mm2 \n\t" + + " punpcklbw %%mm6, %%mm0 \n\t" + " punpckhbw %%mm6, %%mm2 \n\t" + + " paddw %%mm0, %%mm5 \n\t" + " paddw %%mm2, %%mm5 \n\t" + + " pmaddwd %%mm0, %%mm0 \n\t" + " pmaddwd %%mm2, %%mm2 \n\t" + + " paddd %%mm0, %%mm7 \n\t" + " paddd %%mm2, %%mm7 \n\t" + + " add %3, %2 \n\t" /* Inc pointer into src data */ + + " dec %%edi \n\t" + " jnz 1b \n\t" + + " movq %%mm5, %%mm0 \n\t" + " psrlq $32, %%mm5 \n\t" + " paddw %%mm0, %%mm5 \n\t" + " movq %%mm5, %%mm0 \n\t" + " psrlq $16, %%mm5 \n\t" + " paddw %%mm0, %%mm5 \n\t" + " movd %%mm5, %%edi \n\t" + " movsx %%di, %%edi \n\t" + " movl %%edi, %0 \n\t" + + " movq %%mm7, %%mm0 \n\t" + " psrlq $32, %%mm7 \n\t" + " paddd %%mm0, %%mm7 \n\t" + " movd %%mm7, %1 \n\t" + " emms \n\t" + + : "=r" (xsum), + "=r" (xxsum), + "+r" (src1) + : "r" (ss1) + : "edi", "memory" + ); + + /* Compute population variance as mis-match metric. 
*/ + *dest = (((xxsum<<6) - xsum*xsum)); +} +OIL_DEFINE_IMPL_FULL (err_intra8x8_u8_mmx, err_intra8x8_u8, OIL_IMPL_FLAG_MMX); + +static void +err_inter8x8_u8_mmx (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2) +{ + uint32_t xsum; + uint32_t xxsum; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm5, %%mm5 \n\t" + " pxor %%mm6, %%mm6 \n\t" + " pxor %%mm7, %%mm7 \n\t" + " mov $8, %%edi \n\t" + "1: \n\t" + " movq (%2), %%mm0 \n\t" /* take 8 bytes */ + " movq (%3), %%mm1 \n\t" + " movq %%mm0, %%mm2 \n\t" + " movq %%mm1, %%mm3 \n\t" + + " punpcklbw %%mm6, %%mm0 \n\t" + " punpcklbw %%mm6, %%mm1 \n\t" + " punpckhbw %%mm6, %%mm2 \n\t" + " punpckhbw %%mm6, %%mm3 \n\t" + + " psubsw %%mm1, %%mm0 \n\t" + " psubsw %%mm3, %%mm2 \n\t" + + " paddw %%mm0, %%mm5 \n\t" + " paddw %%mm2, %%mm5 \n\t" + + " pmaddwd %%mm0, %%mm0 \n\t" + " pmaddwd %%mm2, %%mm2 \n\t" + + " paddd %%mm0, %%mm7 \n\t" + " paddd %%mm2, %%mm7 \n\t" + + " add %4, %2 \n\t" /* Inc pointer into src data */ + " add %5, %3 \n\t" /* Inc pointer into ref data */ + + " dec %%edi \n\t" + " jnz 1b \n\t" + + " movq %%mm5, %%mm0 \n\t" + " psrlq $32, %%mm5 \n\t" + " paddw %%mm0, %%mm5 \n\t" + " movq %%mm5, %%mm0 \n\t" + " psrlq $16, %%mm5 \n\t" + " paddw %%mm0, %%mm5 \n\t" + " movd %%mm5, %%edi \n\t" + " movsx %%di, %%edi \n\t" + " movl %%edi, %0 \n\t" + + " movq %%mm7, %%mm0 \n\t" + " psrlq $32, %%mm7 \n\t" + " paddd %%mm0, %%mm7 \n\t" + " movd %%mm7, %1 \n\t" + " emms \n\t" + + : "=m" (xsum), + "=m" (xxsum), + "+r" (src1), + "+r" (src2) + : "m" (ss1), + "m" (ss2) + : "edi", "memory" + ); + + /* Compute and return population variance as mis-match metric. 
*/ + *dest = (((xxsum<<6) - xsum*xsum)); +} +OIL_DEFINE_IMPL_FULL (err_inter8x8_u8_mmx, err_inter8x8_u8, OIL_IMPL_FLAG_MMX); + +static void +err_inter8x8_u8_avg_mmx (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, uint8_t *src3, int ss2) +{ + uint32_t xsum; + uint32_t xxsum; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pcmpeqd %%mm4, %%mm4 \n\t" /* fefefefefefefefe in mm4 */ + " paddb %%mm4, %%mm4 \n\t" + " pxor %%mm5, %%mm5 \n\t" + " pxor %%mm6, %%mm6 \n\t" + " pxor %%mm7, %%mm7 \n\t" + " mov $8, %%edi \n\t" + "1: \n\t" + " movq (%2), %%mm0 \n\t" /* take 8 bytes */ + + " movq (%3), %%mm2 \n\t" + " movq (%4), %%mm3 \n\t" /* take average of mm2 and mm3 */ + " movq %%mm2, %%mm1 \n\t" + " pand %%mm3, %%mm1 \n\t" + " pxor %%mm2, %%mm3 \n\t" + " pand %%mm4, %%mm3 \n\t" + " psrlq $1, %%mm3 \n\t" + " paddb %%mm3, %%mm1 \n\t" + + " movq %%mm0, %%mm2 \n\t" + " movq %%mm1, %%mm3 \n\t" + + " punpcklbw %%mm6, %%mm0 \n\t" + " punpcklbw %%mm6, %%mm1 \n\t" + " punpckhbw %%mm6, %%mm2 \n\t" + " punpckhbw %%mm6, %%mm3 \n\t" + + " psubsw %%mm1, %%mm0 \n\t" + " psubsw %%mm3, %%mm2 \n\t" + + " paddw %%mm0, %%mm5 \n\t" + " paddw %%mm2, %%mm5 \n\t" + + " pmaddwd %%mm0, %%mm0 \n\t" + " pmaddwd %%mm2, %%mm2 \n\t" + + " paddd %%mm0, %%mm7 \n\t" + " paddd %%mm2, %%mm7 \n\t" + + " add %5, %2 \n\t" /* Inc pointer into src data */ + " add %6, %3 \n\t" /* Inc pointer into ref data */ + " add %6, %4 \n\t" /* Inc pointer into ref data */ + + " dec %%edi \n\t" + " jnz 1b \n\t" + + " movq %%mm5, %%mm0 \n\t" + " psrlq $32, %%mm5 \n\t" + " paddw %%mm0, %%mm5 \n\t" + " movq %%mm5, %%mm0 \n\t" + " psrlq $16, %%mm5 \n\t" + " paddw %%mm0, %%mm5 \n\t" + " movd %%mm5, %%edi \n\t" + " movsx %%di, %%edi \n\t" + " movl %%edi, %0 \n\t" + + " movq %%mm7, %%mm0 \n\t" + " psrlq $32, %%mm7 \n\t" + " paddd %%mm0, %%mm7 \n\t" + " movd %%mm7, %1 \n\t" + " emms \n\t" + + : "=m" (xsum), + "=m" (xxsum), + "+r" (src1), + "+r" (src2), + "+r" (src3) + : "m" (ss1), + "m" (ss2) + : "edi", "memory" + ); + + /* 
Compute and return population variance as mis-match metric. */ + *dest = (((xxsum<<6) - xsum*xsum)); +} + +OIL_DEFINE_IMPL_FULL (err_inter8x8_u8_avg_mmx, err_inter8x8_u8_avg, OIL_IMPL_FLAG_MMX); + +static void +err_inter8x8_u8_avg_mmxext (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, uint8_t *src3, int ss2) +{ + uint32_t xsum; + uint32_t xxsum; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm4, %%mm4 \n\t" + " pxor %%mm5, %%mm5 \n\t" + " pxor %%mm6, %%mm6 \n\t" + " pxor %%mm7, %%mm7 \n\t" + " mov $8, %%edi \n\t" + "1: \n\t" + " movq (%2), %%mm0 \n\t" /* take 8 bytes */ + + " movq (%3), %%mm2 \n\t" + " movq (%4), %%mm1 \n\t" /* take average of mm2 and mm1 */ + " pavgb %%mm2, %%mm1 \n\t" + + " movq %%mm0, %%mm2 \n\t" + " movq %%mm1, %%mm3 \n\t" + + " punpcklbw %%mm6, %%mm0 \n\t" + " punpcklbw %%mm4, %%mm1 \n\t" + " punpckhbw %%mm6, %%mm2 \n\t" + " punpckhbw %%mm4, %%mm3 \n\t" + + " psubsw %%mm1, %%mm0 \n\t" + " psubsw %%mm3, %%mm2 \n\t" + + " paddw %%mm0, %%mm5 \n\t" + " paddw %%mm2, %%mm5 \n\t" + + " pmaddwd %%mm0, %%mm0 \n\t" + " pmaddwd %%mm2, %%mm2 \n\t" + + " paddd %%mm0, %%mm7 \n\t" + " paddd %%mm2, %%mm7 \n\t" + + " add %5, %2 \n\t" /* Inc pointer into src data */ + " add %6, %3 \n\t" /* Inc pointer into ref data */ + " add %6, %4 \n\t" /* Inc pointer into ref data */ + + " dec %%edi \n\t" + " jnz 1b \n\t" + + " movq %%mm5, %%mm0 \n\t" + " psrlq $32, %%mm5 \n\t" + " paddw %%mm0, %%mm5 \n\t" + " movq %%mm5, %%mm0 \n\t" + " psrlq $16, %%mm5 \n\t" + " paddw %%mm0, %%mm5 \n\t" + " movd %%mm5, %%edi \n\t" + " movsx %%di, %%edi \n\t" + " movl %%edi, %0 \n\t" + + " movq %%mm7, %%mm0 \n\t" + " psrlq $32, %%mm7 \n\t" + " paddd %%mm0, %%mm7 \n\t" + " movd %%mm7, %1 \n\t" + " emms \n\t" + + : "=m" (xsum), + "=m" (xxsum), + "+r" (src1), + "+r" (src2), + "+r" (src3) + : "m" (ss1), + "m" (ss2) + : "edi", "memory" + ); + + /* Compute and return population variance as mis-match metric. 
*/ + *dest = (((xxsum<<6) - xsum*xsum)); +} + +OIL_DEFINE_IMPL_FULL (err_inter8x8_u8_avg_mmxext, err_inter8x8_u8_avg, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT); + diff --git a/liboil/i386/recon8x8_i386.c b/liboil/i386/recon8x8_i386.c new file mode 100644 index 0000000..91df0d6 --- /dev/null +++ b/liboil/i386/recon8x8_i386.c @@ -0,0 +1,165 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> +#include <liboil/simdpack/simdpack.h> + +OIL_DECLARE_CLASS (recon8x8_intra); +OIL_DECLARE_CLASS (recon8x8_inter); +OIL_DECLARE_CLASS (recon8x8_inter2); + +static const __attribute__ ((aligned(8),used)) uint64_t V128 = 0x8080808080808080LL; + +#ifdef HAVE_LD_UNDERSCORE +# define M(a) "_" #a +#else +# define M(a) #a +#endif + +static void +recon8x8_intra_mmx (uint8_t *dest, int ds, int16_t *change) +{ + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " movq "M(V128)", %%mm0 \n\t" /* Set mm0 to 0x8080808080808080 */ + + " lea 128(%1), %%edi \n\t" /* Endpoint in input buffer */ + "1: \n\t" + " movq (%1), %%mm2 \n\t" /* First four input values */ + + " packsswb 8(%1), %%mm2 \n\t" /* pack with next(high) four values */ + " por %%mm0, %%mm0 \n\t" + " pxor %%mm0, %%mm2 \n\t" /* Convert result to unsigned (same as add 128) */ + " lea 16(%1), %1 \n\t" /* Step source buffer */ + " cmp %%edi, %1 \n\t" /* are we done */ + + " movq %%mm2, (%0) \n\t" /* store results */ + + " lea (%0, %2), %0 \n\t" /* Step output buffer */ + " jc 1b \n\t" /* Loop back if we are not done */ + " emms \n\t" + : "+r" (dest) + : "r" (change), + "r" (ds) + : "memory", "edi" + ); +} + +OIL_DEFINE_IMPL_FULL (recon8x8_intra_mmx, recon8x8_intra, OIL_IMPL_FLAG_MMX); + +static void +recon8x8_inter_mmx (uint8_t *dest, int ds, uint8_t *src, int ss, int16_t *change) +{ + /* FIXME doesn't handle ss */ + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm0, %%mm0 \n\t" + " lea 128(%1), %%edi \n\t" + + "1: \n\t" + " movq (%2), %%mm2 \n\t" /* (+3 misaligned) 8 reference pixels */ + + " movq (%1), %%mm4 \n\t" /* first 4 changes */ + " movq %%mm2, %%mm3 \n\t" + " movq 8(%1), %%mm5 \n\t" /* last 4 changes */ + " punpcklbw %%mm0, %%mm2 \n\t" /* turn first 4 refs into positive 16-bit #s */ + " paddsw %%mm4, %%mm2 \n\t" /* add in first 4 changes */ + " punpckhbw %%mm0, %%mm3 \n\t" /* turn last 4 refs into positive 
16-bit #s */ + " paddsw %%mm5, %%mm3 \n\t" /* add in last 4 changes */ + " add %3, %2 \n\t" /* next row of reference pixels */ + " packuswb %%mm3, %%mm2 \n\t" /* pack result to unsigned 8-bit values */ + " lea 16(%1), %1 \n\t" /* next row of changes */ + " cmp %%edi, %1 \n\t" /* are we done? */ + + " movq %%mm2, (%0) \n\t" /* store result */ + + " lea (%0, %3), %0 \n\t" /* next row of output */ + " jc 1b \n\t" + " emms \n\t" + : "+r" (dest) + : "r" (change), + "r" (src), + "r" (ds) + : "memory", "edi" + ); +} + +OIL_DEFINE_IMPL_FULL (recon8x8_inter_mmx, recon8x8_inter, OIL_IMPL_FLAG_MMX); + +static void +recon8x8_inter2_mmx (uint8_t *dest, int ds, uint8_t *s1, int ss1, uint8_t *s2, int ss2, int16_t *change) +{ + /* FIXME doesn't handle ss1, ss2 */ + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm0, %%mm0 \n\t" + " lea 128(%1), %%edi \n\t" + + "1: \n\t" + " movq (%2), %%mm2 \n\t" /* (+3 misaligned) 8 reference pixels */ + " movq (%3), %%mm4 \n\t" /* (+3 misaligned) 8 reference pixels */ + + " movq %%mm2, %%mm3 \n\t" + " punpcklbw %%mm0, %%mm2 \n\t" /* mm2 = start ref1 as positive 16-bit #s */ + " movq %%mm4, %%mm5 \n\t" + " movq (%1), %%mm6 \n\t" /* first 4 changes */ + " punpckhbw %%mm0, %%mm3 \n\t" /* mm3 = end ref1 as positive 16-bit #s */ + " movq 8(%1), %%mm7 \n\t" /* last 4 changes */ + " punpcklbw %%mm0, %%mm4 \n\t" /* mm4 = start ref2 as positive 16-bit #s */ + " punpckhbw %%mm0, %%mm5 \n\t" /* mm5 = end ref2 as positive 16-bit #s */ + " paddw %%mm4, %%mm2 \n\t" /* mm2 = start (ref1 + ref2) */ + " paddw %%mm5, %%mm3 \n\t" /* mm3 = end (ref1 + ref2) */ + " psrlw $1, %%mm2 \n\t" /* mm2 = start (ref1 + ref2)/2 */ + " psrlw $1, %%mm3 \n\t" /* mm3 = end (ref1 + ref2)/2 */ + " paddw %%mm6, %%mm2 \n\t" /* add changes to start */ + " paddw %%mm7, %%mm3 \n\t" /* add changes to end */ + " lea 16(%1), %1 \n\t" /* next row of changes */ + " packuswb %%mm3, %%mm2 \n\t" /* pack start|end to unsigned 8-bit */ + " add %4, %2 \n\t" /* next row of reference pixels 
*/ + " add %4, %3 \n\t" /* next row of reference pixels */ + " movq %%mm2, (%0) \n\t" /* store result */ + " add %4, %0 \n\t" /* next row of output */ + " cmp %%edi, %1 \n\t" /* are we done? */ + " jc 1b \n\t" + " emms \n\t" + : "+r" (dest) + : "r" (change), + "r" (s1), + "r" (s2), + "m" (ds) + : "memory", "edi" + ); +} + +OIL_DEFINE_IMPL_FULL (recon8x8_inter2_mmx, recon8x8_inter2, OIL_IMPL_FLAG_MMX); diff --git a/liboil/i386/rowcolsad8x8_i386.c b/liboil/i386/rowcolsad8x8_i386.c new file mode 100644 index 0000000..b05b8a5 --- /dev/null +++ b/liboil/i386/rowcolsad8x8_i386.c @@ -0,0 +1,280 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> + +OIL_DECLARE_CLASS (rowsad8x8_u8); +OIL_DECLARE_CLASS (colsad8x8_u8); + +static void +rowsad8x8_u8_mmx (uint32_t *dest, uint8_t *src1, uint8_t *src2) +{ + uint32_t MaxSad; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */ + " pxor %%mm7, %%mm7 \n\t" /* zero out mm7 for unpack */ + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " movq (%2), %%mm1 \n\t" + + " movq %%mm0, %%mm2 \n\t" + " psubusb %%mm1, %%mm0 \n\t" /* A - B */ + " psubusb %%mm2, %%mm1 \n\t" /* B - A */ + " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ + + " movq %%mm0, %%mm1 \n\t" + + " punpcklbw %%mm6, %%mm0 \n\t" /* ; unpack low four bytes to higher precision */ + " punpckhbw %%mm7, %%mm1 \n\t" /* ; unpack high four bytes to higher precision */ + + " movq %%mm0, %%mm2 \n\t" + " movq %%mm1, %%mm3 \n\t" + " psrlq $32, %%mm2 \n\t" /* fold and add */ + " psrlq $32, %%mm3 \n\t" + " paddw %%mm2, %%mm0 \n\t" + " paddw %%mm3, %%mm1 \n\t" + " movq %%mm0, %%mm2 \n\t" + " movq %%mm1, %%mm3 \n\t" + " psrlq $16, %%mm2 \n\t" + " psrlq $16, %%mm3 \n\t" + " paddw %%mm2, %%mm0 \n\t" + " paddw %%mm3, %%mm1 \n\t" + + " psubusw %%mm0, %%mm1 \n\t" + " paddw %%mm0, %%mm1 \n\t" /* mm1 = max(mm1, mm0) */ + " movd %%mm1, %0 \n\t" + " andl $0xffff, %0 \n\t" + " emms \n\t" + + : "=m" (MaxSad), + "+r" (src1), + "+r" (src2) + : + : "memory" + ); + *dest = MaxSad; +} +OIL_DEFINE_IMPL_FULL (rowsad8x8_u8_mmx, rowsad8x8_u8, OIL_IMPL_FLAG_MMX); + +static void +rowsad8x8_u8_mmxext (uint32_t *dest, uint8_t *src1, uint8_t *src2) +{ + uint32_t MaxSad; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " movd (%1), %%mm0 \n\t" + " movd (%2), %%mm1 \n\t" + " psadbw %%mm0, %%mm1 \n\t" + " movd 4(%1), %%mm2 \n\t" + " movd 4(%2), %%mm3 \n\t" + " psadbw %%mm2, %%mm3 \n\t" + + " pmaxsw %%mm1, %%mm3 \n\t" + " movd %%mm3, %0 \n\t" + " andl $0xffff, %0 \n\t" + " emms \n\t" + + : "=m" 
(MaxSad), + "+r" (src1), + "+r" (src2) + : + : "memory" + ); + *dest = MaxSad; +} +OIL_DEFINE_IMPL_FULL (rowsad8x8_u8_mmxext, rowsad8x8_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT); + +static void +colsad8x8_u8_mmx (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2) +{ + uint32_t MaxSad; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */ + " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */ + " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */ + " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */ + " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */ + " mov $4, %%edi \n\t" /* 4 rows */ + "1: \n\t" + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " movq (%2), %%mm1 \n\t" /* take 8 bytes */ + + " movq %%mm0, %%mm2 \n\t" + " psubusb %%mm1, %%mm0 \n\t" /* A - B */ + " psubusb %%mm2, %%mm1 \n\t" /* B - A */ + " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ + " movq %%mm0, %%mm1 \n\t" + + " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */ + " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */ + " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */ + " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */ + " add %3, %1 \n\t" /* Inc pointer into the new data */ + " add %3, %2 \n\t" /* Inc pointer into the new data */ + + " dec %%edi \n\t" + " jnz 1b \n\t" + + " mov $4, %%edi \n\t" /* 4 rows */ + "2: \n\t" + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " movq (%2), %%mm1 \n\t" /* take 8 bytes */ + + " movq %%mm0, %%mm2 \n\t" + " psubusb %%mm1, %%mm0 \n\t" /* A - B */ + " psubusb %%mm2, %%mm1 \n\t" /* B - A */ + " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ + " movq %%mm0, %%mm1 \n\t" + + " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */ + " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... 
*/ + " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */ + " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */ + " add %3, %1 \n\t" /* Inc pointer into the new data */ + " add %3, %2 \n\t" /* Inc pointer into the new data */ + + " dec %%edi \n\t" + " jnz 2b \n\t" + + " psubusw %%mm6, %%mm7 \n\t" + " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm7, mm6) */ + " psubusw %%mm4, %%mm5 \n\t" + " paddw %%mm4, %%mm5 \n\t" /* mm5 = max(mm5, mm4) */ + " psubusw %%mm5, %%mm7 \n\t" + " paddw %%mm5, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */ + " movq %%mm7, %%mm6 \n\t" + " psrlq $32, %%mm6 \n\t" + " psubusw %%mm6, %%mm7 \n\t" + " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */ + " movq %%mm7, %%mm6 \n\t" + " psrlq $16, %%mm6 \n\t" + " psubusw %%mm6, %%mm7 \n\t" + " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */ + " movd %%mm7, %0 \n\t" + " andl $0xffff, %0 \n\t" + " emms \n\t" + + : "=r" (MaxSad), + "+r" (src1), + "+r" (src2) + : "r" (ss1) + : "memory", "edi" + ); + *dest = MaxSad; +} +OIL_DEFINE_IMPL_FULL (colsad8x8_u8_mmx, colsad8x8_u8, OIL_IMPL_FLAG_MMX); + +static void +colsad8x8_u8_mmxext (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2) +{ + uint32_t MaxSad; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */ + " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */ + " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */ + " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */ + " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */ + " mov $4, %%edi \n\t" /* 4 rows */ + "1: \n\t" + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " movq (%2), %%mm1 \n\t" /* take 8 bytes */ + + " movq %%mm0, %%mm2 \n\t" + " psubusb %%mm1, %%mm0 \n\t" /* A - B */ + " psubusb %%mm2, %%mm1 \n\t" /* B - A */ + " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ + " movq %%mm0, %%mm1 \n\t" + + " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */ + " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... 
*/ + " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */ + " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */ + " add %3, %1 \n\t" /* Inc pointer into the new data */ + " add %3, %2 \n\t" /* Inc pointer into the new data */ + + " dec %%edi \n\t" + " jnz 1b \n\t" + + " mov $4, %%edi \n\t" /* 4 rows */ + "2: \n\t" + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " movq (%2), %%mm1 \n\t" /* take 8 bytes */ + + " movq %%mm0, %%mm2 \n\t" + " psubusb %%mm1, %%mm0 \n\t" /* A - B */ + " psubusb %%mm2, %%mm1 \n\t" /* B - A */ + " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ + " movq %%mm0, %%mm1 \n\t" + + " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */ + " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */ + " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */ + " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */ + " add %3, %1 \n\t" /* Inc pointer into the new data */ + " add %3, %2 \n\t" /* Inc pointer into the new data */ + + " dec %%edi \n\t" + " jnz 2b \n\t" + + " pmaxsw %%mm6, %%mm7 \n\t" + " pmaxsw %%mm4, %%mm5 \n\t" + " pmaxsw %%mm5, %%mm7 \n\t" + " movq %%mm7, %%mm6 \n\t" + " psrlq $32, %%mm6 \n\t" + " pmaxsw %%mm6, %%mm7 \n\t" + " movq %%mm7, %%mm6 \n\t" + " psrlq $16, %%mm6 \n\t" + " pmaxsw %%mm6, %%mm7 \n\t" + " movd %%mm7, %0 \n\t" + " andl $0xffff, %0 \n\t" + " emms \n\t" + + : "=r" (MaxSad), + "+r" (src1), + "+r" (src2) + : "r" (ss1) + : "memory", "edi" + ); + + *dest = MaxSad; +} +OIL_DEFINE_IMPL_FULL (colsad8x8_u8_mmxext, colsad8x8_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT); + diff --git a/liboil/i386/sad8x8_i386.c b/liboil/i386/sad8x8_i386.c new file mode 100644 index 0000000..242aa8c --- /dev/null +++ b/liboil/i386/sad8x8_i386.c @@ -0,0 +1,120 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org> + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <liboil/liboilfunction.h>

OIL_DECLARE_CLASS (sad8x8_u8);

/*
 * sad8x8_u8_mmx:
 * Sum of absolute differences over an 8x8 block of bytes, plain MMX.
 * Walks 8 rows (unrolled with the assembler's .rept directive — requires
 * GNU as), computing per-byte |src1 - src2| via the classic
 * psubusb/psubusb/por trick, widening to 16-bit words and accumulating
 * in mm7.  The final horizontal reduction folds mm7 down to one word and
 * masks to 16 bits before storing through *dest.
 *
 * dest:  receives the 32-bit SAD (fits in 16 bits: 64 * 255 max)
 * src1/sstr1, src2/sstr2: the two 8x8 blocks and their row strides
 */
static void
sad8x8_u8_mmx (uint32_t * dest, uint8_t * src1, int sstr1, uint8_t * src2,
    int sstr2)
{
  uint32_t diff;

  __asm__ __volatile__ (
    "  .balign 16                  \n\t"
    "  pxor %%mm6, %%mm6           \n\t" /* zero out mm6 for unpack */
    "  pxor %%mm7, %%mm7           \n\t" /* mm7 contains the result */
    ".rept 8                       \n\t" /* fully unroll the 8 rows */
    "  movq (%1), %%mm0            \n\t" /* take 8 bytes */
    "  movq (%2), %%mm1            \n\t"
    "  movq %%mm0, %%mm2           \n\t"

    "  psubusb %%mm1, %%mm0        \n\t" /* A - B (saturating, so >=0) */
    "  psubusb %%mm2, %%mm1        \n\t" /* B - A (saturating, so >=0) */
    "  por %%mm1, %%mm0            \n\t" /* or of the two gives abs difference */
    "  movq %%mm0, %%mm1           \n\t"

    "  punpcklbw %%mm6, %%mm0      \n\t" /* unpack low 4 bytes to words for accumulation */
    "  paddw %%mm0, %%mm7          \n\t" /* accumulate difference... */
    "  punpckhbw %%mm6, %%mm1      \n\t" /* unpack high 4 bytes to words */
    "  add %3, %1                  \n\t" /* Inc pointer into the new data */
    "  paddw %%mm1, %%mm7          \n\t" /* accumulate difference... */
    "  add %4, %2                  \n\t" /* Inc pointer into ref data */
    ".endr                         \n\t"

    /* horizontal reduction: fold the four 16-bit lanes of mm7 together */
    "  movq %%mm7, %%mm0           \n\t"
    "  psrlq $32, %%mm7            \n\t"
    "  paddw %%mm0, %%mm7          \n\t"
    "  movq %%mm7, %%mm0           \n\t"
    "  psrlq $16, %%mm7            \n\t"
    "  paddw %%mm0, %%mm7          \n\t"
    "  movd %%mm7, %0              \n\t"
    "  andl $0xffff, %0            \n\t" /* only the low word is the sum */
    "  emms                        \n\t" /* leave MMX state clean for FP code */

    : "=m" (diff),
      "+r" (src1),
      "+r" (src2)
    : "r" (sstr1),
      "r" (sstr2)
    : "memory"
  );
  *dest = diff;
}
OIL_DEFINE_IMPL_FULL (sad8x8_u8_mmx, sad8x8_u8, OIL_IMPL_FLAG_MMX);

/*
 * sad8x8_u8_mmxext:
 * Same 8x8 SAD using the SSE/MMXEXT psadbw instruction, which computes
 * the whole 8-byte absolute-difference sum in one step.  Seven rows are
 * unrolled with the pointer increments; the eighth row is done outside
 * the .rept so the pointers are not advanced past the block.
 */
static void
sad8x8_u8_mmxext (uint32_t * dest, uint8_t * src1, int sstr1, uint8_t * src2,
    int sstr2)
{
  uint32_t diff;

  __asm__ __volatile__ (
    "  .balign 16                  \n\t"
    "  pxor %%mm7, %%mm7           \n\t" /* mm7 contains the result */

    ".rept 7                       \n\t"
    "  movq (%1), %%mm0            \n\t" /* take 8 bytes */
    "  movq (%2), %%mm1            \n\t"
    "  psadbw %%mm1, %%mm0         \n\t" /* SAD of the whole row in one insn */
    "  add %3, %1                  \n\t" /* Inc pointer into the new data */
    "  paddw %%mm0, %%mm7          \n\t" /* accumulate difference... */
    "  add %4, %2                  \n\t" /* Inc pointer into ref data */
    ".endr                         \n\t"

    /* last row: no pointer increment needed */
    "  movq (%1), %%mm0            \n\t" /* take 8 bytes */
    "  movq (%2), %%mm1            \n\t"
    "  psadbw %%mm1, %%mm0         \n\t"
    "  paddw %%mm0, %%mm7          \n\t" /* accumulate difference... */
    "  movd %%mm7, %0              \n\t"
    "  emms                        \n\t"

    : "=r" (diff),
      "+r" (src1),
      "+r" (src2)
    : "r" (sstr1),
      "r" (sstr2)
    : "memory"
  );
  *dest = diff;
}
OIL_DEFINE_IMPL_FULL (sad8x8_u8_mmxext, sad8x8_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <liboil/liboilfunction.h>

OIL_DECLARE_CLASS (sad8x8_u8_avg);

/*
 * sad8x8_u8_avg_mmx:
 * SAD between src1 and the per-byte average of src2/src3 over an 8x8
 * block, plain MMX.  The average is computed without widening via
 * avg(a,b) = (a & b) + ((a ^ b) >> 1): mm5 holds 0xfe repeated so the
 * shifted xor doesn't bleed bits between bytes — this is a FLOOR
 * average, matching the reference ( (a+b)/2 ).
 *
 * dest: receives the 32-bit SAD; ss1 strides src1, ss2 strides both
 * src2 and src3.
 */
static void
sad8x8_u8_avg_mmx (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, uint8_t *src3, int ss2)
{
  uint32_t diff;

  __asm__ __volatile__ (
    "  .balign 16                  \n\t"

    "  pcmpeqd %%mm5, %%mm5        \n\t" /* fefefefefefefefe in mm5 */
    "  paddb %%mm5, %%mm5          \n\t"

    "  pxor %%mm6, %%mm6           \n\t" /* zero out mm6 for unpack */
    "  pxor %%mm7, %%mm7           \n\t" /* mm7 contains the result */
    "  mov $8, %%edi               \n\t" /* 8 rows (edi is declared clobbered) */
    "1:                            \n\t"
    "  movq (%1), %%mm0            \n\t" /* take 8 bytes */

    "  movq (%2), %%mm2            \n\t"
    "  movq (%3), %%mm3            \n\t" /* take floor average of mm2 and mm3 */
    "  movq %%mm2, %%mm1           \n\t"
    "  pand %%mm3, %%mm1           \n\t"
    "  pxor %%mm2, %%mm3           \n\t"
    "  pand %%mm5, %%mm3           \n\t" /* mask lsb of each byte before shifting */
    "  psrlq $1, %%mm3             \n\t"
    "  paddb %%mm3, %%mm1          \n\t"

    "  movq %%mm0, %%mm2           \n\t"

    "  psubusb %%mm1, %%mm0        \n\t" /* A - B */
    "  psubusb %%mm2, %%mm1        \n\t" /* B - A */
    "  por %%mm1, %%mm0            \n\t" /* or of the two gives abs difference */
    "  movq %%mm0, %%mm1           \n\t"

    "  punpcklbw %%mm6, %%mm0      \n\t" /* unpack to higher precision for accumulation */
    "  paddw %%mm0, %%mm7          \n\t" /* accumulate difference... */
    "  punpckhbw %%mm6, %%mm1      \n\t" /* unpack high four bytes to higher precision */
    "  add %4, %1                  \n\t" /* Inc pointer into the new data */
    "  paddw %%mm1, %%mm7          \n\t" /* accumulate difference... */
    "  add %5, %2                  \n\t" /* Inc pointer into ref data */
    "  add %5, %3                  \n\t" /* Inc pointer into ref data */

    "  dec %%edi                   \n\t"
    "  jnz 1b                      \n\t"

    /* horizontal reduction of the four word lanes */
    "  movq %%mm7, %%mm0           \n\t"
    "  psrlq $32, %%mm7            \n\t"
    "  paddw %%mm0, %%mm7          \n\t"
    "  movq %%mm7, %%mm0           \n\t"
    "  psrlq $16, %%mm7            \n\t"
    "  paddw %%mm0, %%mm7          \n\t"
    "  movd %%mm7, %0              \n\t"
    "  andl $0xffff, %0            \n\t"
    "  emms                        \n\t"

    : "=m" (diff),
      "+r" (src1),
      "+r" (src2),
      "+r" (src3)
    : "m" (ss1),
      "m" (ss2)
    : "edi", "memory"
  );
  *dest = diff;
}

OIL_DEFINE_IMPL_FULL (sad8x8_u8_avg_mmx, sad8x8_u8_avg, OIL_IMPL_FLAG_MMX);

/*
 * sad8x8_u8_avg_mmxext:
 * Same operation using pavgb + psadbw.
 *
 * NOTE(review): pavgb computes a ROUNDED-UP average ((a+b+1)>>1), while
 * the MMX implementation above and the C reference use the floor
 * average ((a+b)/2).  Results can differ by the accumulated rounding —
 * confirm this is acceptable for the sad8x8_u8_avg class.
 */
static void
sad8x8_u8_avg_mmxext (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, uint8_t *src3, int ss2)
{
  uint32_t diff;

  __asm__ __volatile__ (
    "  .balign 16                  \n\t"
    "  pxor %%mm7, %%mm7           \n\t" /* mm7 contains the result */
    ".rept 8                       \n\t"
    "  movq (%1), %%mm0            \n\t" /* take 8 bytes */
    "  movq (%2), %%mm1            \n\t"
    "  movq (%3), %%mm2            \n\t"
    "  pavgb %%mm2, %%mm1          \n\t" /* rounded average of the two refs */
    "  psadbw %%mm1, %%mm0         \n\t"

    "  add %4, %1                  \n\t" /* Inc pointer into the new data */
    "  paddw %%mm0, %%mm7          \n\t" /* accumulate difference... */
    "  add %5, %2                  \n\t" /* Inc pointer into ref data */
    "  add %5, %3                  \n\t" /* Inc pointer into ref data */
    ".endr                         \n\t"

    "  movd %%mm7, %0              \n\t"
    "  emms                        \n\t"
    : "=m" (diff),
      "+r" (src1),
      "+r" (src2),
      "+r" (src3)
    : "m" (ss1),
      "m" (ss2)
    : "memory"
  );
  *dest = diff;
}
OIL_DEFINE_IMPL_FULL (sad8x8_u8_avg_mmxext, sad8x8_u8_avg, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> + +OIL_DEFINE_CLASS (diff8x8_s16_u8, + "int16_t *d_64, uint8_t *s1_8x8, int ss1, uint8_t *s2_8x8, int ss2"); +OIL_DEFINE_CLASS (diff8x8_const128_s16_u8, + "int16_t *d_64, uint8_t *s1_8x8, int ss1"); +OIL_DEFINE_CLASS (diff8x8_average_s16_u8, + "int16_t *d_64, uint8_t *s1_8x8, int ss1, uint8_t *s2_8x8, int ss2, uint8_t *s3_8x8, int ss3"); + +static void +diff8x8_s16_u8_ref (int16_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2) +{ + int i; + + /* For each block row */ + for (i=0;i<8;i++ ){ + dest[0] = ((int16_t)src1[0]) - ((int16_t)src2[0]); + dest[1] = ((int16_t)src1[1]) - ((int16_t)src2[1]); + dest[2] = ((int16_t)src1[2]) - ((int16_t)src2[2]); + dest[3] = ((int16_t)src1[3]) - ((int16_t)src2[3]); + dest[4] = ((int16_t)src1[4]) - ((int16_t)src2[4]); + dest[5] = ((int16_t)src1[5]) - ((int16_t)src2[5]); + dest[6] = ((int16_t)src1[6]) - ((int16_t)src2[6]); + dest[7] = ((int16_t)src1[7]) - ((int16_t)src2[7]); + + /* Start next row */ + src1 += ss1; + src2 += ss2; + dest += 8; + } +} +OIL_DEFINE_IMPL_REF (diff8x8_s16_u8_ref, diff8x8_s16_u8); + +static void +diff8x8_const128_s16_u8_ref (int16_t *dest, uint8_t *src1, int ss1) +{ + 
int i; + + /* For each block row */ + for (i=0;i<8;i++ ){ + dest[0] = ((int16_t)src1[0]) - 128; + dest[1] = ((int16_t)src1[1]) - 128; + dest[2] = ((int16_t)src1[2]) - 128; + dest[3] = ((int16_t)src1[3]) - 128; + dest[4] = ((int16_t)src1[4]) - 128; + dest[5] = ((int16_t)src1[5]) - 128; + dest[6] = ((int16_t)src1[6]) - 128; + dest[7] = ((int16_t)src1[7]) - 128; + + /* Start next row */ + src1 += ss1; + dest += 8; + } +} +OIL_DEFINE_IMPL_REF (diff8x8_const128_s16_u8_ref, diff8x8_const128_s16_u8); + +static void +diff8x8_average_s16_u8_ref (int16_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2, uint8_t *src3, int ss3) +{ + int i; + + /* For each block row */ + for (i=0;i<8;i++ ){ + dest[0] = ((int16_t)src1[0]) - ((((int16_t)src2[0]) + ((int16_t)src3[0])) / 2); + dest[1] = ((int16_t)src1[1]) - ((((int16_t)src2[1]) + ((int16_t)src3[1])) / 2); + dest[2] = ((int16_t)src1[2]) - ((((int16_t)src2[2]) + ((int16_t)src3[2])) / 2); + dest[3] = ((int16_t)src1[3]) - ((((int16_t)src2[3]) + ((int16_t)src3[3])) / 2); + dest[4] = ((int16_t)src1[4]) - ((((int16_t)src2[4]) + ((int16_t)src3[4])) / 2); + dest[5] = ((int16_t)src1[5]) - ((((int16_t)src2[5]) + ((int16_t)src3[5])) / 2); + dest[6] = ((int16_t)src1[6]) - ((((int16_t)src2[6]) + ((int16_t)src3[6])) / 2); + dest[7] = ((int16_t)src1[7]) - ((((int16_t)src2[7]) + ((int16_t)src3[7])) / 2); + + /* Start next row */ + src1 += ss1; + src2 += ss2; + src3 += ss3; + dest += 8; + } +} +OIL_DEFINE_IMPL_REF (diff8x8_average_s16_u8_ref, diff8x8_average_s16_u8); + + + + + + diff --git a/liboil/ref/error8x8.c b/liboil/ref/error8x8.c new file mode 100644 index 0000000..825d335 --- /dev/null +++ b/liboil/ref/error8x8.c @@ -0,0 +1,181 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> + + +OIL_DEFINE_CLASS (err_intra8x8_u8, + "uint32_t *d_1, uint8_t *s1_8x8, int ss1"); +OIL_DEFINE_CLASS (err_inter8x8_u8, + "uint32_t *d_1, uint8_t *s1_8x8, int ss1, uint8_t *s2_8x8, int ss2"); +OIL_DEFINE_CLASS (err_inter8x8_u8_avg, + "uint32_t *d_1, uint8_t *s1_8x8, int ss1, uint8_t *s2_8x8, uint8_t *s3_8x8, int ss2"); + +#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2) +#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b))) + +static void +err_intra8x8_u8_ref (uint32_t *dest, uint8_t *src1, int ss1) +{ + uint32_t i; + uint32_t xsum=0; + uint32_t xxsum=0; + + for (i=8; i; i--) { + /* Examine alternate pixel locations. 
*/ + xsum += src1[0]; + xxsum += src1[0]*src1[0]; + xsum += src1[1]; + xxsum += src1[1]*src1[1]; + xsum += src1[2]; + xxsum += src1[2]*src1[2]; + xsum += src1[3]; + xxsum += src1[3]*src1[3]; + xsum += src1[4]; + xxsum += src1[4]*src1[4]; + xsum += src1[5]; + xxsum += src1[5]*src1[5]; + xsum += src1[6]; + xxsum += src1[6]*src1[6]; + xsum += src1[7]; + xxsum += src1[7]*src1[7]; + + /* Step to next row of block. */ + src1 += ss1; + } + /* Compute population variance as mis-match metric. */ + *dest = (((xxsum<<6) - xsum*xsum )); +} +OIL_DEFINE_IMPL_REF (err_intra8x8_u8_ref, err_intra8x8_u8); + +static void +err_inter8x8_u8_ref (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2) +{ + uint32_t i; + uint32_t xsum=0; + uint32_t xxsum=0; + int32_t diff; + + for (i=8; i; i--) { + diff = DSP_OP_DIFF (src1[0], src2[0]); + xsum += diff; + xxsum += diff*diff; + + diff = DSP_OP_DIFF (src1[1], src2[1]); + xsum += diff; + xxsum += diff*diff; + + diff = DSP_OP_DIFF (src1[2], src2[2]); + xsum += diff; + xxsum += diff*diff; + + diff = DSP_OP_DIFF (src1[3], src2[3]); + xsum += diff; + xxsum += diff*diff; + + diff = DSP_OP_DIFF (src1[4], src2[4]); + xsum += diff; + xxsum += diff*diff; + + diff = DSP_OP_DIFF (src1[5], src2[5]); + xsum += diff; + xxsum += diff*diff; + + diff = DSP_OP_DIFF (src1[6], src2[6]); + xsum += diff; + xxsum += diff*diff; + + diff = DSP_OP_DIFF (src1[7], src2[7]); + xsum += diff; + xxsum += diff*diff; + + /* Step to next row of block. */ + src1 += ss1; + src2 += ss2; + } + + /* Compute and return population variance as mis-match metric. 
*/ + *dest = (((xxsum<<6) - xsum*xsum)); +} +OIL_DEFINE_IMPL_REF (err_inter8x8_u8_ref, err_inter8x8_u8); + +static void +err_inter8x8_u8_avg_ref (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, uint8_t *src3, int ss2) +{ + uint32_t i; + uint32_t xsum=0; + uint32_t xxsum=0; + int32_t diff; + + for (i=8; i; i--) { + diff = DSP_OP_DIFF(src1[0], DSP_OP_AVG (src2[0], src3[0])); + xsum += diff; + xxsum += diff*diff; + + diff = DSP_OP_DIFF(src1[1], DSP_OP_AVG (src2[1], src3[1])); + xsum += diff; + xxsum += diff*diff; + + diff = DSP_OP_DIFF(src1[2], DSP_OP_AVG (src2[2], src3[2])); + xsum += diff; + xxsum += diff*diff; + + diff = DSP_OP_DIFF(src1[3], DSP_OP_AVG (src2[3], src3[3])); + xsum += diff; + xxsum += diff*diff; + + diff = DSP_OP_DIFF(src1[4], DSP_OP_AVG (src2[4], src3[4])); + xsum += diff; + xxsum += diff*diff; + + diff = DSP_OP_DIFF(src1[5], DSP_OP_AVG (src2[5], src3[5])); + xsum += diff; + xxsum += diff*diff; + + diff = DSP_OP_DIFF(src1[6], DSP_OP_AVG (src2[6], src3[6])); + xsum += diff; + xxsum += diff*diff; + + diff = DSP_OP_DIFF(src1[7], DSP_OP_AVG (src2[7], src3[7])); + xsum += diff; + xxsum += diff*diff; + + /* Step to next row of block. */ + src1 += ss1; + src2 += ss2; + src3 += ss2; + } + + /* Compute and return population variance as mis-match metric. */ + *dest = (((xxsum<<6) - xsum*xsum)); +} + +OIL_DEFINE_IMPL_REF (err_inter8x8_u8_avg_ref, err_inter8x8_u8_avg); diff --git a/liboil/ref/recon8x8.c b/liboil/ref/recon8x8.c new file mode 100644 index 0000000..0455198 --- /dev/null +++ b/liboil/ref/recon8x8.c @@ -0,0 +1,112 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> +#include <liboil/simdpack/simdpack.h> +#include <liboil/liboilcolorspace.h> + + +OIL_DEFINE_CLASS (recon8x8_intra, + "uint8_t *d_8x8, int ds, int16_t *s_8x8"); +OIL_DEFINE_CLASS (recon8x8_inter, + "uint8_t *d_8x8, int ds, uint8_t *s1_8x8, int ss1, int16_t *s2_8x8"); +OIL_DEFINE_CLASS (recon8x8_inter2, + "uint8_t *d_8x8, int ds, uint8_t *s1_8x8, int ss1, uint8_t *s2_8x8, int ss2, int16_t *s3_8x8"); + + +static void +recon8x8_intra_ref (uint8_t *dest, int ds, int16_t *change) +{ + uint32_t i; + + for (i = 8; i; i--){ + dest[0] = oil_clamp_255(change[0] + 128); + dest[1] = oil_clamp_255(change[1] + 128); + dest[2] = oil_clamp_255(change[2] + 128); + dest[3] = oil_clamp_255(change[3] + 128); + dest[4] = oil_clamp_255(change[4] + 128); + dest[5] = oil_clamp_255(change[5] + 128); + dest[6] = oil_clamp_255(change[6] + 128); + dest[7] = oil_clamp_255(change[7] + 128); + + dest += ds; + change += 8; + } +} + 
+OIL_DEFINE_IMPL_REF (recon8x8_intra_ref, recon8x8_intra); + +static void +recon8x8_inter_ref (uint8_t *dest, int ds, uint8_t *src, int ss, int16_t *change, int dss) +{ + uint32_t i; + + for (i = 8; i; i--){ + dest[0] = oil_clamp_255(src[0] + change[0]); + dest[1] = oil_clamp_255(src[1] + change[1]); + dest[2] = oil_clamp_255(src[2] + change[2]); + dest[3] = oil_clamp_255(src[3] + change[3]); + dest[4] = oil_clamp_255(src[4] + change[4]); + dest[5] = oil_clamp_255(src[5] + change[5]); + dest[6] = oil_clamp_255(src[6] + change[6]); + dest[7] = oil_clamp_255(src[7] + change[7]); + + change += 8; + dest += ds; + src += ss; + } +} + +OIL_DEFINE_IMPL_REF (recon8x8_inter_ref, recon8x8_inter); + +static void +recon8x8_inter2_ref (uint8_t *dest, int ds, uint8_t *s1, int ss1, uint8_t *s2, int ss2, int16_t *change) +{ + uint32_t i; + + for (i = 8; i; i--){ + dest[0] = oil_clamp_255((((int16_t)s1[0] + (int16_t)s2[0]) >> 1) + change[0]); + dest[1] = oil_clamp_255((((int16_t)s1[1] + (int16_t)s2[1]) >> 1) + change[1]); + dest[2] = oil_clamp_255((((int16_t)s1[2] + (int16_t)s2[2]) >> 1) + change[2]); + dest[3] = oil_clamp_255((((int16_t)s1[3] + (int16_t)s2[3]) >> 1) + change[3]); + dest[4] = oil_clamp_255((((int16_t)s1[4] + (int16_t)s2[4]) >> 1) + change[4]); + dest[5] = oil_clamp_255((((int16_t)s1[5] + (int16_t)s2[5]) >> 1) + change[5]); + dest[6] = oil_clamp_255((((int16_t)s1[6] + (int16_t)s2[6]) >> 1) + change[6]); + dest[7] = oil_clamp_255((((int16_t)s1[7] + (int16_t)s2[7]) >> 1) + change[7]); + + change += 8; + dest += ds; + s1 += ss1; + s2 += ss2; + } +} + +OIL_DEFINE_IMPL_REF (recon8x8_inter2_ref, recon8x8_inter2); diff --git a/liboil/ref/rowcolsad8x8.c b/liboil/ref/rowcolsad8x8.c new file mode 100644 index 0000000..ecddded --- /dev/null +++ b/liboil/ref/rowcolsad8x8.c @@ -0,0 +1,110 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org> + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> +#include <liboil/simdpack/simdpack.h> +#include <math.h> + +#define ABS(x) ((x)>0 ? 
(x) : -(x)) +#define DSP_OP_ABS_DIFF(a,b) ABS((((int)(a)) - ((int)(b)))) + +OIL_DEFINE_CLASS (rowsad8x8_u8, + "uint32_t *d_1, uint8_t *s1_8x8, uint8_t *s2_8x8"); +OIL_DEFINE_CLASS (colsad8x8_u8, + "uint32_t *d_1, uint8_t *s1_8x8, int ss1, uint8_t *s2_8x8, int ss2"); + +static void +rowsad8x8_u8_ref (uint32_t *dest, uint8_t *src1, uint8_t *src2) +{ + uint32_t SadValue; + uint32_t SadValue1; + + SadValue = DSP_OP_ABS_DIFF (src1[0], src2[0]) + + DSP_OP_ABS_DIFF (src1[1], src2[1]) + + DSP_OP_ABS_DIFF (src1[2], src2[2]) + + DSP_OP_ABS_DIFF (src1[3], src2[3]); + + SadValue1 = DSP_OP_ABS_DIFF (src1[4], src2[4]) + + DSP_OP_ABS_DIFF (src1[5], src2[5]) + + DSP_OP_ABS_DIFF (src1[6], src2[6]) + + DSP_OP_ABS_DIFF (src1[7], src2[7]); + + *dest = (SadValue > SadValue1) ? SadValue : SadValue1; +} +OIL_DEFINE_IMPL_REF (rowsad8x8_u8_ref, rowsad8x8_u8); + +static void +colsad8x8_u8_ref (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2) +{ + uint32_t SadValue[8] = {0,0,0,0,0,0,0,0}; + uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0}; + uint32_t MaxSad = 0; + uint32_t i; + + for ( i = 0; i < 4; i++ ){ + SadValue[0] += ABS(src1[0] - src2[0]); + SadValue[1] += ABS(src1[1] - src2[1]); + SadValue[2] += ABS(src1[2] - src2[2]); + SadValue[3] += ABS(src1[3] - src2[3]); + SadValue[4] += ABS(src1[4] - src2[4]); + SadValue[5] += ABS(src1[5] - src2[5]); + SadValue[6] += ABS(src1[6] - src2[6]); + SadValue[7] += ABS(src1[7] - src2[7]); + + src1 += ss1; + src2 += ss2; + } + + for ( i = 0; i < 4; i++ ){ + SadValue2[0] += ABS(src1[0] - src2[0]); + SadValue2[1] += ABS(src1[1] - src2[1]); + SadValue2[2] += ABS(src1[2] - src2[2]); + SadValue2[3] += ABS(src1[3] - src2[3]); + SadValue2[4] += ABS(src1[4] - src2[4]); + SadValue2[5] += ABS(src1[5] - src2[5]); + SadValue2[6] += ABS(src1[6] - src2[6]); + SadValue2[7] += ABS(src1[7] - src2[7]); + + src1 += ss1; + src2 += ss2; + } + + for ( i = 0; i < 8; i++ ){ + if ( SadValue[i] > MaxSad ) + MaxSad = SadValue[i]; + if ( SadValue2[i] > MaxSad ) + MaxSad = 
SadValue2[i]; + } + + *dest = MaxSad; +} +OIL_DEFINE_IMPL_REF (colsad8x8_u8_ref, colsad8x8_u8); + diff --git a/liboil/ref/sad8x8avg.c b/liboil/ref/sad8x8avg.c new file mode 100644 index 0000000..846547b --- /dev/null +++ b/liboil/ref/sad8x8avg.c @@ -0,0 +1,66 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> +#include <math.h> + +#define ABS(x) ((x)>0 ? 
(x) : -(x)) + +OIL_DEFINE_CLASS (sad8x8_u8_avg, + "uint32_t *d_1, uint8_t *s1_8x8, int ss1, uint8_t *s2_8x8, uint8_t *s3_8x8, int ss2"); + +static void +sad8x8_u8_avg_ref (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, uint8_t *src3, int ss2) +{ + int i; + uint32_t diff = 0; + + for (i=0; i<8;i++){ + diff += ABS(((int)src1[0]) - (((int)src2[0] + (int)src3[0]) / 2)); + diff += ABS(((int)src1[1]) - (((int)src2[1] + (int)src3[1]) / 2)); + diff += ABS(((int)src1[2]) - (((int)src2[2] + (int)src3[2]) / 2)); + diff += ABS(((int)src1[3]) - (((int)src2[3] + (int)src3[3]) / 2)); + diff += ABS(((int)src1[4]) - (((int)src2[4] + (int)src3[4]) / 2)); + diff += ABS(((int)src1[5]) - (((int)src2[5] + (int)src3[5]) / 2)); + diff += ABS(((int)src1[6]) - (((int)src2[6] + (int)src3[6]) / 2)); + diff += ABS(((int)src1[7]) - (((int)src2[7] + (int)src3[7]) / 2)); + + /* Step to next row of block. */ + src1 += ss1; + src2 += ss2; + src3 += ss2; + } + *dest = diff; +} + +OIL_DEFINE_IMPL_REF (sad8x8_u8_avg_ref, sad8x8_u8_avg); + + |