author     David Schleef <ds@schleef.org>  2005-08-03 03:31:18 +0000
committer  David Schleef <ds@schleef.org>  2005-08-03 03:31:18 +0000
commit     c1f358f68635378f5bbdf54b066d9072e8c18a82 (patch)
tree       682267b921428cdd052d2725c5082e77ad1099f3
parent     8e630a5dfd9e57732fa1c09772846a3fcdf0adf5 (diff)
download   liboil-c1f358f68635378f5bbdf54b066d9072e8c18a82.tar.gz
Patch from Wim Taymans adding a bunch of classes and MMX
implementations for libtheora. Heavily modified by ds.

        * Makefile.am:
        * liboil-uninstalled.pc.in:
        * liboil/copy/Makefile.am:
        * liboil/copy/copy.c:
        * liboil/copy/copy8x8.c:
        * liboil/copy/copy8x8_i386.c:
        * liboil/dct/Makefile.am:
        * liboil/dct/fdct8x8theora.c:
        * liboil/dct/fdct8x8theora_i386.c:
        * liboil/i386/Makefile.am:
        * liboil/i386/diff8x8_i386.c:
        * liboil/i386/error8x8_i386.c:
        * liboil/i386/recon8x8_i386.c:
        * liboil/i386/rowcolsad8x8_i386.c:
        * liboil/i386/sad8x8_i386.c:
        * liboil/i386/sad8x8avg_i386.c:
        * liboil/ref/Makefile.am:
        * liboil/ref/diff8x8.c:
        * liboil/ref/error8x8.c:
        * liboil/ref/recon8x8.c:
        * liboil/ref/rowcolsad8x8.c:
        * liboil/ref/sad8x8avg.c:
-rw-r--r--  ChangeLog                            27
-rw-r--r--  Makefile.am                           7
-rw-r--r--  liboil-uninstalled.pc.in             10
-rw-r--r--  liboil/copy/Makefile.am               2
-rw-r--r--  liboil/copy/copy.c                    1
-rw-r--r--  liboil/copy/copy8x8.c                63
-rw-r--r--  liboil/copy/copy8x8_i386.c           76
-rw-r--r--  liboil/dct/Makefile.am                4
-rw-r--r--  liboil/dct/fdct8x8theora.c          294
-rw-r--r--  liboil/dct/fdct8x8theora_i386.c     357
-rw-r--r--  liboil/i386/Makefile.am              27
-rw-r--r--  liboil/i386/diff8x8_i386.c          169
-rw-r--r--  liboil/i386/error8x8_i386.c         337
-rw-r--r--  liboil/i386/recon8x8_i386.c         165
-rw-r--r--  liboil/i386/rowcolsad8x8_i386.c     280
-rw-r--r--  liboil/i386/sad8x8_i386.c           120
-rw-r--r--  liboil/i386/sad8x8avg_i386.c        136
-rw-r--r--  liboil/ref/Makefile.am               26
-rw-r--r--  liboil/ref/diff8x8.c                117
-rw-r--r--  liboil/ref/error8x8.c               181
-rw-r--r--  liboil/ref/recon8x8.c               112
-rw-r--r--  liboil/ref/rowcolsad8x8.c           110
-rw-r--r--  liboil/ref/sad8x8avg.c               66
23 files changed, 2684 insertions, 3 deletions
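
Since the commit only lands liboil plumbing, a short usage sketch of the new classes may help when reading the diffs below. It assumes liboil's usual convention of exposing each OIL_DEFINE_CLASS as an oil_<class>() entry point once oil_init() has run; the prototypes are copied from the class strings in this patch, while the wrapper function and buffer names are purely illustrative.

/* Usage sketch only -- not part of the patch.  Assumes the generated
 * oil_copy8x8_u8() / oil_fdct8x8theora() wrappers that liboil derives
 * from the class definitions added below. */
#include <stdint.h>
#include <liboil/liboil.h>

static void
transform_block (uint8_t *dst, int dst_stride,
                 uint8_t *src, int src_stride,
                 int16_t *residual,   /* 64 int16_t, e.g. from a diff8x8 class */
                 int16_t *coeffs)     /* 64 int16_t of DCT output */
{
  oil_init ();   /* normally called once at startup; picks MMX impls if present */

  /* class copy8x8_u8: "uint8_t *d_8x8, int ds, uint8_t *s_8x8, int ss" */
  oil_copy8x8_u8 (dst, dst_stride, src, src_stride);

  /* class fdct8x8theora: "int16_t *s_8x8, int16_t *d_8x8" */
  oil_fdct8x8theora (residual, coeffs);
}
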
diff --git a/ChangeLog b/ChangeLog
index f445bf7..fab9be1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,32 @@
2005-08-02 David Schleef <ds@schleef.org>
+ Patch from Wim Taymans adding a bunch of classes and MMX
+ implementations for libtheora. Heavily modified by ds.
+ * Makefile.am:
+ * liboil-uninstalled.pc.in:
+ * liboil/copy/Makefile.am:
+ * liboil/copy/copy.c:
+ * liboil/copy/copy8x8.c:
+ * liboil/copy/copy8x8_i386.c:
+ * liboil/dct/Makefile.am:
+ * liboil/dct/fdct8x8theora.c:
+ * liboil/dct/fdct8x8theora_i386.c:
+ * liboil/i386/Makefile.am:
+ * liboil/i386/diff8x8_i386.c:
+ * liboil/i386/error8x8_i386.c:
+ * liboil/i386/recon8x8_i386.c:
+ * liboil/i386/rowcolsad8x8_i386.c:
+ * liboil/i386/sad8x8_i386.c:
+ * liboil/i386/sad8x8avg_i386.c:
+ * liboil/ref/Makefile.am:
+ * liboil/ref/diff8x8.c:
+ * liboil/ref/error8x8.c:
+ * liboil/ref/recon8x8.c:
+ * liboil/ref/rowcolsad8x8.c:
+ * liboil/ref/sad8x8avg.c:
+
+2005-08-02 David Schleef <ds@schleef.org>
+
* liboil/Makefile.am: add libcolorspace.h
* liboil/build_marshal.c: (main): use oil_init_no_optimize() to
save us from horrible build problems (like what happened today)
diff --git a/Makefile.am b/Makefile.am
index 3d35d50..57e5932 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -10,7 +10,12 @@ pkgconfig_DATA = liboil-$(LIBOIL_MAJORMINOR).pc
liboil-$(LIBOIL_MAJORMINOR).pc: liboil.pc
cp liboil.pc liboil-$(LIBOIL_MAJORMINOR).pc
-CLEANFILES = liboil-$(LIBOIL_MAJORMINOR).pc
+liboil-$(LIBOIL_MAJORMINOR)-uninstalled.pc: liboil-uninstalled.pc
+ cp liboil-uninstalled.pc liboil-$(LIBOIL_MAJORMINOR)-uninstalled.pc
+
+BUILT_SOURCES=liboil-$(LIBOIL_MAJORMINOR)-uninstalled.pc
+
+CLEANFILES = liboil-$(LIBOIL_MAJORMINOR).pc liboil-$(LIBOIL_MAJORMINOR)-uninstalled.pc
ACLOCAL_FLAGS = -I m4
diff --git a/liboil-uninstalled.pc.in b/liboil-uninstalled.pc.in
new file mode 100644
index 0000000..de72faf
--- /dev/null
+++ b/liboil-uninstalled.pc.in
@@ -0,0 +1,10 @@
+prefix=
+exec_prefix=
+libdir=${pcfiledir}/liboil/
+includedir=${pcfiledir}/
+
+Name: liboil-@LIBOIL_MAJORMINOR@ uninstalled
+Description: Library of Optimized Inner Loops
+Version: @VERSION@
+Libs: -L${libdir} -loil-@LIBOIL_MAJORMINOR@ -lm
+Cflags: -I${includedir}
diff --git a/liboil/copy/Makefile.am b/liboil/copy/Makefile.am
index 73dd865..5905f98 100644
--- a/liboil/copy/Makefile.am
+++ b/liboil/copy/Makefile.am
@@ -18,6 +18,7 @@ endif
if HAVE_CPU_I386
i386_sources = \
copy_i386.c \
+ copy8x8_i386.c \
splat_i386.c \
trans8x8_i386.c
else
@@ -35,6 +36,7 @@ endif
c_sources = \
copy.c \
+ copy8x8.c \
permute.c \
splat_ref.c \
tablelookup_ref.c \
diff --git a/liboil/copy/copy.c b/liboil/copy/copy.c
index 26aea18..84295da 100644
--- a/liboil/copy/copy.c
+++ b/liboil/copy/copy.c
@@ -77,4 +77,3 @@ copy_u8_ints (uint8_t *dest, uint8_t *src, int n)
}
OIL_DEFINE_IMPL (copy_u8_ints, copy_u8);
-
diff --git a/liboil/copy/copy8x8.c b/liboil/copy/copy8x8.c
new file mode 100644
index 0000000..561132c
--- /dev/null
+++ b/liboil/copy/copy8x8.c
@@ -0,0 +1,63 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+
+OIL_DEFINE_CLASS (copy8x8_u8, "uint8_t *d_8x8, int ds, uint8_t *s_8x8, int ss");
+
+static void
+copy8x8_u8_ref (uint8_t *d1, int ds, uint8_t *s1, int ss)
+{
+ int i,j;
+ for (i=0;i<8;i++){
+ for (j=0;j<8;j++){
+ d1[j] = s1[j];
+ }
+ d1 += ds;
+ s1 += ss;
+ }
+}
+OIL_DEFINE_IMPL_REF (copy8x8_u8_ref, copy8x8_u8);
+
+static void
+copy8x8_u8_ints (uint8_t *d1, int ds, uint8_t *s1, int ss)
+{
+ int j;
+ for (j=0;j<8;j++){
+ ((uint32_t*)d1)[0] = ((uint32_t*)s1)[0];
+ ((uint32_t*)d1)[1] = ((uint32_t*)s1)[1];
+
+ d1+=ds;
+ s1+=ss;
+ }
+}
+OIL_DEFINE_IMPL (copy8x8_u8_ints, copy8x8_u8);
+
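
A note on copy8x8_u8_ints just above: the uint32_t casts assume both rows are at least 4-byte aligned, so callers need aligned blocks and strides for it to be safe on strict-alignment targets. An alignment-safe variant (a sketch, not part of this patch) could simply memcpy each row:

/* Alignment-safe sketch, not in the patch: copy one 8-byte row at a time. */
#include <string.h>
#include <stdint.h>

static void
copy8x8_u8_memcpy (uint8_t *d1, int ds, uint8_t *s1, int ss)
{
  int j;
  for (j = 0; j < 8; j++) {
    memcpy (d1, s1, 8);   /* one row of the 8x8 block */
    d1 += ds;
    s1 += ss;
  }
}
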
diff --git a/liboil/copy/copy8x8_i386.c b/liboil/copy/copy8x8_i386.c
new file mode 100644
index 0000000..fd3dec9
--- /dev/null
+++ b/liboil/copy/copy8x8_i386.c
@@ -0,0 +1,76 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+
+OIL_DECLARE_CLASS(copy8x8_u8);
+
+static void
+copy8x8_u8_mmx (uint8_t *dest, int dstr, uint8_t *src, int sstr)
+{
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " lea (%2, %2, 2), %%edi \n\t"
+
+ " movq (%1), %%mm0 \n\t"
+ " movq (%1, %2), %%mm1 \n\t"
+ " movq (%1, %2, 2), %%mm2 \n\t"
+ " movq (%1, %%edi), %%mm3 \n\t"
+
+ " lea (%1, %2, 4), %1 \n\t"
+
+ " movq %%mm0, (%0) \n\t"
+ " movq %%mm1, (%0, %2) \n\t"
+ " movq %%mm2, (%0, %2, 2) \n\t"
+ " movq %%mm3, (%0, %%edi) \n\t"
+
+ " lea (%0, %2, 4), %0 \n\t"
+
+ " movq (%1), %%mm0 \n\t"
+ " movq (%1, %2), %%mm1 \n\t"
+ " movq (%1, %2, 2), %%mm2 \n\t"
+ " movq (%1, %%edi), %%mm3 \n\t"
+
+ " movq %%mm0, (%0) \n\t"
+ " movq %%mm1, (%0, %2) \n\t"
+ " movq %%mm2, (%0, %2, 2) \n\t"
+ " movq %%mm3, (%0, %%edi) \n\t"
+ " emms \n\t"
+ : "+a" (dest)
+ : "c" (src),
+ "r" (sstr),
+ "r" (dstr)
+ : "memory", "edi"
+ );
+}
+OIL_DEFINE_IMPL_FULL (copy8x8_u8_mmx, copy8x8_u8, OIL_IMPL_FLAG_MMX);
+
diff --git a/liboil/dct/Makefile.am b/liboil/dct/Makefile.am
index 9183724..af92bae 100644
--- a/liboil/dct/Makefile.am
+++ b/liboil/dct/Makefile.am
@@ -12,7 +12,8 @@ noinst_HEADERS = \
if HAVE_CPU_I386
i386_sources = \
- idct8x8_i386.c
+ idct8x8_i386.c \
+ fdct8x8theora_i386.c
else
i386_sources =
endif
@@ -30,6 +31,7 @@ c_sources = \
fdct8_f64.c \
fdct8x8_f64.c \
fdct8x8s_s16.c \
+ fdct8x8theora.c \
idct8_f64.c \
idct8x8_c.c \
imdct32_f32.c \
diff --git a/liboil/dct/fdct8x8theora.c b/liboil/dct/fdct8x8theora.c
new file mode 100644
index 0000000..b485525
--- /dev/null
+++ b/liboil/dct/fdct8x8theora.c
@@ -0,0 +1,294 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: fdct8x8theora.c,v 1.1 2005-08-03 03:31:18 ds Exp $
+
+ ********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/liboilfuncs.h>
+#include <liboil/dct/dct.h>
+#include <math.h>
+
+static int32_t xC1S7 = 64277;
+static int32_t xC2S6 = 60547;
+static int32_t xC3S5 = 54491;
+static int32_t xC4S4 = 46341;
+static int32_t xC5S3 = 36410;
+static int32_t xC6S2 = 25080;
+static int32_t xC7S1 = 12785;
+
+#define SIGNBITDUPPED(X) ((signed )(((X) & 0x80000000)) >> 31)
+#define DOROUND(X) ( (SIGNBITDUPPED(X) & (0xffff)) + (X) )
+
+OIL_DEFINE_CLASS(fdct8x8theora, "int16_t *s_8x8, int16_t *d_8x8");
+
+static void
+fdct8x8theora_ref(int16_t *src, int16_t *dest)
+{
+ int loop;
+
+ int32_t is07, is12, is34, is56;
+ int32_t is0734, is1256;
+ int32_t id07, id12, id34, id56;
+
+ int32_t irot_input_x, irot_input_y;
+ int32_t icommon_product1; /* Re-used product (c4s4 * (s12 - s56)). */
+ int32_t icommon_product2; /* Re-used product (c4s4 * (d12 + d56)). */
+
+ int32_t temp1, temp2; /* intermediate variable for computation */
+
+ int32_t InterData[64];
+ int32_t *ip = InterData;
+ int16_t * op = dest;
+ for (loop = 0; loop < 8; loop++){
+ /* Pre calculate some common sums and differences. */
+ is07 = src[0] + src[7];
+ is12 = src[1] + src[2];
+ is34 = src[3] + src[4];
+ is56 = src[5] + src[6];
+
+ id07 = src[0] - src[7];
+ id12 = src[1] - src[2];
+ id34 = src[3] - src[4];
+ id56 = src[5] - src[6];
+
+ is0734 = is07 + is34;
+ is1256 = is12 + is56;
+
+ /* Pre-Calculate some common product terms. */
+ icommon_product1 = xC4S4*(is12 - is56);
+ icommon_product1 = DOROUND(icommon_product1);
+ icommon_product1>>=16;
+
+ icommon_product2 = xC4S4*(id12 + id56);
+ icommon_product2 = DOROUND(icommon_product2);
+ icommon_product2>>=16;
+
+
+ ip[0] = (xC4S4*(is0734 + is1256));
+ ip[0] = DOROUND(ip[0]);
+ ip[0] >>= 16;
+
+ ip[4] = (xC4S4*(is0734 - is1256));
+ ip[4] = DOROUND(ip[4]);
+ ip[4] >>= 16;
+
+ /* Define inputs to rotation for outputs 2 and 6 */
+ irot_input_x = id12 - id56;
+ irot_input_y = is07 - is34;
+
+ /* Apply rotation for outputs 2 and 6. */
+ temp1=xC6S2*irot_input_x;
+ temp1=DOROUND(temp1);
+ temp1>>=16;
+ temp2=xC2S6*irot_input_y;
+ temp2=DOROUND(temp2);
+ temp2>>=16;
+ ip[2] = temp1 + temp2;
+
+ temp1=xC6S2*irot_input_y;
+ temp1=DOROUND(temp1);
+ temp1>>=16;
+ temp2=xC2S6*irot_input_x ;
+ temp2=DOROUND(temp2);
+ temp2>>=16;
+ ip[6] = temp1 -temp2 ;
+
+ /* Define inputs to rotation for outputs 1 and 7 */
+ irot_input_x = icommon_product1 + id07;
+ irot_input_y = -( id34 + icommon_product2 );
+
+ /* Apply rotation for outputs 1 and 7. */
+
+ temp1=xC1S7*irot_input_x;
+ temp1=DOROUND(temp1);
+ temp1>>=16;
+ temp2=xC7S1*irot_input_y;
+ temp2=DOROUND(temp2);
+ temp2>>=16;
+ ip[1] = temp1 - temp2;
+
+ temp1=xC7S1*irot_input_x;
+ temp1=DOROUND(temp1);
+ temp1>>=16;
+ temp2=xC1S7*irot_input_y ;
+ temp2=DOROUND(temp2);
+ temp2>>=16;
+ ip[7] = temp1 + temp2 ;
+
+ /* Define inputs to rotation for outputs 3 and 5 */
+ irot_input_x = id07 - icommon_product1;
+ irot_input_y = id34 - icommon_product2;
+
+ /* Apply rotation for outputs 3 and 5. */
+ temp1=xC3S5*irot_input_x;
+ temp1=DOROUND(temp1);
+ temp1>>=16;
+ temp2=xC5S3*irot_input_y ;
+ temp2=DOROUND(temp2);
+ temp2>>=16;
+ ip[3] = temp1 - temp2 ;
+
+ temp1=xC5S3*irot_input_x;
+ temp1=DOROUND(temp1);
+ temp1>>=16;
+ temp2=xC3S5*irot_input_y;
+ temp2=DOROUND(temp2);
+ temp2>>=16;
+ ip[5] = temp1 + temp2;
+
+ /* Increment data pointer for next row. */
+ src += 8 ;
+ ip += 8; /* advance pointer to next row */
+
+ }
+
+
+ /* Performed DCT on rows, now transform the columns */
+ ip = InterData;
+ for (loop = 0; loop < 8; loop++){
+ /* Pre calculate some common sums and differences. */
+ is07 = ip[0 * 8] + ip[7 * 8];
+ is12 = ip[1 * 8] + ip[2 * 8];
+ is34 = ip[3 * 8] + ip[4 * 8];
+ is56 = ip[5 * 8] + ip[6 * 8];
+
+ id07 = ip[0 * 8] - ip[7 * 8];
+ id12 = ip[1 * 8] - ip[2 * 8];
+ id34 = ip[3 * 8] - ip[4 * 8];
+ id56 = ip[5 * 8] - ip[6 * 8];
+
+ is0734 = is07 + is34;
+ is1256 = is12 + is56;
+
+ /* Pre-Calculate some common product terms. */
+ icommon_product1 = xC4S4*(is12 - is56) ;
+ icommon_product2 = xC4S4*(id12 + id56) ;
+ icommon_product1 = DOROUND(icommon_product1);
+ icommon_product2 = DOROUND(icommon_product2);
+ icommon_product1>>=16;
+ icommon_product2>>=16;
+
+
+ temp1 = xC4S4*(is0734 + is1256) ;
+ temp2 = xC4S4*(is0734 - is1256) ;
+ temp1 = DOROUND(temp1);
+ temp2 = DOROUND(temp2);
+ temp1>>=16;
+ temp2>>=16;
+ op[0*8] = (int16_t) temp1;
+ op[4*8] = (int16_t) temp2;
+
+ /* Define inputs to rotation for outputs 2 and 6 */
+ irot_input_x = id12 - id56;
+ irot_input_y = is07 - is34;
+
+ /* Apply rotation for outputs 2 and 6. */
+ temp1=xC6S2*irot_input_x;
+ temp1=DOROUND(temp1);
+ temp1>>=16;
+ temp2=xC2S6*irot_input_y;
+ temp2=DOROUND(temp2);
+ temp2>>=16;
+ op[2*8] = (int16_t) (temp1 + temp2);
+
+ temp1=xC6S2*irot_input_y;
+ temp1=DOROUND(temp1);
+ temp1>>=16;
+ temp2=xC2S6*irot_input_x ;
+ temp2=DOROUND(temp2);
+ temp2>>=16;
+ op[6*8] = (int16_t) (temp1 -temp2) ;
+
+ /* Define inputs to rotation for outputs 1 and 7 */
+ irot_input_x = icommon_product1 + id07;
+ irot_input_y = -( id34 + icommon_product2 );
+
+ /* Apply rotation for outputs 1 and 7. */
+ temp1=xC1S7*irot_input_x;
+ temp1=DOROUND(temp1);
+ temp1>>=16;
+ temp2=xC7S1*irot_input_y;
+ temp2=DOROUND(temp2);
+ temp2>>=16;
+ op[1*8] = (int16_t) (temp1 - temp2);
+
+ temp1=xC7S1*irot_input_x;
+ temp1=DOROUND(temp1);
+ temp1>>=16;
+ temp2=xC1S7*irot_input_y ;
+ temp2=DOROUND(temp2);
+ temp2>>=16;
+ op[7*8] = (int16_t) (temp1 + temp2);
+
+ /* Define inputs to rotation for outputs 3 and 5 */
+ irot_input_x = id07 - icommon_product1;
+ irot_input_y = id34 - icommon_product2;
+
+ /* Apply rotation for outputs 3 and 5. */
+ temp1=xC3S5*irot_input_x;
+ temp1=DOROUND(temp1);
+ temp1>>=16;
+ temp2=xC5S3*irot_input_y ;
+ temp2=DOROUND(temp2);
+ temp2>>=16;
+ op[3*8] = (int16_t) (temp1 - temp2) ;
+
+ temp1=xC5S3*irot_input_x;
+ temp1=DOROUND(temp1);
+ temp1>>=16;
+ temp2=xC3S5*irot_input_y;
+ temp2=DOROUND(temp2);
+ temp2>>=16;
+ op[5*8] = (int16_t) (temp1 + temp2);
+
+ /* Increment data pointer for next column. */
+ ip ++;
+ op ++;
+ }
+}
+
+OIL_DEFINE_IMPL_REF (fdct8x8theora_ref, fdct8x8theora);
+
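
The constants and the DOROUND macro in fdct8x8theora.c implement Q16 fixed-point arithmetic: each xCnSm is cos(n*pi/16) scaled by 2^16 (xC4S4 = 46341 is roughly 0.70711 * 65536), and DOROUND adds 0xffff to negative products so that the following >>16 truncates toward zero instead of flooring toward minus infinity. A small stand-alone illustration of that behaviour (illustration only, not part of the patch):

/* Illustration only: how DOROUND + >>16 behaves on a Q16 product. */
#include <assert.h>
#include <stdint.h>

#define SIGNBITDUPPED(X) ((signed )(((X) & 0x80000000)) >> 31)
#define DOROUND(X) ( (SIGNBITDUPPED(X) & (0xffff)) + (X) )

int
main (void)
{
  int32_t xC4S4 = 46341;            /* cos(pi/4) * 65536 */
  int32_t pos = xC4S4 * 100;        /* positive product, 70.71 in Q16 */
  int32_t neg = xC4S4 * -100;       /* negative product, -70.71 in Q16 */

  /* >>16 alone floors, so -70.71 would become -71; DOROUND restores
   * truncation toward zero, which is what the integer DCT expects. */
  assert ((DOROUND (pos) >> 16) == 70);
  assert ((DOROUND (neg) >> 16) == -70);
  return 0;
}
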
diff --git a/liboil/dct/fdct8x8theora_i386.c b/liboil/dct/fdct8x8theora_i386.c
new file mode 100644
index 0000000..6126adb
--- /dev/null
+++ b/liboil/dct/fdct8x8theora_i386.c
@@ -0,0 +1,357 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*==========================================================================
+ *
+ * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
+ * KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ *
+ * Copyright (c) 1999 - 2001 On2 Technologies Inc. All Rights Reserved.
+ *
+ *--------------------------------------------------------------------------*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/liboilfuncs.h>
+#include <liboil/dct/dct.h>
+#include <math.h>
+
+static const __attribute__ ((aligned(8),used)) int64_t xC1S7 = 0x0fb15fb15fb15fb15LL;
+static const __attribute__ ((aligned(8),used)) int64_t xC2S6 = 0x0ec83ec83ec83ec83LL;
+static const __attribute__ ((aligned(8),used)) int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL;
+static const __attribute__ ((aligned(8),used)) int64_t xC4S4 = 0x0b505b505b505b505LL;
+static const __attribute__ ((aligned(8),used)) int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL;
+static const __attribute__ ((aligned(8),used)) int64_t xC6S2 = 0x061f861f861f861f8LL;
+static const __attribute__ ((aligned(8),used)) int64_t xC7S1 = 0x031f131f131f131f1LL;
+
+#if defined(__MINGW32__) || defined(__CYGWIN__) || \
+ defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
+# define M(a) "_" #a
+#else
+# define M(a) #a
+#endif
+
+OIL_DECLARE_CLASS(fdct8x8theora);
+
+/* execute stage 1 of forward DCT */
+#define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp) \
+ " movq " #ip0 ", %%mm0 \n\t" \
+ " movq " #ip1 ", %%mm1 \n\t" \
+ " movq " #ip3 ", %%mm2 \n\t" \
+ " movq " #ip5 ", %%mm3 \n\t" \
+ " movq %%mm0, %%mm4 \n\t" \
+ " movq %%mm1, %%mm5 \n\t" \
+ " movq %%mm2, %%mm6 \n\t" \
+ " movq %%mm3, %%mm7 \n\t" \
+ \
+ " paddsw " #ip7 ", %%mm0 \n\t" /* mm0 = ip0 + ip7 = is07 */ \
+ " paddsw " #ip2 ", %%mm1 \n\t" /* mm1 = ip1 + ip2 = is12 */ \
+ " paddsw " #ip4 ", %%mm2 \n\t" /* mm2 = ip3 + ip4 = is34 */ \
+ " paddsw " #ip6 ", %%mm3 \n\t" /* mm3 = ip5 + ip6 = is56 */ \
+ " psubsw " #ip7 ", %%mm4 \n\t" /* mm4 = ip0 - ip7 = id07 */ \
+ " psubsw " #ip2 ", %%mm5 \n\t" /* mm5 = ip1 - ip2 = id12 */ \
+ \
+ " psubsw %%mm2, %%mm0 \n\t" /* mm0 = is07 - is34 */ \
+ \
+ " paddsw %%mm2, %%mm2 \n\t" \
+ \
+ " psubsw " #ip4 ", %%mm6 \n\t" /* mm6 = ip3 - ip4 = id34 */ \
+ \
+ " paddsw %%mm0, %%mm2 \n\t" /* mm2 = is07 + is34 = is0734 */ \
+ " psubsw %%mm3, %%mm1 \n\t" /* mm1 = is12 - is56 */ \
+ " movq %%mm0," #temp " \n\t" /* Save is07 - is34 to free mm0; */ \
+ " paddsw %%mm3, %%mm3 \n\t" \
+ " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + 1s56 = is1256 */ \
+ \
+ " psubsw " #ip6 ", %%mm7 \n\t" /* mm7 = ip5 - ip6 = id56 */ \
+ /* ------------------------------------------------------------------- */ \
+ " psubsw %%mm7, %%mm5 \n\t" /* mm5 = id12 - id56 */ \
+ " paddsw %%mm7, %%mm7 \n\t" \
+ " paddsw %%mm5, %%mm7 \n\t" /* mm7 = id12 + id56 */ \
+ /* ------------------------------------------------------------------- */ \
+ " psubsw %%mm3, %%mm2 \n\t" /* mm2 = is0734 - is1256 */ \
+ " paddsw %%mm3, %%mm3 \n\t" \
+ \
+ " movq %%mm2, %%mm0 \n\t" /* make a copy */ \
+ " paddsw %%mm2, %%mm3 \n\t" /* mm3 = is0734 + is1256 */ \
+ \
+ " pmulhw "M(xC4S4)", %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */ \
+ " paddw %%mm2, %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) */ \
+ " psrlw $15, %%mm2 \n\t" \
+ " paddw %%mm2, %%mm0 \n\t" /* Truncate mm0, now it is op[4] */ \
+ \
+ " movq %%mm3, %%mm2 \n\t" \
+ " movq %%mm0," #ip4 " \n\t" /* save ip4, now mm0,mm2 are free */ \
+ \
+ " movq %%mm3, %%mm0 \n\t" \
+ " pmulhw "M(xC4S4)", %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
+ \
+ " psrlw $15, %%mm2 \n\t" \
+ " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \
+ " paddw %%mm2, %%mm3 \n\t" /* Truncate mm3, now it is op[0] */ \
+ \
+ " movq %%mm3," #ip0 " \n\t" \
+ /* ------------------------------------------------------------------- */ \
+ " movq " #temp ", %%mm3 \n\t" /* mm3 = irot_input_y */ \
+ " pmulhw "M(xC2S6)", %%mm3 \n\t" /* mm3 = xC2S6 * irot_input_y - irot_input_y */ \
+ \
+ " movq " #temp ", %%mm2 \n\t" \
+ " movq %%mm2, %%mm0 \n\t" \
+ \
+ " psrlw $15, %%mm2 \n\t" /* mm3 = xC2S6 * irot_input_y */ \
+ " paddw %%mm0, %%mm3 \n\t" \
+ \
+ " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
+ " movq %%mm5, %%mm0 \n\t" \
+ \
+ " movq %%mm5, %%mm2 \n\t" \
+ " pmulhw "M(xC6S2)", %%mm0 \n\t" /* mm0 = xC6S2 * irot_input_x */ \
+ \
+ " psrlw $15, %%mm2 \n\t" \
+ " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \
+ \
+ " paddsw %%mm0, %%mm3 \n\t" /* ip[2] */ \
+ " movq %%mm3," #ip2 " \n\t" /* Save ip2 */ \
+ \
+ " movq %%mm5, %%mm0 \n\t" \
+ " movq %%mm5, %%mm2 \n\t" \
+ \
+ " pmulhw "M(xC2S6)", %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x - irot_input_x */ \
+ " psrlw $15, %%mm2 \n\t" \
+ \
+ " movq " #temp ", %%mm3 \n\t" \
+ " paddw %%mm0, %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x */ \
+ \
+ " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \
+ " movq %%mm3, %%mm2 \n\t" \
+ \
+ " pmulhw "M(xC6S2)", %%mm3 \n\t" /* mm3 = xC6S2 * irot_input_y */ \
+ " psrlw $15, %%mm2 \n\t" \
+ \
+ " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
+ " psubsw %%mm5, %%mm3 \n\t" \
+ \
+ " movq %%mm3," #ip6 " \n\t" \
+ /* ------------------------------------------------------------------- */ \
+ " movq "M(xC4S4)", %%mm0 \n\t" \
+ " movq %%mm1, %%mm2 \n\t" \
+ " movq %%mm1, %%mm3 \n\t" \
+ \
+ " pmulhw %%mm0, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \
+ " psrlw $15, %%mm2 \n\t" \
+ \
+ " paddw %%mm3, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) */ \
+ " paddw %%mm2, %%mm1 \n\t" /* Truncate mm1, now it is icommon_product1 */ \
+ \
+ " movq %%mm7, %%mm2 \n\t" \
+ " movq %%mm7, %%mm3 \n\t" \
+ \
+ " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \
+ " psrlw $15, %%mm2 \n\t" \
+ \
+ " paddw %%mm3, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \
+ " paddw %%mm2, %%mm7 \n\t" /* Truncate mm7, now it is icommon_product2 */ \
+ /* ------------------------------------------------------------------- */ \
+ " pxor %%mm0, %%mm0 \n\t" /* Clear mm0 */ \
+ " psubsw %%mm6, %%mm0 \n\t" /* mm0 = - id34 */ \
+ \
+ " psubsw %%mm7, %%mm0 \n\t" /* mm0 = - ( id34 + idcommon_product2 ) */ \
+ " paddsw %%mm6, %%mm6 \n\t" \
+ " paddsw %%mm0, %%mm6 \n\t" /* mm6 = id34 - icommon_product2 */ \
+ \
+ " psubsw %%mm1, %%mm4 \n\t" /* mm4 = id07 - icommon_product1 */ \
+ " paddsw %%mm1, %%mm1 \n\t" \
+ " paddsw %%mm4, %%mm1 \n\t" /* mm1 = id07 + icommon_product1 */ \
+ /* ------------------------------------------------------------------- */ \
+ " movq "M(xC1S7)", %%mm7 \n\t" \
+ " movq %%mm1, %%mm2 \n\t" \
+ \
+ " movq %%mm1, %%mm3 \n\t" \
+ " pmulhw %%mm7, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - irot_input_x */ \
+ \
+ " movq "M(xC7S1)", %%mm7 \n\t" \
+ " psrlw $15, %%mm2 \n\t" \
+ \
+ " paddw %%mm3, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x */ \
+ " paddw %%mm2, %%mm1 \n\t" /* Trucated */ \
+ \
+ " pmulhw %%mm7, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x */ \
+ " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
+ \
+ " movq %%mm0, %%mm5 \n\t" \
+ " movq %%mm0, %%mm2 \n\t" \
+ \
+ " movq "M(xC1S7)", %%mm7 \n\t" \
+ " pmulhw %%mm7, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y - irot_input_y */ \
+ \
+ " movq "M(xC7S1)", %%mm7 \n\t" \
+ " psrlw $15, %%mm2 \n\t" \
+ \
+ " paddw %%mm5, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y */ \
+ " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \
+ \
+ " pmulhw %%mm7, %%mm5 \n\t" /* mm5 = xC7S1 * irot_input_y */ \
+ " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \
+ \
+ " psubsw %%mm5, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */ \
+ " paddsw %%mm0, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = ip7 */ \
+ \
+ " movq %%mm1," #ip1 " \n\t" \
+ " movq %%mm3," #ip7 " \n\t" \
+ /* ------------------------------------------------------------------- */ \
+ " movq "M(xC3S5)", %%mm0 \n\t" \
+ " movq "M(xC5S3)", %%mm1 \n\t" \
+ \
+ " movq %%mm6, %%mm5 \n\t" \
+ " movq %%mm6, %%mm7 \n\t" \
+ \
+ " movq %%mm4, %%mm2 \n\t" \
+ " movq %%mm4, %%mm3 \n\t" \
+ \
+ " pmulhw %%mm0, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x - irot_input_x */ \
+ " pmulhw %%mm1, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y - irot_input_y */ \
+ \
+ " psrlw $15, %%mm2 \n\t" \
+ " psrlw $15, %%mm5 \n\t" \
+ \
+ " paddw %%mm3, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x */ \
+ " paddw %%mm7, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y */ \
+ \
+ " paddw %%mm2, %%mm4 \n\t" /* Truncated */ \
+ " paddw %%mm5, %%mm6 \n\t" /* Truncated */ \
+ \
+ " psubsw %%mm6, %%mm4 \n\t" /* ip3 */ \
+ " movq %%mm4," #ip3 " \n\t" \
+ \
+ " movq %%mm3, %%mm4 \n\t" \
+ " movq %%mm7, %%mm6 \n\t" \
+ \
+ " pmulhw %%mm1, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x - irot_input_x */ \
+ " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y - irot_input_y */ \
+ \
+ " paddw %%mm2, %%mm4 \n\t" \
+ " paddw %%mm5, %%mm6 \n\t" \
+ \
+ " paddw %%mm4, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x */ \
+ " paddw %%mm6, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y */ \
+ \
+ " paddw %%mm7, %%mm3 \n\t" /* ip5 */ \
+ " movq %%mm3," #ip5 " \n\t"
+
+#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7, \
+ op0,op1,op2,op3,op4,op5,op6,op7) \
+ " movq " #ip0 ", %%mm0 \n\t" /* mm0 = a0 a1 a2 a3 */ \
+ " movq " #ip4 ", %%mm4 \n\t" /* mm4 = e4 e5 e6 e7 */ \
+ " movq " #ip1 ", %%mm1 \n\t" /* mm1 = b0 b1 b2 b3 */ \
+ " movq " #ip5 ", %%mm5 \n\t" /* mm5 = f4 f5 f6 f7 */ \
+ " movq " #ip2 ", %%mm2 \n\t" /* mm2 = c0 c1 c2 c3 */ \
+ " movq " #ip6 ", %%mm6 \n\t" /* mm6 = g4 g5 g6 g7 */ \
+ " movq " #ip3 ", %%mm3 \n\t" /* mm3 = d0 d1 d2 d3 */ \
+ " movq %%mm1," #op1 " \n\t" /* save b0 b1 b2 b3 */ \
+ " movq " #ip7 ", %%mm7 \n\t" /* mm7 = h0 h1 h2 h3 */ \
+ /* Transpose 2x8 block */ \
+ " movq %%mm4, %%mm1 \n\t" /* mm1 = e3 e2 e1 e0 */ \
+ " punpcklwd %%mm5, %%mm4 \n\t" /* mm4 = f1 e1 f0 e0 */ \
+ " movq %%mm0," #op0 " \n\t" /* save a3 a2 a1 a0 */ \
+ " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */ \
+ " movq %%mm6, %%mm0 \n\t" /* mm0 = g3 g2 g1 g0 */ \
+ " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */ \
+ " movq %%mm4, %%mm5 \n\t" /* mm5 = f1 e1 f0 e0 */ \
+ " punpckldq %%mm6, %%mm4 \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \
+ " punpckhdq %%mm6, %%mm5 \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \
+ " movq %%mm1, %%mm6 \n\t" /* mm6 = f3 e3 f2 e2 */ \
+ " movq %%mm4," #op4 " \n\t" \
+ " punpckhwd %%mm7, %%mm0 \n\t" /* mm0 = h3 g3 h2 g2 */ \
+ " movq %%mm5," #op5 " \n\t" \
+ " punpckhdq %%mm0, %%mm6 \n\t" /* mm6 = h3 g3 f3 e3 = MM7 */ \
+ " movq " #op0 ", %%mm4 \n\t" /* mm4 = a3 a2 a1 a0 */ \
+ " punpckldq %%mm0, %%mm1 \n\t" /* mm1 = h2 g2 f2 e2 = MM6 */ \
+ " movq " #op1 ", %%mm5 \n\t" /* mm5 = b3 b2 b1 b0 */ \
+ " movq %%mm4, %%mm0 \n\t" /* mm0 = a3 a2 a1 a0 */ \
+ " movq %%mm6," #op7 " \n\t" \
+ " punpcklwd %%mm5, %%mm0 \n\t" /* mm0 = b1 a1 b0 a0 */ \
+ " movq %%mm1," #op6 " \n\t" \
+ " punpckhwd %%mm5, %%mm4 \n\t" /* mm4 = b3 a3 b2 a2 */ \
+ " movq %%mm2, %%mm5 \n\t" /* mm5 = c3 c2 c1 c0 */ \
+ " punpcklwd %%mm3, %%mm2 \n\t" /* mm2 = d1 c1 d0 c0 */ \
+ " movq %%mm0, %%mm1 \n\t" /* mm1 = b1 a1 b0 a0 */ \
+ " punpckldq %%mm2, %%mm0 \n\t" /* mm0 = d0 c0 b0 a0 = MM0 */ \
+ " punpckhdq %%mm2, %%mm1 \n\t" /* mm1 = d1 c1 b1 a1 = MM1 */ \
+ " movq %%mm4, %%mm2 \n\t" /* mm2 = b3 a3 b2 a2 */ \
+ " movq %%mm0," #op0 " \n\t" \
+ " punpckhwd %%mm3, %%mm5 \n\t" /* mm5 = d3 c3 d2 c2 */ \
+ " movq %%mm1," #op1 " \n\t" \
+ " punpckhdq %%mm5, %%mm4 \n\t" /* mm4 = d3 c3 b3 a3 = MM3 */ \
+ " punpckldq %%mm5, %%mm2 \n\t" /* mm2 = d2 c2 b2 a2 = MM2 */ \
+ " movq %%mm4," #op3 " \n\t" \
+ " movq %%mm2," #op2 " \n\t"
+
+
+static void
+fdct8x8theora_mmx(int16_t *src, int16_t *dest)
+{
+ int64_t __attribute__((aligned(8))) align_tmp[16];
+ int16_t *const temp= (int16_t*)align_tmp;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+ /*
+ * Input data is an 8x8 block. To make processing of the data more efficient
+ * we will transpose the block of data to two 4x8 blocks???
+ */
+ Transpose_mmx ( (%0), 16(%0), 32(%0), 48(%0), 8(%0), 24(%0), 40(%0), 56(%0),
+ (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1))
+ Fdct_mmx ( (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1), (%2))
+
+ Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
+ 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
+ Fdct_mmx (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
+
+ Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
+ 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
+ Fdct_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))
+
+ Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
+ 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
+ Fdct_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
+
+ " emms \n\t"
+
+ : "+r" (src),
+ "+r" (dest)
+ : "r" (temp)
+ : "memory"
+ );
+}
+
+OIL_DEFINE_IMPL_FULL (fdct8x8theora_mmx, fdct8x8theora, OIL_IMPL_FLAG_MMX);
+
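
The aligned 64-bit constants at the top of fdct8x8theora_i386.c are the same Q16 cosines replicated into four 16-bit lanes (0xb505 = 46341, and so on). Since pmulhw treats its operands as signed, for the constants whose top bit is set (xC1S7 through xC5S3) it actually computes ((c - 65536) * x) >> 16 = (c * x >> 16) - x; that is why the macro comments read "xC4S4 * x - x", why a paddw of the original value follows, and why the sign bit extracted with psrlw $15 is added afterwards to mimic DOROUND's truncation toward zero. A scalar illustration of the identity, offered as an assumption-level sketch with hypothetical names:

/* Scalar illustration of the pmulhw fix-up used in Fdct_mmx above. */
#include <assert.h>
#include <stdint.h>

static int16_t
q16_mul_pmulhw_style (int16_t x, uint16_t c)  /* c: Q16 cosine with top bit set, e.g. 0xb505 */
{
  /* pmulhw: high 16 bits of the signed 16x16 product, i.e. ((c-65536)*x) >> 16 */
  int32_t high = ((int32_t) (int16_t) c * x) >> 16;
  int32_t fixed = high + x;            /* now floor((c*x) / 65536) */
  fixed += (uint16_t) x >> 15;         /* add sign bit of x: truncate toward zero */
  return (int16_t) fixed;
}

int
main (void)
{
  /* 0xb505 = 46341 = cos(pi/4)*65536; compare against plain 32-bit math. */
  assert (q16_mul_pmulhw_style (100, 0xb505) == ((46341 * 100) >> 16));
  assert (q16_mul_pmulhw_style (-100, 0xb505) == -((46341 * 100) >> 16));
  return 0;
}
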
diff --git a/liboil/i386/Makefile.am b/liboil/i386/Makefile.am
new file mode 100644
index 0000000..1d66341
--- /dev/null
+++ b/liboil/i386/Makefile.am
@@ -0,0 +1,27 @@
+
+noinst_LTLIBRARIES = libi386.la
+
+sources = \
+ error8x8_i386.c \
+ recon8x8_i386.c \
+ rowcolsad8x8_i386.c \
+ sad8x8_i386.c \
+ sad8x8avg_i386.c \
+ diff8x8_i386.c
+
+if HAVE_CPU_I386
+i386_sources = $(sources)
+else
+i386_sources =
+endif
+
+if HAVE_CPU_AMD64
+amd64_sources = $(sources)
+else
+amd64_sources =
+endif
+
+libi386_la_SOURCES = \
+ $(i386_sources)
+libi386_la_CFLAGS = $(LIBOIL_CFLAGS)
+
diff --git a/liboil/i386/diff8x8_i386.c b/liboil/i386/diff8x8_i386.c
new file mode 100644
index 0000000..a0dc8ae
--- /dev/null
+++ b/liboil/i386/diff8x8_i386.c
@@ -0,0 +1,169 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+
+OIL_DECLARE_CLASS (diff8x8_s16_u8);
+OIL_DECLARE_CLASS (diff8x8_const128_s16_u8);
+OIL_DECLARE_CLASS (diff8x8_average_s16_u8);
+
+static const __attribute__ ((aligned(8),used)) int64_t V128w = 0x0080008000800080LL;
+
+#ifdef HAVE_LD_UNDERSCORE
+# define M(a) "_" #a
+#else
+# define M(a) #a
+#endif
+
+static void
+diff8x8_s16_u8_mmx (int16_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2)
+{
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm7, %%mm7 \n\t"
+
+ ".rept 8 \n\t"
+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
+ " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */
+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
+ " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
+ " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr) */
+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
+ " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr) */
+ /* start calculation */
+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr */
+ " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr */
+ " movq %%mm0, (%2) \n\t" /* write answer out */
+ " movq %%mm2, 8(%2) \n\t" /* write answer out */
+ /* Increment pointers */
+ " add $16, %2 \n\t"
+ " add %3, %0 \n\t"
+ " add %4, %1 \n\t"
+ ".endr \n\t"
+ " emms \n\t"
+
+ : "+r" (src1),
+ "+r" (src2),
+ "+r" (dest)
+ : "m" (ss1),
+ "m" (ss2)
+ : "memory"
+ );
+}
+OIL_DEFINE_IMPL_FULL (diff8x8_s16_u8_mmx, diff8x8_s16_u8, OIL_IMPL_FLAG_MMX);
+
+static void
+diff8x8_const128_s16_u8_mmx (int16_t *dest, uint8_t *src1, int ss1)
+{
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm7, %%mm7 \n\t"
+ " movq "M(V128w)", %%mm1 \n\t"
+
+ ".rept 8 \n\t"
+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
+ /* start calculation */
+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */
+ " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */
+ " movq %%mm0, (%1) \n\t" /* write answer out */
+ " movq %%mm2, 8(%1) \n\t" /* write answer out */
+ /* Increment pointers */
+ " add $16, %1 \n\t"
+ " add %2, %0 \n\t"
+ ".endr \n\t"
+ " emms \n\t"
+
+ : "+r" (src1),
+ "+r" (dest)
+ : "r" (ss1)
+ : "memory"
+ );
+}
+OIL_DEFINE_IMPL_FULL (diff8x8_const128_s16_u8_mmx, diff8x8_const128_s16_u8, OIL_IMPL_FLAG_MMX);
+
+static void
+diff8x8_average_s16_u8_mmx (int16_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2, uint8_t *src3)
+{
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm7, %%mm7 \n\t"
+
+ ".rept 8 \n\t"
+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
+ " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr1 */
+ " movq (%2), %%mm4 \n\t" /* mm1 = ReconPtr2 */
+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
+ " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
+ " movq %%mm4, %%mm5 \n\t" /* dup to prepare for up conversion */
+ /* convert from UINT8 to INT16 */
+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
+ " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr1) */
+ " punpcklbw %%mm7, %%mm4 \n\t" /* mm1 = INT16(ReconPtr2) */
+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
+ " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr1) */
+ " punpckhbw %%mm7, %%mm5 \n\t" /* mm3 = INT16(ReconPtr2) */
+ /* average ReconPtr1 and ReconPtr2 */
+ " paddw %%mm4, %%mm1 \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */
+ " paddw %%mm5, %%mm3 \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */
+ " psrlw $1, %%mm1 \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
+ " psrlw $1, %%mm3 \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
+ " movq %%mm0, (%3) \n\t" /* write answer out */
+ " movq %%mm2, 8(%3) \n\t" /* write answer out */
+ /* Increment pointers */
+ " add $16, %3 \n\t"
+ " add %4, %0 \n\t"
+ " add %5, %1 \n\t"
+ " add %5, %2 \n\t"
+ ".endr \n\t"
+ " emms \n\t"
+
+ : "+r" (src1),
+ "+r" (src2),
+ "+r" (src3),
+ "+r" (dest)
+ : "m" (ss1),
+ "m" (ss2)
+ : "memory"
+ );
+}
+OIL_DEFINE_IMPL_FULL (diff8x8_average_s16_u8_mmx, diff8x8_average_s16_u8, OIL_IMPL_FLAG_MMX);
+
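
The reference versions of these diff8x8 classes live in liboil/ref/diff8x8.c, which the diffstat lists but this excerpt does not show. Going by the MMX comments above (eight int16_t of output per source row, with diff8x8_const128 subtracting 128 and the average variant averaging two predictors first), the plain diff semantics are presumably along these lines; a hedged reconstruction, not the actual file contents:

/* Hedged reconstruction of what diff8x8_s16_u8 computes, inferred from
 * the MMX comments above; see liboil/ref/diff8x8.c for the real reference. */
#include <stdint.h>

static void
diff8x8_s16_u8_sketch (int16_t *dest, uint8_t *src1, int ss1,
                       uint8_t *src2, int ss2)
{
  int i, j;
  for (i = 0; i < 8; i++) {
    for (j = 0; j < 8; j++)
      dest[j] = (int16_t) src1[j] - (int16_t) src2[j];
    dest += 8;          /* output block is contiguous: 8 int16_t per row */
    src1 += ss1;
    src2 += ss2;
  }
}
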
diff --git a/liboil/i386/error8x8_i386.c b/liboil/i386/error8x8_i386.c
new file mode 100644
index 0000000..079604a
--- /dev/null
+++ b/liboil/i386/error8x8_i386.c
@@ -0,0 +1,337 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+
+OIL_DECLARE_CLASS (err_intra8x8_u8);
+OIL_DECLARE_CLASS (err_inter8x8_u8);
+OIL_DECLARE_CLASS (err_inter8x8_u8_avg);
+
+static void
+err_intra8x8_u8_mmx (uint32_t *dest, uint8_t *src1, int ss1)
+{
+ uint32_t xsum;
+ uint32_t xxsum;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm5, %%mm5 \n\t"
+ " pxor %%mm6, %%mm6 \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
+ " mov $8, %%edi \n\t"
+ "1: \n\t"
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+ " movq %%mm0, %%mm2 \n\t"
+
+ " punpcklbw %%mm6, %%mm0 \n\t"
+ " punpckhbw %%mm6, %%mm2 \n\t"
+
+ " paddw %%mm0, %%mm5 \n\t"
+ " paddw %%mm2, %%mm5 \n\t"
+
+ " pmaddwd %%mm0, %%mm0 \n\t"
+ " pmaddwd %%mm2, %%mm2 \n\t"
+
+ " paddd %%mm0, %%mm7 \n\t"
+ " paddd %%mm2, %%mm7 \n\t"
+
+ " add %3, %2 \n\t" /* Inc pointer into src data */
+
+ " dec %%edi \n\t"
+ " jnz 1b \n\t"
+
+ " movq %%mm5, %%mm0 \n\t"
+ " psrlq $32, %%mm5 \n\t"
+ " paddw %%mm0, %%mm5 \n\t"
+ " movq %%mm5, %%mm0 \n\t"
+ " psrlq $16, %%mm5 \n\t"
+ " paddw %%mm0, %%mm5 \n\t"
+ " movd %%mm5, %%edi \n\t"
+ " movsx %%di, %%edi \n\t"
+ " movl %%edi, %0 \n\t"
+
+ " movq %%mm7, %%mm0 \n\t"
+ " psrlq $32, %%mm7 \n\t"
+ " paddd %%mm0, %%mm7 \n\t"
+ " movd %%mm7, %1 \n\t"
+ " emms \n\t"
+
+ : "=r" (xsum),
+ "=r" (xxsum),
+ "+r" (src1)
+ : "r" (ss1)
+ : "edi", "memory"
+ );
+
+ /* Compute population variance as mis-match metric. */
+ *dest = (((xxsum<<6) - xsum*xsum));
+}
+OIL_DEFINE_IMPL_FULL (err_intra8x8_u8_mmx, err_intra8x8_u8, OIL_IMPL_FLAG_MMX);
+
+static void
+err_inter8x8_u8_mmx (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2)
+{
+ uint32_t xsum;
+ uint32_t xxsum;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm5, %%mm5 \n\t"
+ " pxor %%mm6, %%mm6 \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
+ " mov $8, %%edi \n\t"
+ "1: \n\t"
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%3), %%mm1 \n\t"
+ " movq %%mm0, %%mm2 \n\t"
+ " movq %%mm1, %%mm3 \n\t"
+
+ " punpcklbw %%mm6, %%mm0 \n\t"
+ " punpcklbw %%mm6, %%mm1 \n\t"
+ " punpckhbw %%mm6, %%mm2 \n\t"
+ " punpckhbw %%mm6, %%mm3 \n\t"
+
+ " psubsw %%mm1, %%mm0 \n\t"
+ " psubsw %%mm3, %%mm2 \n\t"
+
+ " paddw %%mm0, %%mm5 \n\t"
+ " paddw %%mm2, %%mm5 \n\t"
+
+ " pmaddwd %%mm0, %%mm0 \n\t"
+ " pmaddwd %%mm2, %%mm2 \n\t"
+
+ " paddd %%mm0, %%mm7 \n\t"
+ " paddd %%mm2, %%mm7 \n\t"
+
+ " add %4, %2 \n\t" /* Inc pointer into src data */
+ " add %5, %3 \n\t" /* Inc pointer into ref data */
+
+ " dec %%edi \n\t"
+ " jnz 1b \n\t"
+
+ " movq %%mm5, %%mm0 \n\t"
+ " psrlq $32, %%mm5 \n\t"
+ " paddw %%mm0, %%mm5 \n\t"
+ " movq %%mm5, %%mm0 \n\t"
+ " psrlq $16, %%mm5 \n\t"
+ " paddw %%mm0, %%mm5 \n\t"
+ " movd %%mm5, %%edi \n\t"
+ " movsx %%di, %%edi \n\t"
+ " movl %%edi, %0 \n\t"
+
+ " movq %%mm7, %%mm0 \n\t"
+ " psrlq $32, %%mm7 \n\t"
+ " paddd %%mm0, %%mm7 \n\t"
+ " movd %%mm7, %1 \n\t"
+ " emms \n\t"
+
+ : "=m" (xsum),
+ "=m" (xxsum),
+ "+r" (src1),
+ "+r" (src2)
+ : "m" (ss1),
+ "m" (ss2)
+ : "edi", "memory"
+ );
+
+ /* Compute and return population variance as mis-match metric. */
+ *dest = (((xxsum<<6) - xsum*xsum));
+}
+OIL_DEFINE_IMPL_FULL (err_inter8x8_u8_mmx, err_inter8x8_u8, OIL_IMPL_FLAG_MMX);
+
+static void
+err_inter8x8_u8_avg_mmx (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, uint8_t *src3, int ss2)
+{
+ uint32_t xsum;
+ uint32_t xxsum;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pcmpeqd %%mm4, %%mm4 \n\t" /* fefefefefefefefe in mm4 */
+ " paddb %%mm4, %%mm4 \n\t"
+ " pxor %%mm5, %%mm5 \n\t"
+ " pxor %%mm6, %%mm6 \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
+ " mov $8, %%edi \n\t"
+ "1: \n\t"
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+
+ " movq (%3), %%mm2 \n\t"
+ " movq (%4), %%mm3 \n\t" /* take average of mm2 and mm3 */
+ " movq %%mm2, %%mm1 \n\t"
+ " pand %%mm3, %%mm1 \n\t"
+ " pxor %%mm2, %%mm3 \n\t"
+ " pand %%mm4, %%mm3 \n\t"
+ " psrlq $1, %%mm3 \n\t"
+ " paddb %%mm3, %%mm1 \n\t"
+
+ " movq %%mm0, %%mm2 \n\t"
+ " movq %%mm1, %%mm3 \n\t"
+
+ " punpcklbw %%mm6, %%mm0 \n\t"
+ " punpcklbw %%mm6, %%mm1 \n\t"
+ " punpckhbw %%mm6, %%mm2 \n\t"
+ " punpckhbw %%mm6, %%mm3 \n\t"
+
+ " psubsw %%mm1, %%mm0 \n\t"
+ " psubsw %%mm3, %%mm2 \n\t"
+
+ " paddw %%mm0, %%mm5 \n\t"
+ " paddw %%mm2, %%mm5 \n\t"
+
+ " pmaddwd %%mm0, %%mm0 \n\t"
+ " pmaddwd %%mm2, %%mm2 \n\t"
+
+ " paddd %%mm0, %%mm7 \n\t"
+ " paddd %%mm2, %%mm7 \n\t"
+
+ " add %5, %2 \n\t" /* Inc pointer into src data */
+ " add %6, %3 \n\t" /* Inc pointer into ref data */
+ " add %6, %4 \n\t" /* Inc pointer into ref data */
+
+ " dec %%edi \n\t"
+ " jnz 1b \n\t"
+
+ " movq %%mm5, %%mm0 \n\t"
+ " psrlq $32, %%mm5 \n\t"
+ " paddw %%mm0, %%mm5 \n\t"
+ " movq %%mm5, %%mm0 \n\t"
+ " psrlq $16, %%mm5 \n\t"
+ " paddw %%mm0, %%mm5 \n\t"
+ " movd %%mm5, %%edi \n\t"
+ " movsx %%di, %%edi \n\t"
+ " movl %%edi, %0 \n\t"
+
+ " movq %%mm7, %%mm0 \n\t"
+ " psrlq $32, %%mm7 \n\t"
+ " paddd %%mm0, %%mm7 \n\t"
+ " movd %%mm7, %1 \n\t"
+ " emms \n\t"
+
+ : "=m" (xsum),
+ "=m" (xxsum),
+ "+r" (src1),
+ "+r" (src2),
+ "+r" (src3)
+ : "m" (ss1),
+ "m" (ss2)
+ : "edi", "memory"
+ );
+
+ /* Compute and return population variance as mis-match metric. */
+ *dest = (((xxsum<<6) - xsum*xsum));
+}
+
+OIL_DEFINE_IMPL_FULL (err_inter8x8_u8_avg_mmx, err_inter8x8_u8_avg, OIL_IMPL_FLAG_MMX);
+
+static void
+err_inter8x8_u8_avg_mmxext (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, uint8_t *src3, int ss2)
+{
+ uint32_t xsum;
+ uint32_t xxsum;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm4, %%mm4 \n\t"
+ " pxor %%mm5, %%mm5 \n\t"
+ " pxor %%mm6, %%mm6 \n\t"
+ " pxor %%mm7, %%mm7 \n\t"
+ " mov $8, %%edi \n\t"
+ "1: \n\t"
+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */
+
+ " movq (%3), %%mm2 \n\t"
+ " movq (%4), %%mm1 \n\t" /* take average of mm2 and mm1 */
+ " pavgb %%mm2, %%mm1 \n\t"
+
+ " movq %%mm0, %%mm2 \n\t"
+ " movq %%mm1, %%mm3 \n\t"
+
+ " punpcklbw %%mm6, %%mm0 \n\t"
+ " punpcklbw %%mm4, %%mm1 \n\t"
+ " punpckhbw %%mm6, %%mm2 \n\t"
+ " punpckhbw %%mm4, %%mm3 \n\t"
+
+ " psubsw %%mm1, %%mm0 \n\t"
+ " psubsw %%mm3, %%mm2 \n\t"
+
+ " paddw %%mm0, %%mm5 \n\t"
+ " paddw %%mm2, %%mm5 \n\t"
+
+ " pmaddwd %%mm0, %%mm0 \n\t"
+ " pmaddwd %%mm2, %%mm2 \n\t"
+
+ " paddd %%mm0, %%mm7 \n\t"
+ " paddd %%mm2, %%mm7 \n\t"
+
+ " add %5, %2 \n\t" /* Inc pointer into src data */
+ " add %6, %3 \n\t" /* Inc pointer into ref data */
+ " add %6, %4 \n\t" /* Inc pointer into ref data */
+
+ " dec %%edi \n\t"
+ " jnz 1b \n\t"
+
+ " movq %%mm5, %%mm0 \n\t"
+ " psrlq $32, %%mm5 \n\t"
+ " paddw %%mm0, %%mm5 \n\t"
+ " movq %%mm5, %%mm0 \n\t"
+ " psrlq $16, %%mm5 \n\t"
+ " paddw %%mm0, %%mm5 \n\t"
+ " movd %%mm5, %%edi \n\t"
+ " movsx %%di, %%edi \n\t"
+ " movl %%edi, %0 \n\t"
+
+ " movq %%mm7, %%mm0 \n\t"
+ " psrlq $32, %%mm7 \n\t"
+ " paddd %%mm0, %%mm7 \n\t"
+ " movd %%mm7, %1 \n\t"
+ " emms \n\t"
+
+ : "=m" (xsum),
+ "=m" (xxsum),
+ "+r" (src1),
+ "+r" (src2),
+ "+r" (src3)
+ : "m" (ss1),
+ "m" (ss2)
+ : "edi", "memory"
+ );
+
+ /* Compute and return population variance as mis-match metric. */
+ *dest = (((xxsum<<6) - xsum*xsum));
+}
+
+OIL_DEFINE_IMPL_FULL (err_inter8x8_u8_avg_mmxext, err_inter8x8_u8_avg, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
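
All four error8x8 implementations reduce to the same metric: with d being the per-pixel differences over the 8x8 block, they return 64 * sum(d^2) - (sum(d))^2, i.e. 64^2 times the population variance of the differences, matching the final "(xxsum<<6) - xsum*xsum" line. A scalar sketch of err_inter8x8_u8 (hypothetical helper; the reference shipped with this patch is in liboil/ref/error8x8.c):

/* Scalar sketch of the err_inter8x8_u8 metric. */
#include <stdint.h>

static void
err_inter8x8_u8_sketch (uint32_t *dest, uint8_t *src1, int ss1,
                        uint8_t *src2, int ss2)
{
  int32_t xsum = 0, xxsum = 0;
  int i, j;

  for (i = 0; i < 8; i++) {
    for (j = 0; j < 8; j++) {
      int32_t d = (int32_t) src1[j] - (int32_t) src2[j];
      xsum += d;
      xxsum += d * d;
    }
    src1 += ss1;
    src2 += ss2;
  }
  /* 64 * sum(d^2) - (sum(d))^2: population variance scaled by 64^2 */
  *dest = (uint32_t) ((xxsum << 6) - xsum * xsum);
}
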
diff --git a/liboil/i386/recon8x8_i386.c b/liboil/i386/recon8x8_i386.c
new file mode 100644
index 0000000..91df0d6
--- /dev/null
+++ b/liboil/i386/recon8x8_i386.c
@@ -0,0 +1,165 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/simdpack/simdpack.h>
+
+OIL_DECLARE_CLASS (recon8x8_intra);
+OIL_DECLARE_CLASS (recon8x8_inter);
+OIL_DECLARE_CLASS (recon8x8_inter2);
+
+static const __attribute__ ((aligned(8),used)) uint64_t V128 = 0x8080808080808080LL;
+
+#ifdef HAVE_LD_UNDERSCORE
+# define M(a) "_" #a
+#else
+# define M(a) #a
+#endif
+
+static void
+recon8x8_intra_mmx (uint8_t *dest, int ds, int16_t *change)
+{
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " movq "M(V128)", %%mm0 \n\t" /* Set mm0 to 0x8080808080808080 */
+
+ " lea 128(%1), %%edi \n\t" /* Endpoint in input buffer */
+ "1: \n\t"
+ " movq (%1), %%mm2 \n\t" /* First four input values */
+
+ " packsswb 8(%1), %%mm2 \n\t" /* pack with next(high) four values */
+ " por %%mm0, %%mm0 \n\t"
+ " pxor %%mm0, %%mm2 \n\t" /* Convert result to unsigned (same as add 128) */
+ " lea 16(%1), %1 \n\t" /* Step source buffer */
+ " cmp %%edi, %1 \n\t" /* are we done */
+
+ " movq %%mm2, (%0) \n\t" /* store results */
+
+ " lea (%0, %2), %0 \n\t" /* Step output buffer */
+ " jc 1b \n\t" /* Loop back if we are not done */
+ " emms \n\t"
+ : "+r" (dest)
+ : "r" (change),
+ "r" (ds)
+ : "memory", "edi"
+ );
+}
+
+OIL_DEFINE_IMPL_FULL (recon8x8_intra_mmx, recon8x8_intra, OIL_IMPL_FLAG_MMX);
+
+static void
+recon8x8_inter_mmx (uint8_t *dest, int ds, uint8_t *src, int ss, int16_t *change)
+{
+ /* FIXME doesn't handle ss */
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm0, %%mm0 \n\t"
+ " lea 128(%1), %%edi \n\t"
+
+ "1: \n\t"
+ " movq (%2), %%mm2 \n\t" /* (+3 misaligned) 8 reference pixels */
+
+ " movq (%1), %%mm4 \n\t" /* first 4 changes */
+ " movq %%mm2, %%mm3 \n\t"
+ " movq 8(%1), %%mm5 \n\t" /* last 4 changes */
+ " punpcklbw %%mm0, %%mm2 \n\t" /* turn first 4 refs into positive 16-bit #s */
+ " paddsw %%mm4, %%mm2 \n\t" /* add in first 4 changes */
+ " punpckhbw %%mm0, %%mm3 \n\t" /* turn last 4 refs into positive 16-bit #s */
+ " paddsw %%mm5, %%mm3 \n\t" /* add in last 4 changes */
+ " add %3, %2 \n\t" /* next row of reference pixels */
+ " packuswb %%mm3, %%mm2 \n\t" /* pack result to unsigned 8-bit values */
+ " lea 16(%1), %1 \n\t" /* next row of changes */
+ " cmp %%edi, %1 \n\t" /* are we done? */
+
+ " movq %%mm2, (%0) \n\t" /* store result */
+
+ " lea (%0, %3), %0 \n\t" /* next row of output */
+ " jc 1b \n\t"
+ " emms \n\t"
+ : "+r" (dest)
+ : "r" (change),
+ "r" (src),
+ "r" (ds)
+ : "memory", "edi"
+ );
+}
+
+OIL_DEFINE_IMPL_FULL (recon8x8_inter_mmx, recon8x8_inter, OIL_IMPL_FLAG_MMX);
+
+static void
+recon8x8_inter2_mmx (uint8_t *dest, int ds, uint8_t *s1, int ss1, uint8_t *s2, int ss2, int16_t *change)
+{
+ /* FIXME doesn't handle ss1, ss2 */
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm0, %%mm0 \n\t"
+ " lea 128(%1), %%edi \n\t"
+
+ "1: \n\t"
+ " movq (%2), %%mm2 \n\t" /* (+3 misaligned) 8 reference pixels */
+ " movq (%3), %%mm4 \n\t" /* (+3 misaligned) 8 reference pixels */
+
+ " movq %%mm2, %%mm3 \n\t"
+ " punpcklbw %%mm0, %%mm2 \n\t" /* mm2 = start ref1 as positive 16-bit #s */
+ " movq %%mm4, %%mm5 \n\t"
+ " movq (%1), %%mm6 \n\t" /* first 4 changes */
+ " punpckhbw %%mm0, %%mm3 \n\t" /* mm3 = end ref1 as positive 16-bit #s */
+ " movq 8(%1), %%mm7 \n\t" /* last 4 changes */
+ " punpcklbw %%mm0, %%mm4 \n\t" /* mm4 = start ref2 as positive 16-bit #s */
+ " punpckhbw %%mm0, %%mm5 \n\t" /* mm5 = end ref2 as positive 16-bit #s */
+ " paddw %%mm4, %%mm2 \n\t" /* mm2 = start (ref1 + ref2) */
+ " paddw %%mm5, %%mm3 \n\t" /* mm3 = end (ref1 + ref2) */
+ " psrlw $1, %%mm2 \n\t" /* mm2 = start (ref1 + ref2)/2 */
+ " psrlw $1, %%mm3 \n\t" /* mm3 = end (ref1 + ref2)/2 */
+ " paddw %%mm6, %%mm2 \n\t" /* add changes to start */
+ " paddw %%mm7, %%mm3 \n\t" /* add changes to end */
+ " lea 16(%1), %1 \n\t" /* next row of changes */
+ " packuswb %%mm3, %%mm2 \n\t" /* pack start|end to unsigned 8-bit */
+ " add %4, %2 \n\t" /* next row of reference pixels */
+ " add %4, %3 \n\t" /* next row of reference pixels */
+ " movq %%mm2, (%0) \n\t" /* store result */
+ " add %4, %0 \n\t" /* next row of output */
+ " cmp %%edi, %1 \n\t" /* are we done? */
+ " jc 1b \n\t"
+ " emms \n\t"
+ : "+r" (dest)
+ : "r" (change),
+ "r" (s1),
+ "r" (s2),
+ "m" (ds)
+ : "memory", "edi"
+ );
+}
+
+OIL_DEFINE_IMPL_FULL (recon8x8_inter2_mmx, recon8x8_inter2, OIL_IMPL_FLAG_MMX);
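
recon8x8_intra above works by saturating the 16-bit coefficients to [-128,127] with packsswb and then flipping the sign bit (the xor with V128), which is the same as adding 128 and clamping to [0,255]. A scalar sketch of that intra path, again assumption-level and not the liboil/ref code itself:

/* Scalar sketch of recon8x8_intra: clamp(change + 128) per pixel. */
#include <stdint.h>

static uint8_t
clamp255 (int v)
{
  return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void
recon8x8_intra_sketch (uint8_t *dest, int ds, int16_t *change)
{
  int i, j;
  for (i = 0; i < 8; i++) {
    for (j = 0; j < 8; j++)
      dest[j] = clamp255 (change[j] + 128);
    dest += ds;
    change += 8;
  }
}
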
diff --git a/liboil/i386/rowcolsad8x8_i386.c b/liboil/i386/rowcolsad8x8_i386.c
new file mode 100644
index 0000000..b05b8a5
--- /dev/null
+++ b/liboil/i386/rowcolsad8x8_i386.c
@@ -0,0 +1,280 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+
+OIL_DECLARE_CLASS (rowsad8x8_u8);
+OIL_DECLARE_CLASS (colsad8x8_u8);
+
+static void
+rowsad8x8_u8_mmx (uint32_t *dest, uint8_t *src1, uint8_t *src2)
+{
+ uint32_t MaxSad;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
+ " pxor %%mm7, %%mm7 \n\t" /* zero out mm7 for unpack */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t"
+
+ " movq %%mm0, %%mm2 \n\t"
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+
+ " movq %%mm0, %%mm1 \n\t"
+
+ " punpcklbw %%mm6, %%mm0 \n\t" /* ; unpack low four bytes to higher precision */
+ " punpckhbw %%mm7, %%mm1 \n\t" /* ; unpack high four bytes to higher precision */
+
+ " movq %%mm0, %%mm2 \n\t"
+ " movq %%mm1, %%mm3 \n\t"
+ " psrlq $32, %%mm2 \n\t" /* fold and add */
+ " psrlq $32, %%mm3 \n\t"
+ " paddw %%mm2, %%mm0 \n\t"
+ " paddw %%mm3, %%mm1 \n\t"
+ " movq %%mm0, %%mm2 \n\t"
+ " movq %%mm1, %%mm3 \n\t"
+ " psrlq $16, %%mm2 \n\t"
+ " psrlq $16, %%mm3 \n\t"
+ " paddw %%mm2, %%mm0 \n\t"
+ " paddw %%mm3, %%mm1 \n\t"
+
+ " psubusw %%mm0, %%mm1 \n\t"
+ " paddw %%mm0, %%mm1 \n\t" /* mm1 = max(mm1, mm0) */
+ " movd %%mm1, %0 \n\t"
+ " andl $0xffff, %0 \n\t"
+ " emms \n\t"
+
+ : "=m" (MaxSad),
+ "+r" (src1),
+ "+r" (src2)
+ :
+ : "memory"
+ );
+ *dest = MaxSad;
+}
+OIL_DEFINE_IMPL_FULL (rowsad8x8_u8_mmx, rowsad8x8_u8, OIL_IMPL_FLAG_MMX);
+
+static void
+rowsad8x8_u8_mmxext (uint32_t *dest, uint8_t *src1, uint8_t *src2)
+{
+ uint32_t MaxSad;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " movd (%1), %%mm0 \n\t"
+ " movd (%2), %%mm1 \n\t"
+ " psadbw %%mm0, %%mm1 \n\t"
+ " movd 4(%1), %%mm2 \n\t"
+ " movd 4(%2), %%mm3 \n\t"
+ " psadbw %%mm2, %%mm3 \n\t"
+
+ " pmaxsw %%mm1, %%mm3 \n\t"
+ " movd %%mm3, %0 \n\t"
+ " andl $0xffff, %0 \n\t"
+ " emms \n\t"
+
+ : "=m" (MaxSad),
+ "+r" (src1),
+ "+r" (src2)
+ :
+ : "memory"
+ );
+ *dest = MaxSad;
+}
+OIL_DEFINE_IMPL_FULL (rowsad8x8_u8_mmxext, rowsad8x8_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+static void
+colsad8x8_u8_mmx (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2)
+{
+ uint32_t MaxSad;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
+ " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
+ " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
+ " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
+ " mov $4, %%edi \n\t" /* 4 rows */
+ "1: \n\t"
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */
+
+ " movq %%mm0, %%mm2 \n\t"
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " movq %%mm0, %%mm1 \n\t"
+
+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
+ " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
+ " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " add %3, %2 \n\t" /* Inc pointer into the new data */
+
+ " dec %%edi \n\t"
+ " jnz 1b \n\t"
+
+ " mov $4, %%edi \n\t" /* 4 rows */
+ "2: \n\t"
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */
+
+ " movq %%mm0, %%mm2 \n\t"
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " movq %%mm0, %%mm1 \n\t"
+
+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
+ " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " add %3, %2 \n\t" /* Inc pointer into the new data */
+
+ " dec %%edi \n\t"
+ " jnz 2b \n\t"
+
+ " psubusw %%mm6, %%mm7 \n\t"
+ " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm7, mm6) */
+ " psubusw %%mm4, %%mm5 \n\t"
+ " paddw %%mm4, %%mm5 \n\t" /* mm5 = max(mm5, mm4) */
+ " psubusw %%mm5, %%mm7 \n\t"
+ " paddw %%mm5, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
+ " movq %%mm7, %%mm6 \n\t"
+ " psrlq $32, %%mm6 \n\t"
+ " psubusw %%mm6, %%mm7 \n\t"
+ " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
+ " movq %%mm7, %%mm6 \n\t"
+ " psrlq $16, %%mm6 \n\t"
+ " psubusw %%mm6, %%mm7 \n\t"
+ " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
+ " movd %%mm7, %0 \n\t"
+ " andl $0xffff, %0 \n\t"
+ " emms \n\t"
+
+ : "=r" (MaxSad),
+ "+r" (src1),
+ "+r" (src2)
+ : "r" (ss1)
+ : "memory", "edi"
+ );
+ *dest = MaxSad;
+}
+OIL_DEFINE_IMPL_FULL (colsad8x8_u8_mmx, colsad8x8_u8, OIL_IMPL_FLAG_MMX);
+
+static void
+colsad8x8_u8_mmxext (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2)
+{
+ uint32_t MaxSad;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
+ " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
+ " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
+ " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
+ " mov $4, %%edi \n\t" /* 4 rows */
+ "1: \n\t"
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */
+
+ " movq %%mm0, %%mm2 \n\t"
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " movq %%mm0, %%mm1 \n\t"
+
+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
+ " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
+ " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " add %3, %2 \n\t" /* Inc pointer into the new data */
+
+ " dec %%edi \n\t"
+ " jnz 1b \n\t"
+
+ " mov $4, %%edi \n\t" /* 4 rows */
+ "2: \n\t"
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */
+
+ " movq %%mm0, %%mm2 \n\t"
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " movq %%mm0, %%mm1 \n\t"
+
+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
+ " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " add %3, %2 \n\t" /* Inc pointer into the new data */
+
+ " dec %%edi \n\t"
+ " jnz 2b \n\t"
+
+ " pmaxsw %%mm6, %%mm7 \n\t"
+ " pmaxsw %%mm4, %%mm5 \n\t"
+ " pmaxsw %%mm5, %%mm7 \n\t"
+ " movq %%mm7, %%mm6 \n\t"
+ " psrlq $32, %%mm6 \n\t"
+ " pmaxsw %%mm6, %%mm7 \n\t"
+ " movq %%mm7, %%mm6 \n\t"
+ " psrlq $16, %%mm6 \n\t"
+ " pmaxsw %%mm6, %%mm7 \n\t"
+ " movd %%mm7, %0 \n\t"
+ " andl $0xffff, %0 \n\t"
+ " emms \n\t"
+
+ : "=r" (MaxSad),
+ "+r" (src1),
+ "+r" (src2)
+ : "r" (ss1)
+ : "memory", "edi"
+ );
+
+ *dest = MaxSad;
+}
+OIL_DEFINE_IMPL_FULL (colsad8x8_u8_mmxext, colsad8x8_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
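
Two MMX idioms recur throughout rowcolsad8x8_i386.c: the absolute byte difference built from two saturating subtractions and an OR, and the unsigned 16-bit maximum built from a saturating subtraction followed by an add (the MMX-ext variants replace the latter with pmaxsw). Scalar equivalents, for reference only and not part of this patch:

#include <stdint.h>

/* |a - b| for unsigned bytes: psubusb saturates at zero, so one of the two
 * differences is 0 and the OR yields the other. */
static uint8_t
abs_diff_u8 (uint8_t a, uint8_t b)
{
  uint8_t d1 = (a > b) ? (uint8_t) (a - b) : 0;
  uint8_t d2 = (b > a) ? (uint8_t) (b - a) : 0;
  return d1 | d2;
}

/* max(a, b) for unsigned words: psubusw then paddw.
 * If a > b the result is (a - b) + b = a, otherwise 0 + b = b. */
static uint16_t
max_u16 (uint16_t a, uint16_t b)
{
  uint16_t d = (a > b) ? (uint16_t) (a - b) : 0;
  return (uint16_t) (d + b);
}
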
diff --git a/liboil/i386/sad8x8_i386.c b/liboil/i386/sad8x8_i386.c
new file mode 100644
index 0000000..242aa8c
--- /dev/null
+++ b/liboil/i386/sad8x8_i386.c
@@ -0,0 +1,120 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+
+OIL_DECLARE_CLASS (sad8x8_u8);
+
+static void
+sad8x8_u8_mmx (uint32_t * dest, uint8_t * src1, int sstr1, uint8_t * src2,
+ int sstr2)
+{
+ uint32_t diff;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+ " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+ ".rept 8 \n\t"
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t"
+ " movq %%mm0, %%mm2 \n\t"
+
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " movq %%mm0, %%mm1 \n\t"
+
+ " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
+ ".endr \n\t"
+
+ " movq %%mm7, %%mm0 \n\t"
+ " psrlq $32, %%mm7 \n\t"
+ " paddw %%mm0, %%mm7 \n\t"
+ " movq %%mm7, %%mm0 \n\t"
+ " psrlq $16, %%mm7 \n\t"
+ " paddw %%mm0, %%mm7 \n\t"
+ " movd %%mm7, %0 \n\t"
+ " andl $0xffff, %0 \n\t"
+ " emms \n\t"
+
+ : "=m" (diff),
+ "+r" (src1),
+ "+r" (src2)
+ : "r" (sstr1),
+ "r" (sstr2)
+ : "memory"
+ );
+ *dest = diff;
+}
+OIL_DEFINE_IMPL_FULL (sad8x8_u8_mmx, sad8x8_u8, OIL_IMPL_FLAG_MMX);
+
+static void
+sad8x8_u8_mmxext (uint32_t * dest, uint8_t * src1, int sstr1, uint8_t * src2,
+ int sstr2)
+{
+ uint32_t diff;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+
+ ".rept 7 \n\t"
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t"
+ " psadbw %%mm1, %%mm0 \n\t"
+ " add %3, %1 \n\t" /* Inc pointer into the new data */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
+ ".endr \n\t"
+
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t"
+ " psadbw %%mm1, %%mm0 \n\t"
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " movd %%mm7, %0 \n\t"
+ " emms \n\t"
+
+ : "=r" (diff),
+ "+r" (src1),
+ "+r" (src2)
+ : "r" (sstr1),
+ "r" (sstr2)
+ : "memory"
+ );
+ *dest = diff;
+}
+OIL_DEFINE_IMPL_FULL (sad8x8_u8_mmxext, sad8x8_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
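
Both implementations above compute the plain 8x8 sum of absolute differences; the MMX version accumulates 16-bit partial sums and folds them with psrlq/paddw, while the MMX-ext version lets psadbw do the per-row reduction. A scalar statement of the same computation, for reference only (the installed reference implementation for the sad8x8_u8 class lives elsewhere in liboil):

#include <stdint.h>
#include <stdlib.h>

/* Scalar 8x8 SAD with independent row strides (illustrative only). */
static uint32_t
sad8x8_u8_c (const uint8_t *src1, int sstr1, const uint8_t *src2, int sstr2)
{
  uint32_t sad = 0;
  int i, j;

  for (i = 0; i < 8; i++) {
    for (j = 0; j < 8; j++)
      sad += abs (src1[j] - src2[j]);
    src1 += sstr1;
    src2 += sstr2;
  }
  return sad;
}
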
diff --git a/liboil/i386/sad8x8avg_i386.c b/liboil/i386/sad8x8avg_i386.c
new file mode 100644
index 0000000..0bae5f8
--- /dev/null
+++ b/liboil/i386/sad8x8avg_i386.c
@@ -0,0 +1,136 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+
+OIL_DECLARE_CLASS (sad8x8_u8_avg);
+
+static void
+sad8x8_u8_avg_mmx (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, uint8_t *src3, int ss2)
+{
+ uint32_t diff;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+
+ " pcmpeqd %%mm5, %%mm5 \n\t" /* fefefefefefefefe in mm5 */
+ " paddb %%mm5, %%mm5 \n\t"
+
+ " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+ " mov $8, %%edi \n\t" /* 8 rows */
+ "1: \n\t"
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+
+ " movq (%2), %%mm2 \n\t"
+ " movq (%3), %%mm3 \n\t" /* take average of mm2 and mm3 */
+ " movq %%mm2, %%mm1 \n\t"
+ " pand %%mm3, %%mm1 \n\t"
+ " pxor %%mm2, %%mm3 \n\t"
+ " pand %%mm5, %%mm3 \n\t"
+ " psrlq $1, %%mm3 \n\t"
+ " paddb %%mm3, %%mm1 \n\t"
+
+ " movq %%mm0, %%mm2 \n\t"
+
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
+ " movq %%mm0, %%mm1 \n\t"
+
+ " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */
+ " add %4, %1 \n\t" /* Inc pointer into the new data */
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
+ " add %5, %2 \n\t" /* Inc pointer into ref data */
+ " add %5, %3 \n\t" /* Inc pointer into ref data */
+
+ " dec %%edi \n\t"
+ " jnz 1b \n\t"
+
+ " movq %%mm7, %%mm0 \n\t"
+ " psrlq $32, %%mm7 \n\t"
+ " paddw %%mm0, %%mm7 \n\t"
+ " movq %%mm7, %%mm0 \n\t"
+ " psrlq $16, %%mm7 \n\t"
+ " paddw %%mm0, %%mm7 \n\t"
+ " movd %%mm7, %0 \n\t"
+ " andl $0xffff, %0 \n\t"
+ " emms \n\t"
+
+ : "=m" (diff),
+ "+r" (src1),
+ "+r" (src2),
+ "+r" (src3)
+ : "m" (ss1),
+ "m" (ss2)
+ : "edi", "memory"
+ );
+ *dest = diff;
+}
+
+OIL_DEFINE_IMPL_FULL (sad8x8_u8_avg_mmx, sad8x8_u8_avg, OIL_IMPL_FLAG_MMX);
+
+static void
+sad8x8_u8_avg_mmxext (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, uint8_t *src3, int ss2)
+{
+ uint32_t diff;
+
+ __asm__ __volatile__ (
+ " .balign 16 \n\t"
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+ ".rept 8 \n\t"
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t"
+ " movq (%3), %%mm2 \n\t"
+ " pavgb %%mm2, %%mm1 \n\t"
+ " psadbw %%mm1, %%mm0 \n\t"
+
+ " add %4, %1 \n\t" /* Inc pointer into the new data */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " add %5, %2 \n\t" /* Inc pointer into ref data */
+ " add %5, %3 \n\t" /* Inc pointer into ref data */
+ ".endr \n\t"
+
+ " movd %%mm7, %0 \n\t"
+ " emms \n\t"
+ : "=m" (diff),
+ "+r" (src1),
+ "+r" (src2),
+ "+r" (src3)
+ : "m" (ss1),
+ "m" (ss2)
+ : "memory"
+ );
+ *dest = diff;
+}
+OIL_DEFINE_IMPL_FULL (sad8x8_u8_avg_mmxext, sad8x8_u8_avg, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
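
The two averaging strategies above are not bit-identical: the plain MMX path builds a truncating byte average from (a & b) + (((a ^ b) & 0xfe) >> 1), where the 0xfe mask keeps psrlq from shifting bits across byte lanes, while pavgb in the MMX-ext path rounds up and can therefore differ slightly from the truncating reference in liboil/ref/sad8x8avg.c. Scalar sketches of the two averages, for reference only:

#include <stdint.h>

/* Truncating average as computed by the MMX path: (a + b) / 2. */
static uint8_t
avg_trunc_u8 (uint8_t a, uint8_t b)
{
  return (uint8_t) ((a & b) + (((a ^ b) & 0xfe) >> 1));
}

/* Rounding average as computed by pavgb: (a + b + 1) / 2. */
static uint8_t
avg_round_u8 (uint8_t a, uint8_t b)
{
  return (uint8_t) ((a + b + 1) >> 1);
}
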
diff --git a/liboil/ref/Makefile.am b/liboil/ref/Makefile.am
new file mode 100644
index 0000000..3a4a788
--- /dev/null
+++ b/liboil/ref/Makefile.am
@@ -0,0 +1,26 @@
+
+if USE_ALT_OPT
+opt_libs = libref_opt1.la
+else
+opt_libs =
+endif
+
+noinst_LTLIBRARIES = libref.la $(opt_libs)
+
+c_sources = \
+ diff8x8.c \
+ error8x8.c \
+ recon8x8.c \
+ rowcolsad8x8.c \
+ sad8x8avg.c
+
+libref_la_SOURCES = \
+ $(c_sources)
+libref_la_LIBADD = \
+ $(opt_libs)
+libref_la_CFLAGS = $(LIBOIL_CFLAGS)
+
+libref_opt1_la_SOURCES = $(c_sources)
+libref_opt1_la_CFLAGS = $(LIBOIL_CFLAGS) \
+ $(LIBOIL_OPT_CFLAGS)
+
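
Adding another reference implementation to this directory only requires listing its source file in c_sources; for example, a hypothetical newclass.c (file name purely illustrative) would be appended like this:

c_sources = \
	diff8x8.c \
	error8x8.c \
	newclass.c \
	recon8x8.c \
	rowcolsad8x8.c \
	sad8x8avg.c
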
diff --git a/liboil/ref/diff8x8.c b/liboil/ref/diff8x8.c
new file mode 100644
index 0000000..ee518dc
--- /dev/null
+++ b/liboil/ref/diff8x8.c
@@ -0,0 +1,117 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+
+OIL_DEFINE_CLASS (diff8x8_s16_u8,
+ "int16_t *d_64, uint8_t *s1_8x8, int ss1, uint8_t *s2_8x8, int ss2");
+OIL_DEFINE_CLASS (diff8x8_const128_s16_u8,
+ "int16_t *d_64, uint8_t *s1_8x8, int ss1");
+OIL_DEFINE_CLASS (diff8x8_average_s16_u8,
+ "int16_t *d_64, uint8_t *s1_8x8, int ss1, uint8_t *s2_8x8, int ss2, uint8_t *s3_8x8, int ss3");
+
+static void
+diff8x8_s16_u8_ref (int16_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2)
+{
+ int i;
+
+ /* For each block row */
+ for (i=0;i<8;i++ ){
+ dest[0] = ((int16_t)src1[0]) - ((int16_t)src2[0]);
+ dest[1] = ((int16_t)src1[1]) - ((int16_t)src2[1]);
+ dest[2] = ((int16_t)src1[2]) - ((int16_t)src2[2]);
+ dest[3] = ((int16_t)src1[3]) - ((int16_t)src2[3]);
+ dest[4] = ((int16_t)src1[4]) - ((int16_t)src2[4]);
+ dest[5] = ((int16_t)src1[5]) - ((int16_t)src2[5]);
+ dest[6] = ((int16_t)src1[6]) - ((int16_t)src2[6]);
+ dest[7] = ((int16_t)src1[7]) - ((int16_t)src2[7]);
+
+ /* Start next row */
+ src1 += ss1;
+ src2 += ss2;
+ dest += 8;
+ }
+}
+OIL_DEFINE_IMPL_REF (diff8x8_s16_u8_ref, diff8x8_s16_u8);
+
+static void
+diff8x8_const128_s16_u8_ref (int16_t *dest, uint8_t *src1, int ss1)
+{
+ int i;
+
+ /* For each block row */
+ for (i=0;i<8;i++ ){
+ dest[0] = ((int16_t)src1[0]) - 128;
+ dest[1] = ((int16_t)src1[1]) - 128;
+ dest[2] = ((int16_t)src1[2]) - 128;
+ dest[3] = ((int16_t)src1[3]) - 128;
+ dest[4] = ((int16_t)src1[4]) - 128;
+ dest[5] = ((int16_t)src1[5]) - 128;
+ dest[6] = ((int16_t)src1[6]) - 128;
+ dest[7] = ((int16_t)src1[7]) - 128;
+
+ /* Start next row */
+ src1 += ss1;
+ dest += 8;
+ }
+}
+OIL_DEFINE_IMPL_REF (diff8x8_const128_s16_u8_ref, diff8x8_const128_s16_u8);
+
+static void
+diff8x8_average_s16_u8_ref (int16_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2, uint8_t *src3, int ss3)
+{
+ int i;
+
+ /* For each block row */
+ for (i=0;i<8;i++ ){
+ dest[0] = ((int16_t)src1[0]) - ((((int16_t)src2[0]) + ((int16_t)src3[0])) / 2);
+ dest[1] = ((int16_t)src1[1]) - ((((int16_t)src2[1]) + ((int16_t)src3[1])) / 2);
+ dest[2] = ((int16_t)src1[2]) - ((((int16_t)src2[2]) + ((int16_t)src3[2])) / 2);
+ dest[3] = ((int16_t)src1[3]) - ((((int16_t)src2[3]) + ((int16_t)src3[3])) / 2);
+ dest[4] = ((int16_t)src1[4]) - ((((int16_t)src2[4]) + ((int16_t)src3[4])) / 2);
+ dest[5] = ((int16_t)src1[5]) - ((((int16_t)src2[5]) + ((int16_t)src3[5])) / 2);
+ dest[6] = ((int16_t)src1[6]) - ((((int16_t)src2[6]) + ((int16_t)src3[6])) / 2);
+ dest[7] = ((int16_t)src1[7]) - ((((int16_t)src2[7]) + ((int16_t)src3[7])) / 2);
+
+ /* Start next row */
+ src1 += ss1;
+ src2 += ss2;
+ src3 += ss3;
+ dest += 8;
+ }
+}
+OIL_DEFINE_IMPL_REF (diff8x8_average_s16_u8_ref, diff8x8_average_s16_u8);
+
+
+
+
+
+
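
The three classes above differ only in what they subtract from the source block: the second source, the constant 128 (for intra blocks), or the truncating average of two predictors. A single generic scalar sketch, for reference only and not part of the patch:

#include <stdint.h>
#include <stddef.h>

/* Generic restatement of the diff8x8 variants: s2/s3 may be NULL to select
 * the const128 or two-source forms (illustrative only). */
static void
diff8x8_generic (int16_t *dest, const uint8_t *s1, int ss1,
    const uint8_t *s2, int ss2, const uint8_t *s3, int ss3)
{
  int i, j;

  for (i = 0; i < 8; i++) {
    for (j = 0; j < 8; j++) {
      int ref = s3 ? (s2[j] + s3[j]) / 2 : (s2 ? s2[j] : 128);
      dest[8 * i + j] = (int16_t) (s1[j] - ref);
    }
    s1 += ss1;
    if (s2) s2 += ss2;
    if (s3) s3 += ss3;
  }
}
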
diff --git a/liboil/ref/error8x8.c b/liboil/ref/error8x8.c
new file mode 100644
index 0000000..825d335
--- /dev/null
+++ b/liboil/ref/error8x8.c
@@ -0,0 +1,181 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+
+
+OIL_DEFINE_CLASS (err_intra8x8_u8,
+ "uint32_t *d_1, uint8_t *s1_8x8, int ss1");
+OIL_DEFINE_CLASS (err_inter8x8_u8,
+ "uint32_t *d_1, uint8_t *s1_8x8, int ss1, uint8_t *s2_8x8, int ss2");
+OIL_DEFINE_CLASS (err_inter8x8_u8_avg,
+ "uint32_t *d_1, uint8_t *s1_8x8, int ss1, uint8_t *s2_8x8, uint8_t *s3_8x8, int ss2");
+
+#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
+#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
+
+static void
+err_intra8x8_u8_ref (uint32_t *dest, uint8_t *src1, int ss1)
+{
+ uint32_t i;
+ uint32_t xsum=0;
+ uint32_t xxsum=0;
+
+ for (i=8; i; i--) {
+    /* Accumulate the sum and sum of squares for this row. */
+ xsum += src1[0];
+ xxsum += src1[0]*src1[0];
+ xsum += src1[1];
+ xxsum += src1[1]*src1[1];
+ xsum += src1[2];
+ xxsum += src1[2]*src1[2];
+ xsum += src1[3];
+ xxsum += src1[3]*src1[3];
+ xsum += src1[4];
+ xxsum += src1[4]*src1[4];
+ xsum += src1[5];
+ xxsum += src1[5]*src1[5];
+ xsum += src1[6];
+ xxsum += src1[6]*src1[6];
+ xsum += src1[7];
+ xxsum += src1[7]*src1[7];
+
+ /* Step to next row of block. */
+ src1 += ss1;
+ }
+  /* Compute 64^2 times the population variance as the mismatch metric. */
+  *dest = (xxsum << 6) - xsum * xsum;
+}
+OIL_DEFINE_IMPL_REF (err_intra8x8_u8_ref, err_intra8x8_u8);
+
+static void
+err_inter8x8_u8_ref (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2)
+{
+ uint32_t i;
+ uint32_t xsum=0;
+ uint32_t xxsum=0;
+ int32_t diff;
+
+ for (i=8; i; i--) {
+ diff = DSP_OP_DIFF (src1[0], src2[0]);
+ xsum += diff;
+ xxsum += diff*diff;
+
+ diff = DSP_OP_DIFF (src1[1], src2[1]);
+ xsum += diff;
+ xxsum += diff*diff;
+
+ diff = DSP_OP_DIFF (src1[2], src2[2]);
+ xsum += diff;
+ xxsum += diff*diff;
+
+ diff = DSP_OP_DIFF (src1[3], src2[3]);
+ xsum += diff;
+ xxsum += diff*diff;
+
+ diff = DSP_OP_DIFF (src1[4], src2[4]);
+ xsum += diff;
+ xxsum += diff*diff;
+
+ diff = DSP_OP_DIFF (src1[5], src2[5]);
+ xsum += diff;
+ xxsum += diff*diff;
+
+ diff = DSP_OP_DIFF (src1[6], src2[6]);
+ xsum += diff;
+ xxsum += diff*diff;
+
+ diff = DSP_OP_DIFF (src1[7], src2[7]);
+ xsum += diff;
+ xxsum += diff*diff;
+
+ /* Step to next row of block. */
+ src1 += ss1;
+ src2 += ss2;
+ }
+
+  /* Compute and return 64^2 times the population variance as the mismatch metric. */
+  *dest = (xxsum << 6) - xsum * xsum;
+}
+OIL_DEFINE_IMPL_REF (err_inter8x8_u8_ref, err_inter8x8_u8);
+
+static void
+err_inter8x8_u8_avg_ref (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, uint8_t *src3, int ss2)
+{
+ uint32_t i;
+ uint32_t xsum=0;
+ uint32_t xxsum=0;
+ int32_t diff;
+
+ for (i=8; i; i--) {
+ diff = DSP_OP_DIFF(src1[0], DSP_OP_AVG (src2[0], src3[0]));
+ xsum += diff;
+ xxsum += diff*diff;
+
+ diff = DSP_OP_DIFF(src1[1], DSP_OP_AVG (src2[1], src3[1]));
+ xsum += diff;
+ xxsum += diff*diff;
+
+ diff = DSP_OP_DIFF(src1[2], DSP_OP_AVG (src2[2], src3[2]));
+ xsum += diff;
+ xxsum += diff*diff;
+
+ diff = DSP_OP_DIFF(src1[3], DSP_OP_AVG (src2[3], src3[3]));
+ xsum += diff;
+ xxsum += diff*diff;
+
+ diff = DSP_OP_DIFF(src1[4], DSP_OP_AVG (src2[4], src3[4]));
+ xsum += diff;
+ xxsum += diff*diff;
+
+ diff = DSP_OP_DIFF(src1[5], DSP_OP_AVG (src2[5], src3[5]));
+ xsum += diff;
+ xxsum += diff*diff;
+
+ diff = DSP_OP_DIFF(src1[6], DSP_OP_AVG (src2[6], src3[6]));
+ xsum += diff;
+ xxsum += diff*diff;
+
+ diff = DSP_OP_DIFF(src1[7], DSP_OP_AVG (src2[7], src3[7]));
+ xsum += diff;
+ xxsum += diff*diff;
+
+ /* Step to next row of block. */
+ src1 += ss1;
+ src2 += ss2;
+ src3 += ss2;
+ }
+
+  /* Compute and return 64^2 times the population variance as the mismatch metric. */
+  *dest = (xxsum << 6) - xsum * xsum;
+}
+
+OIL_DEFINE_IMPL_REF (err_inter8x8_u8_avg_ref, err_inter8x8_u8_avg);
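
Note that these error classes do not return the raw variance: with the 64 samples (or differences) x_i, the expression (xxsum << 6) - xsum * xsum equals 64 * sum(x_i^2) - (sum x_i)^2, i.e. 64^2 times the population variance of the block. A scalar restatement of the metric, for reference only:

#include <stdint.h>

/* 64 * sum(x^2) - (sum x)^2 == 64^2 * population variance (illustrative). */
static uint32_t
block_variance_metric (const uint8_t *block /* 64 samples */)
{
  uint32_t xsum = 0, xxsum = 0;
  int i;

  for (i = 0; i < 64; i++) {
    xsum += block[i];
    xxsum += (uint32_t) block[i] * block[i];
  }
  return (xxsum << 6) - xsum * xsum;
}
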
diff --git a/liboil/ref/recon8x8.c b/liboil/ref/recon8x8.c
new file mode 100644
index 0000000..0455198
--- /dev/null
+++ b/liboil/ref/recon8x8.c
@@ -0,0 +1,112 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/simdpack/simdpack.h>
+#include <liboil/liboilcolorspace.h>
+
+
+OIL_DEFINE_CLASS (recon8x8_intra,
+ "uint8_t *d_8x8, int ds, int16_t *s_8x8");
+OIL_DEFINE_CLASS (recon8x8_inter,
+ "uint8_t *d_8x8, int ds, uint8_t *s1_8x8, int ss1, int16_t *s2_8x8");
+OIL_DEFINE_CLASS (recon8x8_inter2,
+ "uint8_t *d_8x8, int ds, uint8_t *s1_8x8, int ss1, uint8_t *s2_8x8, int ss2, int16_t *s3_8x8");
+
+
+static void
+recon8x8_intra_ref (uint8_t *dest, int ds, int16_t *change)
+{
+ uint32_t i;
+
+ for (i = 8; i; i--){
+ dest[0] = oil_clamp_255(change[0] + 128);
+ dest[1] = oil_clamp_255(change[1] + 128);
+ dest[2] = oil_clamp_255(change[2] + 128);
+ dest[3] = oil_clamp_255(change[3] + 128);
+ dest[4] = oil_clamp_255(change[4] + 128);
+ dest[5] = oil_clamp_255(change[5] + 128);
+ dest[6] = oil_clamp_255(change[6] + 128);
+ dest[7] = oil_clamp_255(change[7] + 128);
+
+ dest += ds;
+ change += 8;
+ }
+}
+
+OIL_DEFINE_IMPL_REF (recon8x8_intra_ref, recon8x8_intra);
+
+static void
+recon8x8_inter_ref (uint8_t *dest, int ds, uint8_t *src, int ss, int16_t *change, int dss)
+{
+ uint32_t i;
+
+ for (i = 8; i; i--){
+ dest[0] = oil_clamp_255(src[0] + change[0]);
+ dest[1] = oil_clamp_255(src[1] + change[1]);
+ dest[2] = oil_clamp_255(src[2] + change[2]);
+ dest[3] = oil_clamp_255(src[3] + change[3]);
+ dest[4] = oil_clamp_255(src[4] + change[4]);
+ dest[5] = oil_clamp_255(src[5] + change[5]);
+ dest[6] = oil_clamp_255(src[6] + change[6]);
+ dest[7] = oil_clamp_255(src[7] + change[7]);
+
+ change += 8;
+ dest += ds;
+ src += ss;
+ }
+}
+
+OIL_DEFINE_IMPL_REF (recon8x8_inter_ref, recon8x8_inter);
+
+static void
+recon8x8_inter2_ref (uint8_t *dest, int ds, uint8_t *s1, int ss1, uint8_t *s2, int ss2, int16_t *change)
+{
+ uint32_t i;
+
+ for (i = 8; i; i--){
+ dest[0] = oil_clamp_255((((int16_t)s1[0] + (int16_t)s2[0]) >> 1) + change[0]);
+ dest[1] = oil_clamp_255((((int16_t)s1[1] + (int16_t)s2[1]) >> 1) + change[1]);
+ dest[2] = oil_clamp_255((((int16_t)s1[2] + (int16_t)s2[2]) >> 1) + change[2]);
+ dest[3] = oil_clamp_255((((int16_t)s1[3] + (int16_t)s2[3]) >> 1) + change[3]);
+ dest[4] = oil_clamp_255((((int16_t)s1[4] + (int16_t)s2[4]) >> 1) + change[4]);
+ dest[5] = oil_clamp_255((((int16_t)s1[5] + (int16_t)s2[5]) >> 1) + change[5]);
+ dest[6] = oil_clamp_255((((int16_t)s1[6] + (int16_t)s2[6]) >> 1) + change[6]);
+ dest[7] = oil_clamp_255((((int16_t)s1[7] + (int16_t)s2[7]) >> 1) + change[7]);
+
+ change += 8;
+ dest += ds;
+ s1 += ss1;
+ s2 += ss2;
+ }
+}
+
+OIL_DEFINE_IMPL_REF (recon8x8_inter2_ref, recon8x8_inter2);
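
Assuming liboil's usual convention that each class foo becomes callable as oil_foo() once oil_init() has selected an implementation, a caller would reconstruct an intra block roughly as follows. This is a hedged sketch, not code from this patch, and the prototype is inferred from the class strings above.

#include <stdint.h>
#include <liboil/liboil.h>

/* Hedged usage sketch: oil_recon8x8_intra() is assumed to be the function
 * liboil generates for the recon8x8_intra class defined above. */
static void
reconstruct_intra_block (uint8_t *plane, int stride, int16_t *residual)
{
  oil_init ();                              /* normally done once at startup */
  oil_recon8x8_intra (plane, stride, residual);
}
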
diff --git a/liboil/ref/rowcolsad8x8.c b/liboil/ref/rowcolsad8x8.c
new file mode 100644
index 0000000..ecddded
--- /dev/null
+++ b/liboil/ref/rowcolsad8x8.c
@@ -0,0 +1,110 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/simdpack/simdpack.h>
+#include <math.h>
+
+#define ABS(x) ((x)>0 ? (x) : -(x))
+#define DSP_OP_ABS_DIFF(a,b) ABS((((int)(a)) - ((int)(b))))
+
+OIL_DEFINE_CLASS (rowsad8x8_u8,
+ "uint32_t *d_1, uint8_t *s1_8x8, uint8_t *s2_8x8");
+OIL_DEFINE_CLASS (colsad8x8_u8,
+ "uint32_t *d_1, uint8_t *s1_8x8, int ss1, uint8_t *s2_8x8, int ss2");
+
+static void
+rowsad8x8_u8_ref (uint32_t *dest, uint8_t *src1, uint8_t *src2)
+{
+ uint32_t SadValue;
+ uint32_t SadValue1;
+
+ SadValue = DSP_OP_ABS_DIFF (src1[0], src2[0]) +
+ DSP_OP_ABS_DIFF (src1[1], src2[1]) +
+ DSP_OP_ABS_DIFF (src1[2], src2[2]) +
+ DSP_OP_ABS_DIFF (src1[3], src2[3]);
+
+ SadValue1 = DSP_OP_ABS_DIFF (src1[4], src2[4]) +
+ DSP_OP_ABS_DIFF (src1[5], src2[5]) +
+ DSP_OP_ABS_DIFF (src1[6], src2[6]) +
+ DSP_OP_ABS_DIFF (src1[7], src2[7]);
+
+ *dest = (SadValue > SadValue1) ? SadValue : SadValue1;
+}
+OIL_DEFINE_IMPL_REF (rowsad8x8_u8_ref, rowsad8x8_u8);
+
+static void
+colsad8x8_u8_ref (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2)
+{
+ uint32_t SadValue[8] = {0,0,0,0,0,0,0,0};
+ uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0};
+ uint32_t MaxSad = 0;
+ uint32_t i;
+
+ for ( i = 0; i < 4; i++ ){
+ SadValue[0] += ABS(src1[0] - src2[0]);
+ SadValue[1] += ABS(src1[1] - src2[1]);
+ SadValue[2] += ABS(src1[2] - src2[2]);
+ SadValue[3] += ABS(src1[3] - src2[3]);
+ SadValue[4] += ABS(src1[4] - src2[4]);
+ SadValue[5] += ABS(src1[5] - src2[5]);
+ SadValue[6] += ABS(src1[6] - src2[6]);
+ SadValue[7] += ABS(src1[7] - src2[7]);
+
+ src1 += ss1;
+ src2 += ss2;
+ }
+
+ for ( i = 0; i < 4; i++ ){
+ SadValue2[0] += ABS(src1[0] - src2[0]);
+ SadValue2[1] += ABS(src1[1] - src2[1]);
+ SadValue2[2] += ABS(src1[2] - src2[2]);
+ SadValue2[3] += ABS(src1[3] - src2[3]);
+ SadValue2[4] += ABS(src1[4] - src2[4]);
+ SadValue2[5] += ABS(src1[5] - src2[5]);
+ SadValue2[6] += ABS(src1[6] - src2[6]);
+ SadValue2[7] += ABS(src1[7] - src2[7]);
+
+ src1 += ss1;
+ src2 += ss2;
+ }
+
+ for ( i = 0; i < 8; i++ ){
+ if ( SadValue[i] > MaxSad )
+ MaxSad = SadValue[i];
+ if ( SadValue2[i] > MaxSad )
+ MaxSad = SadValue2[i];
+ }
+
+ *dest = MaxSad;
+}
+OIL_DEFINE_IMPL_REF (colsad8x8_u8_ref, colsad8x8_u8);
+
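
Despite its name, rowsad8x8_u8 examines only a single 8-pixel row and returns the larger of its two 4-pixel half SADs, while colsad8x8_u8 returns the largest per-column SAD taken separately over the top and bottom 4-row halves of the block; this matches both the reference code above and the MMX versions. A compact scalar restatement, for reference only:

#include <stdint.h>
#include <stdlib.h>

/* rowsad: max of the two 4-pixel half SADs of one row (illustrative). */
static uint32_t
rowsad_c (const uint8_t *a, const uint8_t *b)
{
  uint32_t left = 0, right = 0;
  int j;

  for (j = 0; j < 4; j++) {
    left += abs (a[j] - b[j]);
    right += abs (a[j + 4] - b[j + 4]);
  }
  return (left > right) ? left : right;
}

/* colsad: largest 4-row column SAD over the top and bottom block halves. */
static uint32_t
colsad_c (const uint8_t *a, int sa, const uint8_t *b, int sb)
{
  uint32_t top[8] = { 0 }, bottom[8] = { 0 }, max = 0;
  int i, j;

  for (i = 0; i < 4; i++, a += sa, b += sb)
    for (j = 0; j < 8; j++)
      top[j] += abs (a[j] - b[j]);
  for (i = 0; i < 4; i++, a += sa, b += sb)
    for (j = 0; j < 8; j++)
      bottom[j] += abs (a[j] - b[j]);
  for (j = 0; j < 8; j++) {
    if (top[j] > max) max = top[j];
    if (bottom[j] > max) max = bottom[j];
  }
  return max;
}
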
diff --git a/liboil/ref/sad8x8avg.c b/liboil/ref/sad8x8avg.c
new file mode 100644
index 0000000..846547b
--- /dev/null
+++ b/liboil/ref/sad8x8avg.c
@@ -0,0 +1,66 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <math.h>
+
+#define ABS(x) ((x)>0 ? (x) : -(x))
+
+OIL_DEFINE_CLASS (sad8x8_u8_avg,
+ "uint32_t *d_1, uint8_t *s1_8x8, int ss1, uint8_t *s2_8x8, uint8_t *s3_8x8, int ss2");
+
+static void
+sad8x8_u8_avg_ref (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, uint8_t *src3, int ss2)
+{
+ int i;
+ uint32_t diff = 0;
+
+ for (i=0; i<8;i++){
+ diff += ABS(((int)src1[0]) - (((int)src2[0] + (int)src3[0]) / 2));
+ diff += ABS(((int)src1[1]) - (((int)src2[1] + (int)src3[1]) / 2));
+ diff += ABS(((int)src1[2]) - (((int)src2[2] + (int)src3[2]) / 2));
+ diff += ABS(((int)src1[3]) - (((int)src2[3] + (int)src3[3]) / 2));
+ diff += ABS(((int)src1[4]) - (((int)src2[4] + (int)src3[4]) / 2));
+ diff += ABS(((int)src1[5]) - (((int)src2[5] + (int)src3[5]) / 2));
+ diff += ABS(((int)src1[6]) - (((int)src2[6] + (int)src3[6]) / 2));
+ diff += ABS(((int)src1[7]) - (((int)src2[7] + (int)src3[7]) / 2));
+
+ /* Step to next row of block. */
+ src1 += ss1;
+ src2 += ss2;
+ src3 += ss2;
+ }
+ *dest = diff;
+}
+
+OIL_DEFINE_IMPL_REF (sad8x8_u8_avg_ref, sad8x8_u8_avg);
+
+