summaryrefslogtreecommitdiff
path: root/liboil/i386_amd64
diff options
context:
space:
mode:
authorDavid Schleef <ds@schleef.org>2007-06-13 23:40:02 +0000
committerDavid Schleef <ds@schleef.org>2007-06-13 23:40:02 +0000
commitaea34d1babca1373a1a32f6d7162d52c41193604 (patch)
tree7422e0cb443b3124145a7258868bcfa2aa276a2d /liboil/i386_amd64
parenta65707e581b676152474147439579750c11d459c (diff)
downloadliboil-aea34d1babca1373a1a32f6d7162d52c41193604.tar.gz
* liboil/amd64/wavelet.c:
* liboil/i386/Makefile.am: * liboil/i386/convert_i386.c: * liboil/i386/math.c: * liboil/i386/sad8x8_i386.c: * liboil/i386_amd64/Makefile.am: * liboil/i386_amd64/convert.c: * liboil/i386_amd64/copy.c: * liboil/i386_amd64/math.c: * liboil/i386_amd64/sad8x8.c: Convert a bunch of stuff to dual-arch.
Diffstat (limited to 'liboil/i386_amd64')
-rw-r--r--liboil/i386_amd64/Makefile.am4
-rw-r--r--liboil/i386_amd64/convert.c137
-rw-r--r--liboil/i386_amd64/copy.c435
-rw-r--r--liboil/i386_amd64/math.c134
-rw-r--r--liboil/i386_amd64/sad8x8.c490
5 files changed, 1200 insertions, 0 deletions
diff --git a/liboil/i386_amd64/Makefile.am b/liboil/i386_amd64/Makefile.am
index f5412cc..89dc2f6 100644
--- a/liboil/i386_amd64/Makefile.am
+++ b/liboil/i386_amd64/Makefile.am
@@ -3,9 +3,13 @@ noinst_LTLIBRARIES = libi386_amd64.la
libi386_amd64_la_SOURCES = \
clamp.c \
+ convert.c \
+ copy.c \
idct8x8_i386.c \
+ math.c \
mt19937.c \
resample.c \
+ sad8x8.c \
sum.c \
swab.c
diff --git a/liboil/i386_amd64/convert.c b/liboil/i386_amd64/convert.c
new file mode 100644
index 0000000..cf8e126
--- /dev/null
+++ b/liboil/i386_amd64/convert.c
@@ -0,0 +1,137 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2006 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#include <liboil/liboilfunction.h>
+#include <liboil/liboilclasses.h>
+
+/* convert_u8_s16: saturate each s16 sample to [0,255] and store as u8.
+ * MMX variant: packuswb does the unsigned-saturating narrowing, eight
+ * samples (16 bytes of s16 input) per loop iteration. */
+static void
+convert_u8_s16_mmx (uint8_t * dest, const int16_t * src, int n)
+{
+ /* clamp/copy leading samples in C until n is a multiple of 8 */
+ while(n&7) {
+ int x;
+ x = src[0];
+ if (x<0) x = 0;
+ if (x>255) x = 255;
+ dest[0] = x;
+ src++;
+ dest++;
+ n--;
+ }
+ /* bug fix: the asm loop below runs its body once before testing the
+ * counter ("decl; jg"), so entering it with n==0 read 16 bytes past
+ * src and wrote 8 bytes past dest. Return early, as the _2 variant
+ * already does. */
+ if (n==0) return;
+
+ n>>=3;
+ __asm__ __volatile__ ("\n"
+ "1:\n"
+ " movq 0(%1), %%mm0\n"
+ " packuswb 8(%1), %%mm0\n"
+ " movq %%mm0, 0(%0)\n"
+ " add $16, %1\n"
+ " add $8, %0\n"
+ " decl %2\n"
+ " jg 1b\n"
+ " emms\n"
+ : "+r" (dest), "+r" (src), "+r" (n));
+}
+OIL_DEFINE_IMPL_FULL (convert_u8_s16_mmx, convert_u8_s16, OIL_IMPL_FLAG_MMX);
+
+/* convert_u8_s16, unrolled-by-2 MMX variant: same saturating s16->u8
+ * narrowing as convert_u8_s16_mmx, but the main loop emits 16 output
+ * bytes per iteration; one single-width step first evens out an odd
+ * 8-sample group. */
+static void
+convert_u8_s16_mmx_2 (uint8_t * dest, const int16_t * src, int n)
+{
+ /* clamp/copy leading samples in C until n is a multiple of 8 */
+ while(n&7) {
+ int x;
+ x = src[0];
+ if (x<0) x = 0;
+ if (x>255) x = 255;
+ dest[0] = x;
+ src++;
+ dest++;
+ n--;
+ }
+ if (n==0) return;
+
+ n>>=3;
+ /* peel one 8-sample group if the group count is odd */
+ if (n&1) {
+ __asm__ __volatile__ ("\n"
+ " movq 0(%1), %%mm0\n"
+ " packuswb 8(%1), %%mm0\n"
+ " movq %%mm0, 0(%0)\n"
+ " add $16, %1\n"
+ " add $8, %0\n"
+ : "+r" (dest), "+r" (src), "+r" (n));
+ }
+
+ n >>= 1;
+ if (n > 0) {
+ __asm__ __volatile__ ("\n"
+ "2:\n"
+ " movq 0(%1), %%mm0\n"
+ " packuswb 8(%1), %%mm0\n"
+ " movq %%mm0, 0(%0)\n"
+ " movq 16(%1), %%mm0\n"
+ " packuswb 24(%1), %%mm0\n"
+ " movq %%mm0, 8(%0)\n"
+ " add $32, %1\n"
+ " add $16, %0\n"
+ " decl %2\n"
+ " jg 2b\n"
+ : "+r" (dest), "+r" (src), "+r" (n));
+ }
+ __asm__ __volatile__ ("emms\n");
+}
+OIL_DEFINE_IMPL_FULL (convert_u8_s16_mmx_2, convert_u8_s16, OIL_IMPL_FLAG_MMX);
+
+/* convert_s16_u8: widen each u8 sample to s16.  MMX variant: two movd
+ * loads unpacked against mm0==0, eight samples per loop iteration. */
+static void
+convert_s16_u8_mmx (int16_t * dest, const uint8_t * src, int n)
+{
+ /* copy leading samples in C until n is a multiple of 8 */
+ while(n&7) {
+ dest[0] = src[0];
+ src++;
+ dest++;
+ n--;
+ }
+ /* bug fix: the asm loop below runs its body once before testing the
+ * counter ("decl; jg"), so entering it with n==0 read 8 bytes past
+ * src and wrote 16 bytes past dest. */
+ if (n==0) return;
+
+ n>>=3;
+ __asm__ __volatile__ ("\n"
+ " pxor %%mm0, %%mm0\n"
+ "1:\n"
+ " movd 0(%1), %%mm1\n"
+ " punpcklbw %%mm0, %%mm1\n"
+ " movq %%mm1, 0(%0)\n"
+ " movd 4(%1), %%mm2\n"
+ " punpcklbw %%mm0, %%mm2\n"
+ " movq %%mm2, 8(%0)\n"
+ " add $8, %1\n"
+ " add $16, %0\n"
+ " decl %2\n"
+ " jg 1b\n"
+ " emms\n"
+ : "+r" (dest), "+r" (src), "+r" (n));
+}
+OIL_DEFINE_IMPL_FULL (convert_s16_u8_mmx, convert_s16_u8, OIL_IMPL_FLAG_MMX);
+
diff --git a/liboil/i386_amd64/copy.c b/liboil/i386_amd64/copy.c
new file mode 100644
index 0000000..9c7f55a
--- /dev/null
+++ b/liboil/i386_amd64/copy.c
@@ -0,0 +1,435 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/liboilclasses.h>
+#include <stddef.h>
+
+
+/* copy_u8: memcpy-style byte copy, 8 bytes per MMX loop iteration. */
+static void
+copy_u8_i386_mmx (uint8_t *dest, uint8_t *src, int n)
+{
+ ptrdiff_t i = 0;
+ ptrdiff_t blocks;
+
+ if (n&4) {
+ *(uint32_t *)dest = *(uint32_t *)src;
+ dest += 4;
+ src += 4;
+ n-=4;
+ }
+ while(n&0x7) {
+ *dest++ = *src++;
+ n--;
+ }
+ /* bug fix: the loop counter used to be an input-only operand
+ * ("c" (n>>3)) yet the asm decrements it; GCC extended asm forbids
+ * modifying input operands.  Pass it as a read-write operand.  dest
+ * and src are only read (indexing goes through i), so they move to
+ * the input list. */
+ blocks = n>>3;
+ if (blocks) asm volatile (
+ "1:\n"
+ " movq (%[src],%[i]), %%mm0\n"
+ " movq %%mm0, (%[dest],%[i])\n"
+ " add $8, %[i]\n"
+ " decl %[n]\n"
+ " jne 1b\n"
+ " emms\n"
+ : [i] "+r" (i),
+ [n] "+c" (blocks)
+ : [dest] "r" (dest),
+ [src] "r" (src));
+}
+OIL_DEFINE_IMPL_FULL (copy_u8_i386_mmx, copy_u8, OIL_IMPL_FLAG_MMX);
+
+/* copy_u8, unrolled: 16 bytes per MMX loop iteration. */
+static void
+copy_u8_mmx2 (uint8_t *dest, uint8_t *src, int n)
+{
+ ptrdiff_t i = 0;
+ ptrdiff_t blocks;
+
+ while (n&0xc) {
+ *(uint32_t *)dest = *(uint32_t *)src;
+ dest += 4;
+ src += 4;
+ n-=4;
+ }
+ while(n&0xf) {
+ *dest++ = *src++;
+ n--;
+ }
+ /* bug fix: the loop counter used to be an input-only operand
+ * ("c" (n>>4)) yet the asm decrements it; GCC extended asm forbids
+ * modifying input operands.  Pass it as a read-write operand.  dest
+ * and src are only read (indexing goes through i), so they move to
+ * the input list. */
+ blocks = n>>4;
+ if (blocks) asm volatile (
+ "1:\n"
+ " movq (%[src],%[i]), %%mm0\n"
+ " movq %%mm0, (%[dest],%[i])\n"
+ " movq 8(%[src],%[i]), %%mm1\n"
+ " movq %%mm1, 8(%[dest],%[i])\n"
+ " add $16, %[i]\n"
+ " decl %[n]\n"
+ " jne 1b\n"
+ " emms\n"
+ : [i] "+r" (i),
+ [n] "+c" (blocks)
+ : [dest] "r" (dest),
+ [src] "r" (src));
+}
+OIL_DEFINE_IMPL_FULL (copy_u8_mmx2, copy_u8, OIL_IMPL_FLAG_MMX);
+
+#if 0
+/* NOTE(review): disabled non-temporal (movntq) copy variant.  It uses
+ * 32-bit %eax indexing and a 32-bit counter in %ecx with 64-bit pointer
+ * bases — presumably i386-only; confirm and port before re-enabling in
+ * this dual-arch directory. */
+static void
+copy_u8_mmx3 (uint8_t *dest, uint8_t *src, int n)
+{
+ /* make sure destination is cache-line aligned for output */
+ if (n < 64) {
+ while (n>0) {
+ *dest++ = *src++;
+ n--;
+ }
+ return;
+ }
+ while (((unsigned long)dest) & 0x3) {
+ *dest++ = *src++;
+ n--;
+ }
+ while (((unsigned long)dest) & 0x3f) {
+ *(uint32_t *)dest = *(uint32_t *)src;
+ dest += 4;
+ src += 4;
+ n-=4;
+ }
+ if (n > 64) asm volatile (
+ " mov $0, %%eax\n"
+ "1:\n"
+ //" prefetchnta 128(%1,%%eax)\n"
+ " movq (%1,%%eax), %%mm0\n"
+ " movq 8(%1,%%eax), %%mm1\n"
+ " movq 16(%1,%%eax), %%mm2\n"
+ " movq 24(%1,%%eax), %%mm3\n"
+ " movq 32(%1,%%eax), %%mm4\n"
+ " movq 40(%1,%%eax), %%mm5\n"
+ " movq 48(%1,%%eax), %%mm6\n"
+ " movq 56(%1,%%eax), %%mm7\n"
+ " movntq %%mm0, (%0,%%eax)\n"
+ " movntq %%mm1, 8(%0,%%eax)\n"
+ " movntq %%mm2, 16(%0,%%eax)\n"
+ " movntq %%mm3, 24(%0,%%eax)\n"
+ " movntq %%mm4, 32(%0,%%eax)\n"
+ " movntq %%mm5, 40(%0,%%eax)\n"
+ " movntq %%mm6, 48(%0,%%eax)\n"
+ " movntq %%mm7, 56(%0,%%eax)\n"
+ " add $64, %%eax\n"
+ " decl %%ecx\n"
+ " jne 1b\n"
+ " sfence\n"
+ " emms\n"
+ : "+r" (dest), "+r" (src)
+ : "c" (n>>6)
+ : "eax");
+
+ dest += n&(~(0x3f));
+ src += n&(~(0x3f));
+ n &= 0x3f;
+ while (n > 3) {
+ *(uint32_t *)dest = *(uint32_t *)src;
+ dest += 4;
+ src += 4;
+ n-=4;
+ }
+ while (n > 0) {
+ *dest++ = *src++;
+ n--;
+ }
+}
+OIL_DEFINE_IMPL_FULL (copy_u8_mmx3, copy_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+#endif
+
+#if 0
+/* NOTE(review): disabled non-temporal (movntq) copy variant, same
+ * shape as copy_u8_mmx3 but with a 32-byte inner loop.  Shares the
+ * 32-bit %eax/%ecx addressing caveat — confirm before re-enabling. */
+static void
+copy_u8_mmx4 (uint8_t *dest, uint8_t *src, int n)
+{
+ /* make sure destination is cache-line aligned for output */
+ if (n < 32) {
+ while (n>0) {
+ *dest++ = *src++;
+ n--;
+ }
+ return;
+ }
+ while (((unsigned long)dest) & 0x3) {
+ *dest++ = *src++;
+ n--;
+ }
+ while (((unsigned long)dest) & 0x1f) {
+ *(uint32_t *)dest = *(uint32_t *)src;
+ dest += 4;
+ src += 4;
+ n-=4;
+ }
+ if (n > 32) asm volatile (
+ " mov $0, %%eax\n"
+ "1:\n"
+ //" prefetchnta 128(%1,%%eax)\n"
+ " movq (%1,%%eax), %%mm0\n"
+ " movq 8(%1,%%eax), %%mm1\n"
+ " movq 16(%1,%%eax), %%mm2\n"
+ " movq 24(%1,%%eax), %%mm3\n"
+ " movntq %%mm0, (%0,%%eax)\n"
+ " movntq %%mm1, 8(%0,%%eax)\n"
+ " movntq %%mm2, 16(%0,%%eax)\n"
+ " movntq %%mm3, 24(%0,%%eax)\n"
+ " add $32, %%eax\n"
+ " decl %%ecx\n"
+ " jne 1b\n"
+ " sfence\n"
+ " emms\n"
+ : "+r" (dest), "+r" (src)
+ : "c" (n>>5)
+ : "eax");
+
+ dest += n&(~(0x1f));
+ src += n&(~(0x1f));
+ n &= 0x1f;
+ while (n > 3) {
+ *(uint32_t *)dest = *(uint32_t *)src;
+ dest += 4;
+ src += 4;
+ n-=4;
+ }
+ while (n > 0) {
+ *dest++ = *src++;
+ n--;
+ }
+}
+OIL_DEFINE_IMPL_FULL (copy_u8_mmx4, copy_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+#endif
+
+/* copy_u8, unrolled: like copy_u8_mmx2 but both loads are issued before
+ * both stores. */
+static void
+copy_u8_mmx5 (uint8_t *dest, uint8_t *src, int n)
+{
+ ptrdiff_t i = 0;
+ ptrdiff_t blocks;
+
+ while (n&0xc) {
+ *(uint32_t *)dest = *(uint32_t *)src;
+ dest += 4;
+ src += 4;
+ n-=4;
+ }
+ while(n&0xf) {
+ *dest++ = *src++;
+ n--;
+ }
+ /* bug fix: the loop counter used to be an input-only operand
+ * ("c" (n>>4)) yet the asm decrements it; GCC extended asm forbids
+ * modifying input operands.  Pass it as a read-write operand.  dest
+ * and src are only read (indexing goes through i), so they move to
+ * the input list. */
+ blocks = n>>4;
+ if (blocks) asm volatile (
+ "1:\n"
+ " movq (%[src],%[i]), %%mm0\n"
+ " movq 8(%[src],%[i]), %%mm1\n"
+ " movq %%mm0, (%[dest],%[i])\n"
+ " movq %%mm1, 8(%[dest],%[i])\n"
+ " add $16, %[i]\n"
+ " decl %[n]\n"
+ " jne 1b\n"
+ " emms\n"
+ : [i] "+r" (i),
+ [n] "+c" (blocks)
+ : [dest] "r" (dest),
+ [src] "r" (src));
+}
+OIL_DEFINE_IMPL_FULL (copy_u8_mmx5, copy_u8, OIL_IMPL_FLAG_MMX);
+
+
+/* splat_u8_ns: fill dest[0..n-1] with the byte *param. */
+static void splat_u8_ns_i386_mmx (uint8_t *dest, const uint8_t *param, int n)
+{
+ uint32_t p;
+ /* store single bytes until n is a multiple of 8 */
+ while(n&7) {
+ *dest = *param;
+ dest++;
+ n--;
+ }
+ if (n==0) return;
+ n >>= 3;
+ /* p holds the byte replicated into all 4 bytes of a dword; movd plus
+ * punpcklbw widens that to all 8 bytes of mm0 */
+ p = (*param<<24) | (*param<<16) | (*param<<8) | (*param);
+ asm volatile (
+ " movd %2, %%mm0\n"
+ " punpcklbw %%mm0, %%mm0\n"
+ "1:\n"
+ " movq %%mm0, (%0)\n"
+ " add $8, %0\n"
+ " decl %1\n"
+ " jnz 1b\n"
+ " emms\n"
+ : "+r" (dest), "+r" (n), "+r" (p));
+}
+OIL_DEFINE_IMPL_FULL (splat_u8_ns_i386_mmx, splat_u8_ns, OIL_IMPL_FLAG_MMX);
+
+/* splat_u8_ns, unrolled: fill dest[0..n-1] with the byte *param, two
+ * 8-byte stores per loop iteration. */
+static void splat_u8_ns_mmx2 (uint8_t *dest, const uint8_t *param, int n)
+{
+ uint32_t p;
+ /* store single bytes until n is a multiple of 16 */
+ while(n&15) {
+ *dest = *param;
+ dest++;
+ n--;
+ }
+ if (n==0) return;
+ n >>= 4;
+ /* byte replicated into all 4 bytes of a dword, widened to 8 in mm0 */
+ p = (*param<<24) | (*param<<16) | (*param<<8) | (*param);
+ asm volatile (
+ " movd %2, %%mm0\n"
+ " punpcklbw %%mm0, %%mm0\n"
+ "1:\n"
+ " movq %%mm0, (%0)\n"
+ " movq %%mm0, 8(%0)\n"
+ " add $16, %0\n"
+ " decl %1\n"
+ " jnz 1b\n"
+ " emms\n"
+ : "+r" (dest), "+r" (n), "+r" (p));
+}
+OIL_DEFINE_IMPL_FULL(splat_u8_ns_mmx2, splat_u8_ns, OIL_IMPL_FLAG_MMX);
+
+/* splat_u8_ns without a scalar alignment loop: one 16-byte splat is
+ * written at dest, then dest is advanced by n&15 so the following
+ * n>>4 16-byte stores overlap the first store and end exactly at
+ * dest+n.  Requires n>=16, guaranteed by the early out. */
+static void splat_u8_ns_mmx2a (uint8_t *dest, const uint8_t *param, int n)
+{
+ uint32_t p;
+ ptrdiff_t tmp;
+
+ /* replicate the byte into all 4 bytes of p */
+ p = *param;
+ p |= p<<8;
+ p |= p<<16;
+ if (n<16) {
+ while(n>0) {
+ *dest = *param;
+ dest++;
+ n--;
+ }
+ return;
+ }
+ asm volatile (
+ " movd %2, %%mm0\n"
+ " punpcklbw %%mm0, %%mm0\n"
+ " movq %%mm0, (%0)\n"
+ " movq %%mm0, 8(%0)\n"
+#ifdef __i386__
+ " mov %1, %[tmp]\n"
+#elif defined(__amd64__)
+ /* ugh this is gross */
+ /* NOTE(review): tmp is pinned to %rax ("=a" below), so this widens
+ * n into tmp via eax; relies on n>=16 (never negative) here */
+ " mov %1, %%eax\n"
+ " cltq\n"
+#else
+#error
+#endif
+ " and $0xf, %[tmp]\n"
+ " add %[tmp], %0\n"
+ " shr $4, %1\n"
+ "1:\n"
+ " movq %%mm0, (%0)\n"
+ " movq %%mm0, 8(%0)\n"
+ " add $16, %0\n"
+ " decl %1\n"
+ " jnz 1b\n"
+ " emms\n"
+ : "+r" (dest),
+ "+r" (n),
+ "+r" (p),
+ [tmp] "=a" (tmp)
+ : );
+}
+OIL_DEFINE_IMPL_FULL(splat_u8_ns_mmx2a, splat_u8_ns, OIL_IMPL_FLAG_MMX);
+
+/* splat_u16_ns: fill dest[0..n-1] with the 16-bit value *src; pshufw
+ * broadcasts the word to all four lanes of mm0, one movq per loop. */
+static void splat_u16_ns_mmx (uint16_t *dest, const uint16_t *src, int n)
+{
+ while(n&3) {
+ *dest = *src;
+ dest++;
+ n--;
+ }
+ if (n==0) return;
+ n >>= 2;
+ asm volatile (
+ " movzwl 0(%[src]), %%ecx\n"
+ " movd %%ecx, %%mm0\n"
+ " pshufw $00, %%mm0, %%mm0\n"
+ "1:\n"
+ " movq %%mm0, (%[dest])\n"
+ " add $8, %0\n"
+ " decl %[n]\n"
+ " jnz 1b\n"
+ " emms\n"
+ : [dest] "+r" (dest),
+ [n] "+r" (n)
+ : [src] "r" (src)
+ /* bug fix: %ecx is used as scratch above but was missing from the
+ * clobber list, so the compiler could keep an operand in it */
+ : "ecx");
+}
+/* bug fix: pshufw is an extended-MMX/SSE instruction, so the impl must
+ * also require MMXEXT or it faults on plain-MMX CPUs */
+OIL_DEFINE_IMPL_FULL (splat_u16_ns_mmx, splat_u16_ns, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+/* splat_u16_ns, unrolled: two movq stores (8 words) per loop. */
+static void splat_u16_ns_mmx_2 (uint16_t *dest, const uint16_t *src, int n)
+{
+ while(n&7) {
+ *dest = *src;
+ dest++;
+ n--;
+ }
+ if (n==0) return;
+ n >>= 3;
+ asm volatile (
+ " movzwl 0(%[src]), %%ecx\n"
+ " movd %%ecx, %%mm0\n"
+ " pshufw $00, %%mm0, %%mm0\n"
+ "1:\n"
+ " movq %%mm0, 0(%[dest])\n"
+ " movq %%mm0, 8(%[dest])\n"
+ " add $16, %0\n"
+ " decl %[n]\n"
+ " jnz 1b\n"
+ " emms\n"
+ : [dest] "+r" (dest),
+ [n] "+r" (n)
+ : [src] "r" (src)
+ /* bug fix: %ecx is used as scratch above but was missing from the
+ * clobber list, so the compiler could keep an operand in it */
+ : "ecx");
+}
+/* bug fix: pshufw is an extended-MMX/SSE instruction, so the impl must
+ * also require MMXEXT or it faults on plain-MMX CPUs */
+OIL_DEFINE_IMPL_FULL (splat_u16_ns_mmx_2, splat_u16_ns, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+/* splat_u16_ns, unrolled further: four movq stores (16 words) per
+ * loop. */
+static void splat_u16_ns_mmx_3 (uint16_t *dest, const uint16_t *src, int n)
+{
+ while(n&15) {
+ *dest = *src;
+ dest++;
+ n--;
+ }
+ if (n==0) return;
+ n >>= 4;
+ asm volatile (
+ " movzwl 0(%[src]), %%ecx\n"
+ " movd %%ecx, %%mm0\n"
+ " pshufw $00, %%mm0, %%mm0\n"
+ "1:\n"
+ " movq %%mm0, 0(%[dest])\n"
+ " movq %%mm0, 8(%[dest])\n"
+ " movq %%mm0, 16(%[dest])\n"
+ " movq %%mm0, 24(%[dest])\n"
+ " add $32, %0\n"
+ " decl %[n]\n"
+ " jnz 1b\n"
+ " emms\n"
+ : [dest] "+r" (dest),
+ [n] "+r" (n)
+ : [src] "r" (src)
+ /* bug fix: %ecx is used as scratch above but was missing from the
+ * clobber list, so the compiler could keep an operand in it */
+ : "ecx");
+}
+/* bug fix: pshufw is an extended-MMX/SSE instruction, so the impl must
+ * also require MMXEXT or it faults on plain-MMX CPUs */
+OIL_DEFINE_IMPL_FULL (splat_u16_ns_mmx_3, splat_u16_ns, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
diff --git a/liboil/i386_amd64/math.c b/liboil/i386_amd64/math.c
new file mode 100644
index 0000000..3db6112
--- /dev/null
+++ b/liboil/i386_amd64/math.c
@@ -0,0 +1,134 @@
+
+#include <liboil/liboilfunction.h>
+#include <liboil/liboilclasses.h>
+
+/* add_s16: d1[i] = s1[i] + s2[i] (wrapping 16-bit add, as paddw), four
+ * elements per MMX loop iteration after a scalar head loop.
+ * NOTE(review): unlike the impls in the other files of this directory
+ * these math.c functions are not declared static — confirm nothing
+ * links against the external symbols before tightening linkage. */
+void
+add_s16_mmx(int16_t *d1, int16_t *s1, int16_t *s2, int n)
+{
+ while(n&3) {
+ d1[0] = s1[0] + s2[0];
+ d1++;
+ s1++;
+ s2++;
+ n--;
+ }
+ n>>=2;
+ if (n==0) return;
+ asm volatile ("\n"
+ "1:\n"
+ " movq 0(%2), %%mm0\n"
+ " paddw 0(%1), %%mm0\n"
+ " movq %%mm0, 0(%0)\n"
+ " add $8, %0\n"
+ " add $8, %1\n"
+ " add $8, %2\n"
+ " decl %3\n"
+ " jnz 1b\n"
+ " emms\n"
+ : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
+ );
+
+}
+OIL_DEFINE_IMPL_FULL (add_s16_mmx, add_s16, OIL_IMPL_FLAG_MMX);
+
+/* add_s16_u8: d1[i] = s1[i] + s2[i], widening the u8 source to words
+ * (punpck against mm7==0); eight elements per loop iteration. */
+void
+add_s16_u8_mmx(int16_t *d1, int16_t *s1, uint8_t *s2, int n)
+{
+ while(n&7) {
+ d1[0] = s1[0] + s2[0];
+ d1++;
+ s1++;
+ s2++;
+ n--;
+ }
+ n>>=3;
+ if (n==0) return;
+ asm volatile ("\n"
+ " pxor %%mm7, %%mm7\n"
+ "1:\n"
+ /* load the same 8 bytes twice; low half unpacks into mm0, high
+ * half into mm1 */
+ " movq 0(%2), %%mm0\n"
+ " movq 0(%2), %%mm1\n"
+ " punpcklbw %%mm7, %%mm0\n"
+ " punpckhbw %%mm7, %%mm1\n"
+ " paddw 0(%1), %%mm0\n"
+ " paddw 8(%1), %%mm1\n"
+ " movq %%mm0, 0(%0)\n"
+ " movq %%mm1, 8(%0)\n"
+ " add $16, %0\n"
+ " add $16, %1\n"
+ " add $8, %2\n"
+ " decl %3\n"
+ " jnz 1b\n"
+ " emms\n"
+ : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
+ );
+
+}
+OIL_DEFINE_IMPL_FULL (add_s16_u8_mmx, add_s16_u8, OIL_IMPL_FLAG_MMX);
+
+/* subtract_s16: d1[i] = s1[i] - s2[i] (wrapping 16-bit subtract, as
+ * psubw), four elements per MMX loop iteration. */
+void
+subtract_s16_mmx(int16_t *d1, int16_t *s1, int16_t *s2, int n)
+{
+ while(n&3) {
+ d1[0] = s1[0] - s2[0];
+ d1++;
+ s1++;
+ s2++;
+ n--;
+ }
+ n>>=2;
+ if (n==0) return;
+ asm volatile ("\n"
+ "1:\n"
+ " movq 0(%1), %%mm0\n"
+ " psubw 0(%2), %%mm0\n"
+ " movq %%mm0, 0(%0)\n"
+ " add $8, %0\n"
+ " add $8, %1\n"
+ " add $8, %2\n"
+ " decl %3\n"
+ " jnz 1b\n"
+ " emms\n"
+ : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
+ );
+
+}
+OIL_DEFINE_IMPL_FULL (subtract_s16_mmx, subtract_s16, OIL_IMPL_FLAG_MMX);
+
+/* subtract_s16_u8: d1[i] = s1[i] - s2[i], widening the u8 source to
+ * words (punpck against mm7==0); eight elements per loop iteration. */
+void
+subtract_s16_u8_mmx(int16_t *d1, int16_t *s1, uint8_t *s2, int n)
+{
+ while(n&7) {
+ d1[0] = s1[0] - s2[0];
+ d1++;
+ s1++;
+ s2++;
+ n--;
+ }
+ n>>=3;
+ if (n==0) return;
+ asm volatile ("\n"
+ " pxor %%mm7, %%mm7\n"
+ "1:\n"
+ /* load the same 8 bytes twice; low half unpacks into mm0, high
+ * half into mm1 */
+ " movq 0(%2), %%mm0\n"
+ " movq 0(%2), %%mm1\n"
+ " punpcklbw %%mm7, %%mm0\n"
+ " punpckhbw %%mm7, %%mm1\n"
+ " movq 0(%1), %%mm2\n"
+ " psubw %%mm0, %%mm2\n"
+ " movq 8(%1), %%mm3\n"
+ " psubw %%mm1, %%mm3\n"
+ " movq %%mm2, 0(%0)\n"
+ " movq %%mm3, 8(%0)\n"
+ " add $16, %0\n"
+ " add $16, %1\n"
+ " add $8, %2\n"
+ " decl %3\n"
+ " jnz 1b\n"
+ " emms\n"
+ : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n)
+ );
+
+}
+OIL_DEFINE_IMPL_FULL (subtract_s16_u8_mmx, subtract_s16_u8, OIL_IMPL_FLAG_MMX);
+
diff --git a/liboil/i386_amd64/sad8x8.c b/liboil/i386_amd64/sad8x8.c
new file mode 100644
index 0000000..34cf96d
--- /dev/null
+++ b/liboil/i386_amd64/sad8x8.c
@@ -0,0 +1,490 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/liboilclasses.h>
+#include <stddef.h>
+
+
+/* sad8x8_u8: sum of absolute differences over an 8x8 block.  Plain-MMX
+ * variant: per-byte abs difference via (a -us b) | (b -us a), widened
+ * to words against mm6==0 and accumulated in mm7.  The four word lanes
+ * are folded with two shift/add steps; the final "andl $0xffff" masks
+ * off the partial sums left in the upper lanes. */
+static void
+sad8x8_u8_mmx (uint32_t * dest, uint8_t * src1, int sstr1, uint8_t * src2,
+ int sstr2)
+{
+ uint32_t diff;
+
+ __asm__ __volatile__ (
+ " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+
+#define LOOP \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%2), %%mm1 \n\t" \
+ " movq %%mm0, %%mm2 \n\t" \
+ \
+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */ \
+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */ \
+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ \
+ " movq %%mm0, %%mm1 \n\t" \
+ \
+ " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */ \
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
+
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+#undef LOOP
+
+ " movq %%mm7, %%mm0 \n\t"
+ " psrlq $32, %%mm7 \n\t"
+ " paddw %%mm0, %%mm7 \n\t"
+ " movq %%mm7, %%mm0 \n\t"
+ " psrlq $16, %%mm7 \n\t"
+ " paddw %%mm0, %%mm7 \n\t"
+ " movd %%mm7, %0 \n\t"
+ " andl $0xffff, %0 \n\t"
+ " emms \n\t"
+
+ : "=m" (diff),
+ "+r" (src1),
+ "+r" (src2)
+ : "r" ((ptrdiff_t)(sstr1)),
+ "r" ((ptrdiff_t)(sstr2))
+ : "memory"
+ );
+ *dest = diff;
+}
+OIL_DEFINE_IMPL_FULL (sad8x8_u8_mmx, sad8x8_u8, OIL_IMPL_FLAG_MMX);
+
+/* sad8x8_u8, MMXEXT variant: psadbw computes a whole row's SAD at
+ * once.  Seven macro-expanded rows plus an explicit eighth row (which
+ * omits the pointer advance) cover the 8x8 block. */
+static void
+sad8x8_u8_mmxext (uint32_t * dest, uint8_t * src1, int sstr1, uint8_t * src2,
+ int sstr2)
+{
+ uint32_t diff;
+
+ __asm__ __volatile__ (
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+
+#define LOOP \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%2), %%mm1 \n\t" \
+ " psadbw %%mm1, %%mm0 \n\t" \
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
+
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+#undef LOOP
+
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t"
+ " psadbw %%mm1, %%mm0 \n\t"
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " movd %%mm7, %0 \n\t"
+ " emms \n\t"
+
+ : "=r" (diff),
+ "+r" (src1),
+ "+r" (src2)
+ : "r" ((ptrdiff_t)(sstr1)),
+ "r" ((ptrdiff_t)(sstr2))
+ : "memory"
+ );
+ *dest = diff;
+}
+OIL_DEFINE_IMPL_FULL (sad8x8_u8_mmxext, sad8x8_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+/* sad8x8_u8, MMXEXT variant 2: like sad8x8_u8_mmxext but lets psadbw
+ * take its second operand directly from memory. */
+static void
+sad8x8_u8_mmxext_2 (uint32_t * dest, uint8_t * src1, int sstr1, uint8_t * src2,
+ int sstr2)
+{
+ uint32_t diff;
+
+ __asm__ __volatile__ (
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+
+#define LOOP \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " psadbw (%2), %%mm0 \n\t" \
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
+
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+#undef LOOP
+
+ /* eighth row, done outside the macro so the pointers are not
+ * advanced past the block (a stray line-continuation backslash was
+ * removed from the psadbw line below) */
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " psadbw (%2), %%mm0 \n\t"
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " movd %%mm7, %0 \n\t"
+ " emms \n\t"
+
+ : "=r" (diff),
+ "+r" (src1),
+ "+r" (src2)
+ : "r" ((ptrdiff_t)(sstr1)),
+ "r" ((ptrdiff_t)(sstr2))
+ : "memory"
+ );
+ *dest = diff;
+}
+OIL_DEFINE_IMPL_FULL (sad8x8_u8_mmxext_2, sad8x8_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+/* sad8x8_u8, fully unrolled MMXEXT variant: two rows per group using
+ * scale-by-1 addressing, advancing both pointers two rows at a time
+ * with lea; the last group omits the advance. */
+static void
+sad8x8_u8_mmxext_3 (uint32_t * dest, uint8_t * src1, int sstr1, uint8_t * src2,
+ int sstr2)
+{
+ uint32_t diff;
+
+ __asm__ __volatile__ (
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " psadbw (%2), %%mm0 \n\t"
+ " movq (%1,%3), %%mm1 \n\t" /* take 8 bytes */
+ " psadbw (%2,%4), %%mm1 \n\t"
+ " lea (%1,%3,2), %1 \n\t" /* Inc pointer into the new data */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " lea (%2,%4,2), %2 \n\t" /* Inc pointer into ref data */
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
+
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " psadbw (%2), %%mm0 \n\t"
+ " movq (%1,%3), %%mm1 \n\t" /* take 8 bytes */
+ " psadbw (%2,%4), %%mm1 \n\t"
+ " lea (%1,%3,2), %1 \n\t" /* Inc pointer into the new data */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " lea (%2,%4,2), %2 \n\t" /* Inc pointer into ref data */
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
+
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " psadbw (%2), %%mm0 \n\t"
+ " movq (%1,%3), %%mm1 \n\t" /* take 8 bytes */
+ " psadbw (%2,%4), %%mm1 \n\t"
+ " lea (%1,%3,2), %1 \n\t" /* Inc pointer into the new data */
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " lea (%2,%4,2), %2 \n\t" /* Inc pointer into ref data */
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
+
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " psadbw (%2), %%mm0 \n\t"
+ " movq (%1,%3), %%mm1 \n\t" /* take 8 bytes */
+ " psadbw (%2,%4), %%mm1 \n\t"
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
+
+ " movd %%mm7, %0 \n\t"
+ " emms \n\t"
+
+ : "=r" (diff),
+ "+r" (src1),
+ "+r" (src2)
+ : "r" ((ptrdiff_t)(sstr1)),
+ "r" ((ptrdiff_t)(sstr2))
+ : "memory"
+ );
+ *dest = diff;
+}
+OIL_DEFINE_IMPL_FULL (sad8x8_u8_mmxext_3, sad8x8_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+/* sad8x8_u8, MMXEXT variant using only scaled addressing: first sums
+ * rows 0, 1, 2 and 4 (reachable with scale 1/2/4), then repoints both
+ * pointers at row 7 and negates the strides so rows 7, 6, 5 and 3 are
+ * reachable the same way — together all eight rows exactly once. */
+static void
+sad8x8_u8_mmxext_4 (uint32_t * dest, uint8_t * src1, int sstr1, uint8_t * src2,
+ int sstr2)
+{
+ uint32_t diff;
+
+ __asm__ __volatile__ (
+ " movq (%1), %%mm6 \n\t"
+ " movq (%1,%3,1), %%mm7 \n\t"
+ " psadbw (%2), %%mm6 \n\t"
+ " psadbw (%2,%4,1), %%mm7 \n\t"
+ " movq (%1,%3,2), %%mm0 \n\t"
+ " movq (%1,%3,4), %%mm1 \n\t"
+ " psadbw (%2,%4,2), %%mm0 \n\t"
+ " psadbw (%2,%4,4), %%mm1 \n\t"
+ " paddw %%mm0, %%mm6 \n\t"
+ " paddw %%mm1, %%mm7 \n\t"
+
+ /* point both bases at row 7 (base + 8*stride - stride) and flip
+ * the stride signs for the second half */
+ " lea (%1,%3,8), %1 \n\t"
+ " lea (%2,%4,8), %2 \n\t"
+ " neg %3\n\t"
+ " neg %4\n\t"
+ " lea (%1,%3), %1 \n\t"
+ " lea (%2,%4), %2 \n\t"
+
+ " movq (%1), %%mm0 \n\t"
+ " movq (%1,%3,1), %%mm1 \n\t"
+ " psadbw (%2), %%mm0 \n\t"
+ " psadbw (%2,%4,1), %%mm1 \n\t"
+ " paddw %%mm0, %%mm6 \n\t"
+ " paddw %%mm1, %%mm7 \n\t"
+ " movq (%1,%3,2), %%mm0 \n\t"
+ " movq (%1,%3,4), %%mm1 \n\t"
+ " psadbw (%2,%4,2), %%mm0 \n\t"
+ " psadbw (%2,%4,4), %%mm1 \n\t"
+ " paddw %%mm0, %%mm6 \n\t"
+ " paddw %%mm1, %%mm7 \n\t"
+
+ " paddw %%mm6, %%mm7 \n\t"
+ " movd %%mm7, %0 \n\t"
+
+ " emms \n\t"
+
+ : "=r" (diff),
+ "+r" (src1),
+ "+r" (src2)
+ : "r" ((ptrdiff_t)(sstr1)),
+ "r" ((ptrdiff_t)(sstr2))
+ : "memory"
+ );
+ *dest = diff;
+}
+OIL_DEFINE_IMPL_FULL (sad8x8_u8_mmxext_4, sad8x8_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+/* sad8x8_8xn_u8: SAD of one 8x8 block in src1 against n candidate 8x8
+ * blocks in src2 (the candidate base advances by sstr2 per result),
+ * writing one uint32 per candidate into dest.  Rows are covered in two
+ * passes — {0,1,2,4} and then {7,6,5,3} with negated strides — so that
+ * every row is reachable with scale-by-1/2/4 addressing; the second
+ * pass adds its partial sums into dest. */
+static void
+sad8x8_8xn_u8_psadbw (uint32_t * dest, uint8_t * src1, int sstr1, uint8_t * src2,
+ int sstr2, int n)
+{
+ int n_tmp = n;
+ uint32_t *dest_tmp = dest;
+ uint8_t *src2_tmp = src2;
+ uint8_t *src1_tmp = src1;
+
+ __asm__ __volatile__ ("\n"
+ "1:\n"
+ " movq (%[src1]), %%mm7 \n\t"
+ " psadbw (%[src2]), %%mm7 \n\t"
+ " movq (%[src1],%[sstr1],1), %%mm1 \n\t"
+ " psadbw (%[src2],%[sstr2],1), %%mm1 \n\t"
+ " paddw %%mm1, %%mm7 \n\t"
+ " movq (%[src1],%[sstr1],2), %%mm0 \n\t"
+ " psadbw (%[src2],%[sstr2],2), %%mm0 \n\t"
+ " paddw %%mm0, %%mm7 \n\t"
+ " movq (%[src1],%[sstr1],4), %%mm1 \n\t"
+ " psadbw (%[src2],%[sstr2],4), %%mm1 \n\t"
+ " paddw %%mm1, %%mm7 \n\t"
+ " movd %%mm7, 0(%[dest]) \n\t"
+ " add %[sstr2],%[src2]\n\t"
+ " add $4, %[dest]\n\t"
+ " decl %[n]\n\t"
+ " jnz 1b\n\t"
+ " emms \n\t"
+ : [src1] "+r" (src1_tmp),
+ [src2] "+r" (src2_tmp),
+ [n] "+m" (n_tmp),
+ [dest] "+r" (dest_tmp)
+ : [sstr1] "r" ((ptrdiff_t)(sstr1)),
+ [sstr2] "r" ((ptrdiff_t)(sstr2))
+ : "memory"
+ );
+
+ /* point at row 7 and negate the strides so rows 6, 5 and 3 sit at
+ * scale 1, 2 and 4 below */
+ src1 = OIL_OFFSET(src1, 7 * sstr1);
+ src2 = OIL_OFFSET(src2, 7 * sstr2);
+ sstr1 = -sstr1;
+ sstr2 = -sstr2;
+
+ __asm__ __volatile__ ("\n"
+ "1:\n"
+ " movq (%[src1]), %%mm7 \n\t"
+ " psadbw (%[src2]), %%mm7 \n\t"
+ " movq (%[src1],%[sstr1],1), %%mm1 \n\t"
+ " psadbw (%[src2],%[sstr2],1), %%mm1 \n\t"
+ " paddw %%mm1, %%mm7 \n\t"
+ " movq (%[src1],%[sstr1],2), %%mm0 \n\t"
+ " psadbw (%[src2],%[sstr2],2), %%mm0 \n\t"
+ " paddw %%mm0, %%mm7 \n\t"
+ " movq (%[src1],%[sstr1],4), %%mm1 \n\t"
+ " psadbw (%[src2],%[sstr2],4), %%mm1 \n\t"
+ " paddw %%mm1, %%mm7 \n\t"
+ /* bug fix: this used to bounce the sum through %ebx, push/popping
+ * it under #ifdef __i386__ only — on amd64 %ebx was clobbered
+ * without being in the clobber list, and on i386 the push moved
+ * %esp under the "+m"(n) memory operand.  Accumulate via mm2
+ * instead: movd loads/stores only 32 bits, and psadbw leaves the
+ * high dword of mm7 zero, so paddd's low dword is the full sum. */
+ " movd 0(%[dest]), %%mm2\n\t"
+ " paddd %%mm2, %%mm7\n\t"
+ " movd %%mm7, 0(%[dest])\n\t"
+ " sub %[sstr2],%[src2]\n\t"
+ " add $4, %[dest]\n\t"
+ " decl %[n]\n\t"
+ " jnz 1b\n\t"
+ " emms \n\t"
+ : [src1] "+r" (src1),
+ [src2] "+r" (src2),
+ [dest] "+r" (dest),
+ [n] "+m" (n)
+ : [sstr1] "r" ((ptrdiff_t)(sstr1)),
+ [sstr2] "r" ((ptrdiff_t)(sstr2))
+ : "memory"
+ );
+}
+OIL_DEFINE_IMPL_FULL (sad8x8_8xn_u8_psadbw, sad8x8_8xn_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+
+/* sad12x12_u8: SAD over a 12x12 block.  Each row is one movq (8 bytes)
+ * plus one movd (4 bytes); 11 macro-expanded rows plus an explicit
+ * last row that leaves the pointers unadvanced. */
+static void
+sad12x12_u8_mmxext (uint32_t * dest, uint8_t * src1, int sstr1, uint8_t * src2,
+ int sstr2)
+{
+ uint32_t diff;
+
+ __asm__ __volatile__ (
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+ " pxor %%mm2, %%mm2 \n\t"
+ " pxor %%mm3, %%mm3 \n\t"
+
+#define LOOP \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%2), %%mm1 \n\t" \
+ " psadbw %%mm1, %%mm0 \n\t" \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " movd 8(%1), %%mm2 \n\t" /* take 4 bytes */ \
+ " movd 8(%2), %%mm3 \n\t" \
+ " psadbw %%mm3, %%mm2 \n\t" \
+ " paddw %%mm2, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
+
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+#undef LOOP
+
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t"
+ " psadbw %%mm1, %%mm0 \n\t"
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " movd 8(%1), %%mm2 \n\t" /* take 4 bytes */
+ " movd 8(%2), %%mm3 \n\t"
+ " psadbw %%mm3, %%mm2 \n\t"
+ " paddw %%mm2, %%mm7 \n\t" /* accumulate difference... */
+ " movd %%mm7, %0 \n\t"
+ " emms \n\t"
+
+ : "=r" (diff),
+ "+r" (src1),
+ "+r" (src2)
+ : "r" ((ptrdiff_t)(sstr1)),
+ "r" ((ptrdiff_t)(sstr2))
+ : "memory"
+ );
+ *dest = diff;
+}
+OIL_DEFINE_IMPL_FULL (sad12x12_u8_mmxext, sad12x12_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+
+/* sad16x16_u8: SAD over a 16x16 block.  Each row is two movq/psadbw
+ * pairs (16 bytes); 15 macro-expanded rows plus an explicit last row
+ * that leaves the pointers unadvanced. */
+static void
+sad16x16_u8_mmxext (uint32_t * dest, uint8_t * src1, int sstr1, uint8_t * src2,
+ int sstr2)
+{
+ uint32_t diff;
+
+ __asm__ __volatile__ (
+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
+
+#define LOOP \
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
+ " movq (%2), %%mm1 \n\t" \
+ " psadbw %%mm1, %%mm0 \n\t" \
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
+ " movq 8(%1), %%mm2 \n\t" /* take 8 bytes */ \
+ " movq 8(%2), %%mm3 \n\t" \
+ " psadbw %%mm3, %%mm2 \n\t" \
+ " paddw %%mm2, %%mm7 \n\t" /* accumulate difference... */ \
+ " add %3, %1 \n\t" /* Inc pointer into the new data */ \
+ " add %4, %2 \n\t" /* Inc pointer into ref data */
+
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+ LOOP
+#undef LOOP
+
+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */
+ " movq (%2), %%mm1 \n\t"
+ " psadbw %%mm1, %%mm0 \n\t"
+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
+ " movq 8(%1), %%mm2 \n\t" /* take 8 bytes */
+ " movq 8(%2), %%mm3 \n\t"
+ " psadbw %%mm3, %%mm2 \n\t"
+ " paddw %%mm2, %%mm7 \n\t" /* accumulate difference... */
+ " movd %%mm7, %0 \n\t"
+ " emms \n\t"
+
+ : "=r" (diff),
+ "+r" (src1),
+ "+r" (src2)
+ : "r" ((ptrdiff_t)(sstr1)),
+ "r" ((ptrdiff_t)(sstr2))
+ : "memory"
+ );
+ *dest = diff;
+}
+OIL_DEFINE_IMPL_FULL (sad16x16_u8_mmxext, sad16x16_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
+