diff options
author | David Schleef <ds@schleef.org> | 2007-06-13 23:40:02 +0000 |
---|---|---|
committer | David Schleef <ds@schleef.org> | 2007-06-13 23:40:02 +0000 |
commit | aea34d1babca1373a1a32f6d7162d52c41193604 (patch) | |
tree | 7422e0cb443b3124145a7258868bcfa2aa276a2d /liboil/i386_amd64 | |
parent | a65707e581b676152474147439579750c11d459c (diff) | |
download | liboil-aea34d1babca1373a1a32f6d7162d52c41193604.tar.gz |
* liboil/amd64/wavelet.c:
* liboil/i386/Makefile.am:
* liboil/i386/convert_i386.c:
* liboil/i386/math.c:
* liboil/i386/sad8x8_i386.c:
* liboil/i386_amd64/Makefile.am:
* liboil/i386_amd64/convert.c:
* liboil/i386_amd64/copy.c:
* liboil/i386_amd64/math.c:
* liboil/i386_amd64/sad8x8.c:
Convert a bunch of stuff to dual-arch.
Diffstat (limited to 'liboil/i386_amd64')
-rw-r--r-- | liboil/i386_amd64/Makefile.am | 4 | ||||
-rw-r--r-- | liboil/i386_amd64/convert.c | 137 | ||||
-rw-r--r-- | liboil/i386_amd64/copy.c | 435 | ||||
-rw-r--r-- | liboil/i386_amd64/math.c | 134 | ||||
-rw-r--r-- | liboil/i386_amd64/sad8x8.c | 490 |
5 files changed, 1200 insertions, 0 deletions
diff --git a/liboil/i386_amd64/Makefile.am b/liboil/i386_amd64/Makefile.am index f5412cc..89dc2f6 100644 --- a/liboil/i386_amd64/Makefile.am +++ b/liboil/i386_amd64/Makefile.am @@ -3,9 +3,13 @@ noinst_LTLIBRARIES = libi386_amd64.la libi386_amd64_la_SOURCES = \ clamp.c \ + convert.c \ + copy.c \ idct8x8_i386.c \ + math.c \ mt19937.c \ resample.c \ + sad8x8.c \ sum.c \ swab.c diff --git a/liboil/i386_amd64/convert.c b/liboil/i386_amd64/convert.c new file mode 100644 index 0000000..cf8e126 --- /dev/null +++ b/liboil/i386_amd64/convert.c @@ -0,0 +1,137 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2006 David A. Schleef <ds@schleef.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#include <liboil/liboilfunction.h> +#include <liboil/liboilclasses.h> + +static void +convert_u8_s16_mmx (uint8_t * dest, const int16_t * src, int n) +{ + while(n&7) { + int x; + x = src[0]; + if (x<0) x = 0; + if (x>255) x = 255; + dest[0] = x; + src++; + dest++; + n--; + } + + n>>=3; + __asm__ __volatile__ ("\n" + "1:\n" + " movq 0(%1), %%mm0\n" + " packuswb 8(%1), %%mm0\n" + " movq %%mm0, 0(%0)\n" + " add $16, %1\n" + " add $8, %0\n" + " decl %2\n" + " jg 1b\n" + " emms\n" + : "+r" (dest), "+r" (src), "+r" (n)); +} +OIL_DEFINE_IMPL_FULL (convert_u8_s16_mmx, convert_u8_s16, OIL_IMPL_FLAG_MMX); + +static void +convert_u8_s16_mmx_2 (uint8_t * dest, const int16_t * src, int n) +{ + while(n&7) { + int x; + x = src[0]; + if (x<0) x = 0; + if (x>255) x = 255; + dest[0] = x; + src++; + dest++; + n--; + } + if (n==0) return; + + n>>=3; + if (n&1) { + __asm__ __volatile__ ("\n" + " movq 0(%1), %%mm0\n" + " packuswb 8(%1), %%mm0\n" + " movq %%mm0, 0(%0)\n" + " add $16, %1\n" + " add $8, %0\n" + : "+r" (dest), "+r" (src), "+r" (n)); + } + + n >>= 1; + if (n > 0) { + __asm__ __volatile__ ("\n" + "2:\n" + " movq 0(%1), %%mm0\n" + " packuswb 8(%1), %%mm0\n" + " movq %%mm0, 0(%0)\n" + " movq 16(%1), %%mm0\n" + " packuswb 24(%1), %%mm0\n" + " movq %%mm0, 8(%0)\n" + " add $32, %1\n" + " add $16, %0\n" + " decl %2\n" + " jg 2b\n" + : "+r" (dest), "+r" (src), "+r" (n)); + } + __asm__ __volatile__ ("emms\n"); +} +OIL_DEFINE_IMPL_FULL (convert_u8_s16_mmx_2, convert_u8_s16, OIL_IMPL_FLAG_MMX); + +static void +convert_s16_u8_mmx (int16_t * dest, const uint8_t * src, int n) +{ + while(n&7) { + dest[0] = src[0]; + src++; + dest++; + n--; + } + + n>>=3; + __asm__ __volatile__ ("\n" + " pxor %%mm0, %%mm0\n" + "1:\n" + " movd 0(%1), %%mm1\n" + " punpcklbw %%mm0, %%mm1\n" + " movq %%mm1, 0(%0)\n" + " movd 4(%1), %%mm2\n" + " punpcklbw %%mm0, %%mm2\n" + " movq %%mm2, 8(%0)\n" + " add $8, %1\n" + " add $16, %0\n" + " decl %2\n" + " jg 1b\n" + " emms\n" + : "+r" (dest), "+r" (src), "+r" (n)); +} +OIL_DEFINE_IMPL_FULL (convert_s16_u8_mmx, convert_s16_u8, OIL_IMPL_FLAG_MMX); + diff --git a/liboil/i386_amd64/copy.c b/liboil/i386_amd64/copy.c new file mode 100644 index 0000000..9c7f55a --- /dev/null +++ b/liboil/i386_amd64/copy.c @@ -0,0 +1,435 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2004 David A. Schleef <ds@schleef.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> +#include <liboil/liboilclasses.h> +#include <stddef.h> + + +static void +copy_u8_i386_mmx (uint8_t *dest, uint8_t *src, int n) +{ + ptrdiff_t i = 0; + + if (n&4) { + *(uint32_t *)dest = *(uint32_t *)src; + dest += 4; + src += 4; + n-=4; + } + while(n&0x7) { + *dest++ = *src++; + n--; + } + if (n) asm volatile ( + "1:\n" + " movq (%[src],%[i]), %%mm0\n" + " movq %%mm0, (%[dest],%[i])\n" + " add $8, %[i]\n" + " decl %[n]\n" + " jne 1b\n" + " emms\n" + : [dest] "+r" (dest), + [src] "+r" (src), + [i] "+r" (i) + : [n] "c" (n>>3)); +} +OIL_DEFINE_IMPL_FULL (copy_u8_i386_mmx, copy_u8, OIL_IMPL_FLAG_MMX); + +static void +copy_u8_mmx2 (uint8_t *dest, uint8_t *src, int n) +{ + ptrdiff_t i = 0; + + while (n&0xc) { + *(uint32_t *)dest = *(uint32_t *)src; + dest += 4; + src += 4; + n-=4; + } + while(n&0xf) { + *dest++ = *src++; + n--; + } + if (n) asm volatile ( + "1:\n" + " movq (%[src],%[i]), %%mm0\n" + " movq %%mm0, (%[dest],%[i])\n" + " movq 8(%[src],%[i]), %%mm1\n" + " movq %%mm1, 8(%[dest],%[i])\n" + " add $16, %[i]\n" + " decl %[n]\n" + " jne 1b\n" + " emms\n" + : [dest] "+r" (dest), + [src] "+r" (src), + [i] "+r" (i) + : [n] "c" (n>>4)); +} +OIL_DEFINE_IMPL_FULL (copy_u8_mmx2, copy_u8, OIL_IMPL_FLAG_MMX); + +#if 0 +static void +copy_u8_mmx3 (uint8_t *dest, uint8_t *src, int n) +{ + /* make sure destination is cache-line aligned for output */ + if (n < 64) { + while (n>0) { + *dest++ = *src++; + n--; + } + return; + } + while (((unsigned long)dest) & 0x3) { + *dest++ = *src++; + n--; + } + while (((unsigned long)dest) & 0x3f) { + *(uint32_t *)dest = *(uint32_t *)src; + dest += 4; + src += 4; + n-=4; + } + if (n > 64) asm volatile ( + " mov $0, %%eax\n" + "1:\n" + //" prefetchnta 128(%1,%%eax)\n" + " movq (%1,%%eax), %%mm0\n" + " movq 8(%1,%%eax), %%mm1\n" + " movq 16(%1,%%eax), %%mm2\n" + " movq 24(%1,%%eax), %%mm3\n" + " movq 32(%1,%%eax), %%mm4\n" + " movq 40(%1,%%eax), %%mm5\n" + " movq 48(%1,%%eax), %%mm6\n" + " movq 56(%1,%%eax), %%mm7\n" + " movntq %%mm0, (%0,%%eax)\n" + " movntq %%mm1, 8(%0,%%eax)\n" + " movntq %%mm2, 16(%0,%%eax)\n" + " movntq %%mm3, 24(%0,%%eax)\n" + " movntq %%mm4, 32(%0,%%eax)\n" + " movntq %%mm5, 40(%0,%%eax)\n" + " movntq %%mm6, 48(%0,%%eax)\n" + " movntq %%mm7, 56(%0,%%eax)\n" + " add $64, %%eax\n" + " decl %%ecx\n" + " jne 1b\n" + " sfence\n" + " emms\n" + : "+r" (dest), "+r" (src) + : "c" (n>>6) + : "eax"); + + dest += n&(~(0x3f)); + src += n&(~(0x3f)); + n &= 0x3f; + while (n > 3) { + *(uint32_t *)dest = *(uint32_t *)src; + dest += 4; + src += 4; + n-=4; + } + while (n > 0) { + *dest++ = *src++; + n--; + } +} +OIL_DEFINE_IMPL_FULL (copy_u8_mmx3, copy_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT); +#endif + +#if 0 +static void +copy_u8_mmx4 (uint8_t *dest, uint8_t *src, int n) +{ + /* make sure destination is cache-line aligned for output */ + if (n < 32) { + while (n>0) { + *dest++ = *src++; + n--; + } + return; + } + while (((unsigned long)dest) & 0x3) { + *dest++ = *src++; + n--; + } + while (((unsigned long)dest) & 0x1f) { + *(uint32_t *)dest = *(uint32_t *)src; + dest += 4; + src += 4; + n-=4; + } + if (n > 32) asm volatile ( + " mov $0, %%eax\n" + "1:\n" + //" prefetchnta 128(%1,%%eax)\n" + " movq (%1,%%eax), %%mm0\n" + " movq 8(%1,%%eax), %%mm1\n" + " movq 16(%1,%%eax), %%mm2\n" + " movq 24(%1,%%eax), %%mm3\n" + " movntq %%mm0, (%0,%%eax)\n" + " movntq %%mm1, 8(%0,%%eax)\n" + " movntq %%mm2, 16(%0,%%eax)\n" + " movntq %%mm3, 24(%0,%%eax)\n" + " add $32, %%eax\n" + " decl %%ecx\n" + " jne 1b\n" + " sfence\n" + " emms\n" + : "+r" (dest), "+r" (src) + : "c" (n>>5) + : "eax"); + + dest += n&(~(0x1f)); + src += n&(~(0x1f)); + n &= 0x1f; + while (n > 3) { + *(uint32_t *)dest = *(uint32_t *)src; + dest += 4; + src += 4; + n-=4; + } + while (n > 0) { + *dest++ = *src++; + n--; + } +} +OIL_DEFINE_IMPL_FULL (copy_u8_mmx4, copy_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT); +#endif + +static void +copy_u8_mmx5 (uint8_t *dest, uint8_t *src, int n) +{ + ptrdiff_t i = 0; + + while (n&0xc) { + *(uint32_t *)dest = *(uint32_t *)src; + dest += 4; + src += 4; + n-=4; + } + while(n&0xf) { + *dest++ = *src++; + n--; + } + if (n) asm volatile ( + "1:\n" + " movq (%[src],%[i]), %%mm0\n" + " movq 8(%[src],%[i]), %%mm1\n" + " movq %%mm0, (%[dest],%[i])\n" + " movq %%mm1, 8(%[dest],%[i])\n" + " add $16, %[i]\n" + " decl %[n]\n" + " jne 1b\n" + " emms\n" + : [dest] "+r" (dest), + [src] "+r" (src), + [i] "+r" (i) + : [n] "c" (n>>4)); +} +OIL_DEFINE_IMPL_FULL (copy_u8_mmx5, copy_u8, OIL_IMPL_FLAG_MMX); + + +static void splat_u8_ns_i386_mmx (uint8_t *dest, const uint8_t *param, int n) +{ + uint32_t p; + while(n&7) { + *dest = *param; + dest++; + n--; + } + if (n==0) return; + n >>= 3; + p = (*param<<24) | (*param<<16) | (*param<<8) | (*param); + asm volatile ( + " movd %2, %%mm0\n" + " punpcklbw %%mm0, %%mm0\n" + "1:\n" + " movq %%mm0, (%0)\n" + " add $8, %0\n" + " decl %1\n" + " jnz 1b\n" + " emms\n" + : "+r" (dest), "+r" (n), "+r" (p)); +} +OIL_DEFINE_IMPL_FULL (splat_u8_ns_i386_mmx, splat_u8_ns, OIL_IMPL_FLAG_MMX); + +static void splat_u8_ns_mmx2 (uint8_t *dest, const uint8_t *param, int n) +{ + uint32_t p; + while(n&15) { + *dest = *param; + dest++; + n--; + } + if (n==0) return; + n >>= 4; + p = (*param<<24) | (*param<<16) | (*param<<8) | (*param); + asm volatile ( + " movd %2, %%mm0\n" + " punpcklbw %%mm0, %%mm0\n" + "1:\n" + " movq %%mm0, (%0)\n" + " movq %%mm0, 8(%0)\n" + " add $16, %0\n" + " decl %1\n" + " jnz 1b\n" + " emms\n" + : "+r" (dest), "+r" (n), "+r" (p)); +} +OIL_DEFINE_IMPL_FULL(splat_u8_ns_mmx2, splat_u8_ns, OIL_IMPL_FLAG_MMX); + +static void splat_u8_ns_mmx2a (uint8_t *dest, const uint8_t *param, int n) +{ + uint32_t p; + ptrdiff_t tmp; + + p = *param; + p |= p<<8; + p |= p<<16; + if (n<16) { + while(n>0) { + *dest = *param; + dest++; + n--; + } + return; + } + asm volatile ( + " movd %2, %%mm0\n" + " punpcklbw %%mm0, %%mm0\n" + " movq %%mm0, (%0)\n" + " movq %%mm0, 8(%0)\n" +#ifdef __i386__ + " mov %1, %[tmp]\n" +#elif defined(__amd64__) + /* ugh this is gross */ + " mov %1, %%eax\n" + " cltq\n" +#else +#error +#endif + " and $0xf, %[tmp]\n" + " add %[tmp], %0\n" + " shr $4, %1\n" + "1:\n" + " movq %%mm0, (%0)\n" + " movq %%mm0, 8(%0)\n" + " add $16, %0\n" + " decl %1\n" + " jnz 1b\n" + " emms\n" + : "+r" (dest), + "+r" (n), + "+r" (p), + [tmp] "=a" (tmp) + : ); +} +OIL_DEFINE_IMPL_FULL(splat_u8_ns_mmx2a, splat_u8_ns, OIL_IMPL_FLAG_MMX); + +static void splat_u16_ns_mmx (uint16_t *dest, const uint16_t *src, int n) +{ + while(n&3) { + *dest = *src; + dest++; + n--; + } + if (n==0) return; + n >>= 2; + asm volatile ( + " movzwl 0(%[src]), %%ecx\n" + " movd %%ecx, %%mm0\n" + " pshufw $00, %%mm0, %%mm0\n" + "1:\n" + " movq %%mm0, (%[dest])\n" + " add $8, %0\n" + " decl %[n]\n" + " jnz 1b\n" + " emms\n" + : [dest] "+r" (dest), + [n] "+r" (n) + : [src] "r" (src)); +} +OIL_DEFINE_IMPL_FULL (splat_u16_ns_mmx, splat_u16_ns, OIL_IMPL_FLAG_MMX); + +static void splat_u16_ns_mmx_2 (uint16_t *dest, const uint16_t *src, int n) +{ + while(n&7) { + *dest = *src; + dest++; + n--; + } + if (n==0) return; + n >>= 3; + asm volatile ( + " movzwl 0(%[src]), %%ecx\n" + " movd %%ecx, %%mm0\n" + " pshufw $00, %%mm0, %%mm0\n" + "1:\n" + " movq %%mm0, 0(%[dest])\n" + " movq %%mm0, 8(%[dest])\n" + " add $16, %0\n" + " decl %[n]\n" + " jnz 1b\n" + " emms\n" + : [dest] "+r" (dest), + [n] "+r" (n) + : [src] "r" (src)); +} +OIL_DEFINE_IMPL_FULL (splat_u16_ns_mmx_2, splat_u16_ns, OIL_IMPL_FLAG_MMX); + +static void splat_u16_ns_mmx_3 (uint16_t *dest, const uint16_t *src, int n) +{ + while(n&15) { + *dest = *src; + dest++; + n--; + } + if (n==0) return; + n >>= 4; + asm volatile ( + " movzwl 0(%[src]), %%ecx\n" + " movd %%ecx, %%mm0\n" + " pshufw $00, %%mm0, %%mm0\n" + "1:\n" + " movq %%mm0, 0(%[dest])\n" + " movq %%mm0, 8(%[dest])\n" + " movq %%mm0, 16(%[dest])\n" + " movq %%mm0, 24(%[dest])\n" + " add $32, %0\n" + " decl %[n]\n" + " jnz 1b\n" + " emms\n" + : [dest] "+r" (dest), + [n] "+r" (n) + : [src] "r" (src)); +} +OIL_DEFINE_IMPL_FULL (splat_u16_ns_mmx_3, splat_u16_ns, OIL_IMPL_FLAG_MMX); + diff --git a/liboil/i386_amd64/math.c b/liboil/i386_amd64/math.c new file mode 100644 index 0000000..3db6112 --- /dev/null +++ b/liboil/i386_amd64/math.c @@ -0,0 +1,134 @@ + +#include <liboil/liboilfunction.h> +#include <liboil/liboilclasses.h> + +void +add_s16_mmx(int16_t *d1, int16_t *s1, int16_t *s2, int n) +{ + while(n&3) { + d1[0] = s1[0] + s2[0]; + d1++; + s1++; + s2++; + n--; + } + n>>=2; + if (n==0) return; + asm volatile ("\n" + "1:\n" + " movq 0(%2), %%mm0\n" + " paddw 0(%1), %%mm0\n" + " movq %%mm0, 0(%0)\n" + " add $8, %0\n" + " add $8, %1\n" + " add $8, %2\n" + " decl %3\n" + " jnz 1b\n" + " emms\n" + : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n) + ); + +} +OIL_DEFINE_IMPL_FULL (add_s16_mmx, add_s16, OIL_IMPL_FLAG_MMX); + +void +add_s16_u8_mmx(int16_t *d1, int16_t *s1, uint8_t *s2, int n) +{ + while(n&7) { + d1[0] = s1[0] + s2[0]; + d1++; + s1++; + s2++; + n--; + } + n>>=3; + if (n==0) return; + asm volatile ("\n" + " pxor %%mm7, %%mm7\n" + "1:\n" + " movq 0(%2), %%mm0\n" + " movq 0(%2), %%mm1\n" + " punpcklbw %%mm7, %%mm0\n" + " punpckhbw %%mm7, %%mm1\n" + " paddw 0(%1), %%mm0\n" + " paddw 8(%1), %%mm1\n" + " movq %%mm0, 0(%0)\n" + " movq %%mm1, 8(%0)\n" + " add $16, %0\n" + " add $16, %1\n" + " add $8, %2\n" + " decl %3\n" + " jnz 1b\n" + " emms\n" + : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n) + ); + +} +OIL_DEFINE_IMPL_FULL (add_s16_u8_mmx, add_s16_u8, OIL_IMPL_FLAG_MMX); + +void +subtract_s16_mmx(int16_t *d1, int16_t *s1, int16_t *s2, int n) +{ + while(n&3) { + d1[0] = s1[0] - s2[0]; + d1++; + s1++; + s2++; + n--; + } + n>>=2; + if (n==0) return; + asm volatile ("\n" + "1:\n" + " movq 0(%1), %%mm0\n" + " psubw 0(%2), %%mm0\n" + " movq %%mm0, 0(%0)\n" + " add $8, %0\n" + " add $8, %1\n" + " add $8, %2\n" + " decl %3\n" + " jnz 1b\n" + " emms\n" + : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n) + ); + +} +OIL_DEFINE_IMPL_FULL (subtract_s16_mmx, subtract_s16, OIL_IMPL_FLAG_MMX); + +void +subtract_s16_u8_mmx(int16_t *d1, int16_t *s1, uint8_t *s2, int n) +{ + while(n&7) { + d1[0] = s1[0] - s2[0]; + d1++; + s1++; + s2++; + n--; + } + n>>=3; + if (n==0) return; + asm volatile ("\n" + " pxor %%mm7, %%mm7\n" + "1:\n" + " movq 0(%2), %%mm0\n" + " movq 0(%2), %%mm1\n" + " punpcklbw %%mm7, %%mm0\n" + " punpckhbw %%mm7, %%mm1\n" + " movq 0(%1), %%mm2\n" + " psubw %%mm0, %%mm2\n" + " movq 8(%1), %%mm3\n" + " psubw %%mm1, %%mm3\n" + " movq %%mm2, 0(%0)\n" + " movq %%mm3, 8(%0)\n" + " add $16, %0\n" + " add $16, %1\n" + " add $8, %2\n" + " decl %3\n" + " jnz 1b\n" + " emms\n" + : "+r" (d1), "+r" (s1), "+r" (s2), "+r" (n) + ); + +} +OIL_DEFINE_IMPL_FULL (subtract_s16_u8_mmx, subtract_s16_u8, OIL_IMPL_FLAG_MMX); + diff --git a/liboil/i386_amd64/sad8x8.c b/liboil/i386_amd64/sad8x8.c new file mode 100644 index 0000000..34cf96d --- /dev/null +++ b/liboil/i386_amd64/sad8x8.c @@ -0,0 +1,490 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> +#include <liboil/liboilclasses.h> +#include <stddef.h> + + +static void +sad8x8_u8_mmx (uint32_t * dest, uint8_t * src1, int sstr1, uint8_t * src2, + int sstr2) +{ + uint32_t diff; + + __asm__ __volatile__ ( + " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */ + " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */ + +#define LOOP \ + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \ + " movq (%2), %%mm1 \n\t" \ + " movq %%mm0, %%mm2 \n\t" \ + \ + " psubusb %%mm1, %%mm0 \n\t" /* A - B */ \ + " psubusb %%mm2, %%mm1 \n\t" /* B - A */ \ + " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ \ + " movq %%mm0, %%mm1 \n\t" \ + \ + " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */ \ + " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \ + " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */ \ + " add %3, %1 \n\t" /* Inc pointer into the new data */ \ + " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */ \ + " add %4, %2 \n\t" /* Inc pointer into ref data */ + + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP +#undef LOOP + + " movq %%mm7, %%mm0 \n\t" + " psrlq $32, %%mm7 \n\t" + " paddw %%mm0, %%mm7 \n\t" + " movq %%mm7, %%mm0 \n\t" + " psrlq $16, %%mm7 \n\t" + " paddw %%mm0, %%mm7 \n\t" + " movd %%mm7, %0 \n\t" + " andl $0xffff, %0 \n\t" + " emms \n\t" + + : "=m" (diff), + "+r" (src1), + "+r" (src2) + : "r" ((ptrdiff_t)(sstr1)), + "r" ((ptrdiff_t)(sstr2)) + : "memory" + ); + *dest = diff; +} +OIL_DEFINE_IMPL_FULL (sad8x8_u8_mmx, sad8x8_u8, OIL_IMPL_FLAG_MMX); + +static void +sad8x8_u8_mmxext (uint32_t * dest, uint8_t * src1, int sstr1, uint8_t * src2, + int sstr2) +{ + uint32_t diff; + + __asm__ __volatile__ ( + " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */ + +#define LOOP \ + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \ + " movq (%2), %%mm1 \n\t" \ + " psadbw %%mm1, %%mm0 \n\t" \ + " add %3, %1 \n\t" /* Inc pointer into the new data */ \ + " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \ + " add %4, %2 \n\t" /* Inc pointer into ref data */ + + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP +#undef LOOP + + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " movq (%2), %%mm1 \n\t" + " psadbw %%mm1, %%mm0 \n\t" + " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ + " movd %%mm7, %0 \n\t" + " emms \n\t" + + : "=r" (diff), + "+r" (src1), + "+r" (src2) + : "r" ((ptrdiff_t)(sstr1)), + "r" ((ptrdiff_t)(sstr2)) + : "memory" + ); + *dest = diff; +} +OIL_DEFINE_IMPL_FULL (sad8x8_u8_mmxext, sad8x8_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT); + +static void +sad8x8_u8_mmxext_2 (uint32_t * dest, uint8_t * src1, int sstr1, uint8_t * src2, + int sstr2) +{ + uint32_t diff; + + __asm__ __volatile__ ( + " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */ + +#define LOOP \ + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \ + " psadbw (%2), %%mm0 \n\t" \ + " add %3, %1 \n\t" /* Inc pointer into the new data */ \ + " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \ + " add %4, %2 \n\t" /* Inc pointer into ref data */ + + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP +#undef LOOP + + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " psadbw (%2), %%mm0 \n\t" \ + " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ + " movd %%mm7, %0 \n\t" + " emms \n\t" + + : "=r" (diff), + "+r" (src1), + "+r" (src2) + : "r" ((ptrdiff_t)(sstr1)), + "r" ((ptrdiff_t)(sstr2)) + : "memory" + ); + *dest = diff; +} +OIL_DEFINE_IMPL_FULL (sad8x8_u8_mmxext_2, sad8x8_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT); + +static void +sad8x8_u8_mmxext_3 (uint32_t * dest, uint8_t * src1, int sstr1, uint8_t * src2, + int sstr2) +{ + uint32_t diff; + + __asm__ __volatile__ ( + " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */ + + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " psadbw (%2), %%mm0 \n\t" + " movq (%1,%3), %%mm1 \n\t" /* take 8 bytes */ + " psadbw (%2,%4), %%mm1 \n\t" + " lea (%1,%3,2), %1 \n\t" /* Inc pointer into the new data */ + " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ + " lea (%2,%4,2), %2 \n\t" /* Inc pointer into ref data */ + " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */ + + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " psadbw (%2), %%mm0 \n\t" + " movq (%1,%3), %%mm1 \n\t" /* take 8 bytes */ + " psadbw (%2,%4), %%mm1 \n\t" + " lea (%1,%3,2), %1 \n\t" /* Inc pointer into the new data */ + " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ + " lea (%2,%4,2), %2 \n\t" /* Inc pointer into ref data */ + " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */ + + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " psadbw (%2), %%mm0 \n\t" + " movq (%1,%3), %%mm1 \n\t" /* take 8 bytes */ + " psadbw (%2,%4), %%mm1 \n\t" + " lea (%1,%3,2), %1 \n\t" /* Inc pointer into the new data */ + " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ + " lea (%2,%4,2), %2 \n\t" /* Inc pointer into ref data */ + " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */ + + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " psadbw (%2), %%mm0 \n\t" + " movq (%1,%3), %%mm1 \n\t" /* take 8 bytes */ + " psadbw (%2,%4), %%mm1 \n\t" + " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ + " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */ + + " movd %%mm7, %0 \n\t" + " emms \n\t" + + : "=r" (diff), + "+r" (src1), + "+r" (src2) + : "r" ((ptrdiff_t)(sstr1)), + "r" ((ptrdiff_t)(sstr2)) + : "memory" + ); + *dest = diff; +} +OIL_DEFINE_IMPL_FULL (sad8x8_u8_mmxext_3, sad8x8_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT); + +static void +sad8x8_u8_mmxext_4 (uint32_t * dest, uint8_t * src1, int sstr1, uint8_t * src2, + int sstr2) +{ + uint32_t diff; + + __asm__ __volatile__ ( + " movq (%1), %%mm6 \n\t" + " movq (%1,%3,1), %%mm7 \n\t" + " psadbw (%2), %%mm6 \n\t" + " psadbw (%2,%4,1), %%mm7 \n\t" + " movq (%1,%3,2), %%mm0 \n\t" + " movq (%1,%3,4), %%mm1 \n\t" + " psadbw (%2,%4,2), %%mm0 \n\t" + " psadbw (%2,%4,4), %%mm1 \n\t" + " paddw %%mm0, %%mm6 \n\t" + " paddw %%mm1, %%mm7 \n\t" + + " lea (%1,%3,8), %1 \n\t" + " lea (%2,%4,8), %2 \n\t" + " neg %3\n\t" + " neg %4\n\t" + " lea (%1,%3), %1 \n\t" + " lea (%2,%4), %2 \n\t" + + " movq (%1), %%mm0 \n\t" + " movq (%1,%3,1), %%mm1 \n\t" + " psadbw (%2), %%mm0 \n\t" + " psadbw (%2,%4,1), %%mm1 \n\t" + " paddw %%mm0, %%mm6 \n\t" + " paddw %%mm1, %%mm7 \n\t" + " movq (%1,%3,2), %%mm0 \n\t" + " movq (%1,%3,4), %%mm1 \n\t" + " psadbw (%2,%4,2), %%mm0 \n\t" + " psadbw (%2,%4,4), %%mm1 \n\t" + " paddw %%mm0, %%mm6 \n\t" + " paddw %%mm1, %%mm7 \n\t" + + " paddw %%mm6, %%mm7 \n\t" + " movd %%mm7, %0 \n\t" + + " emms \n\t" + + : "=r" (diff), + "+r" (src1), + "+r" (src2) + : "r" ((ptrdiff_t)(sstr1)), + "r" ((ptrdiff_t)(sstr2)) + : "memory" + ); + *dest = diff; +} +OIL_DEFINE_IMPL_FULL (sad8x8_u8_mmxext_4, sad8x8_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT); + +static void +sad8x8_8xn_u8_psadbw (uint32_t * dest, uint8_t * src1, int sstr1, uint8_t * src2, + int sstr2, int n) +{ + int n_tmp = n; + uint32_t *dest_tmp = dest; + uint8_t *src2_tmp = src2; + uint8_t *src1_tmp = src1; + + __asm__ __volatile__ ("\n" + "1:\n" + " movq (%[src1]), %%mm7 \n\t" + " psadbw (%[src2]), %%mm7 \n\t" + " movq (%[src1],%[sstr1],1), %%mm1 \n\t" + " psadbw (%[src2],%[sstr2],1), %%mm1 \n\t" + " paddw %%mm1, %%mm7 \n\t" + " movq (%[src1],%[sstr1],2), %%mm0 \n\t" + " psadbw (%[src2],%[sstr2],2), %%mm0 \n\t" + " paddw %%mm0, %%mm7 \n\t" + " movq (%[src1],%[sstr1],4), %%mm1 \n\t" + " psadbw (%[src2],%[sstr2],4), %%mm1 \n\t" + " paddw %%mm1, %%mm7 \n\t" + " movd %%mm7, 0(%[dest]) \n\t" + " add %[sstr2],%[src2]\n\t" + " add $4, %[dest]\n\t" + " decl %[n]\n\t" + " jnz 1b\n\t" + " emms \n\t" + : [src1] "+r" (src1_tmp), + [src2] "+r" (src2_tmp), + [n] "+m" (n_tmp), + [dest] "+r" (dest_tmp) + : [sstr1] "r" ((ptrdiff_t)(sstr1)), + [sstr2] "r" ((ptrdiff_t)(sstr2)) + : "memory" + ); + + src1 = OIL_OFFSET(src1, 7 * sstr1); + src2 = OIL_OFFSET(src2, 7 * sstr2); + sstr1 = -sstr1; + sstr2 = -sstr2; + + __asm__ __volatile__ ("\n" +#ifdef __i386__ + " pushl %%ebx\n\t" +#endif + "1:\n" + " movq (%[src1]), %%mm7 \n\t" + " psadbw (%[src2]), %%mm7 \n\t" + " movq (%[src1],%[sstr1],1), %%mm1 \n\t" + " psadbw (%[src2],%[sstr2],1), %%mm1 \n\t" + " paddw %%mm1, %%mm7 \n\t" + " movq (%[src1],%[sstr1],2), %%mm0 \n\t" + " psadbw (%[src2],%[sstr2],2), %%mm0 \n\t" + " paddw %%mm0, %%mm7 \n\t" + " movq (%[src1],%[sstr1],4), %%mm1 \n\t" + " psadbw (%[src2],%[sstr2],4), %%mm1 \n\t" + " paddw %%mm1, %%mm7 \n\t" + " movd %%mm7, %%ebx\n\t" + " addl %%ebx, 0(%[dest])\n\t" + " sub %[sstr2],%[src2]\n\t" + " add $4, %[dest]\n\t" + " decl %[n]\n\t" + " jnz 1b\n\t" +#ifdef __i386__ + " popl %%ebx\n\t" +#endif + + " emms \n\t" + : [src1] "+r" (src1), + [src2] "+r" (src2), + [dest] "+r" (dest), + [n] "+m" (n) + : [sstr1] "r" ((ptrdiff_t)(sstr1)), + [sstr2] "r" ((ptrdiff_t)(sstr2)) + : "memory" + ); +} +OIL_DEFINE_IMPL_FULL (sad8x8_8xn_u8_psadbw, sad8x8_8xn_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT); + + +static void +sad12x12_u8_mmxext (uint32_t * dest, uint8_t * src1, int sstr1, uint8_t * src2, + int sstr2) +{ + uint32_t diff; + + __asm__ __volatile__ ( + " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */ + " pxor %%mm2, %%mm2 \n\t" + " pxor %%mm3, %%mm3 \n\t" + +#define LOOP \ + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \ + " movq (%2), %%mm1 \n\t" \ + " psadbw %%mm1, %%mm0 \n\t" \ + " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \ + " movd 8(%1), %%mm2 \n\t" /* take 4 bytes */ \ + " movd 8(%2), %%mm3 \n\t" \ + " psadbw %%mm3, %%mm2 \n\t" \ + " paddw %%mm2, %%mm7 \n\t" /* accumulate difference... */ \ + " add %3, %1 \n\t" /* Inc pointer into the new data */ \ + " add %4, %2 \n\t" /* Inc pointer into ref data */ + + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP +#undef LOOP + + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " movq (%2), %%mm1 \n\t" + " psadbw %%mm1, %%mm0 \n\t" + " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ + " movd 8(%1), %%mm2 \n\t" /* take 4 bytes */ + " movd 8(%2), %%mm3 \n\t" + " psadbw %%mm3, %%mm2 \n\t" + " paddw %%mm2, %%mm7 \n\t" /* accumulate difference... */ + " movd %%mm7, %0 \n\t" + " emms \n\t" + + : "=r" (diff), + "+r" (src1), + "+r" (src2) + : "r" ((ptrdiff_t)(sstr1)), + "r" ((ptrdiff_t)(sstr2)) + : "memory" + ); + *dest = diff; +} +OIL_DEFINE_IMPL_FULL (sad12x12_u8_mmxext, sad12x12_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT); + +static void +sad16x16_u8_mmxext (uint32_t * dest, uint8_t * src1, int sstr1, uint8_t * src2, + int sstr2) +{ + uint32_t diff; + + __asm__ __volatile__ ( + " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */ + +#define LOOP \ + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \ + " movq (%2), %%mm1 \n\t" \ + " psadbw %%mm1, %%mm0 \n\t" \ + " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \ + " movq 8(%1), %%mm2 \n\t" /* take 8 bytes */ \ + " movq 8(%2), %%mm3 \n\t" \ + " psadbw %%mm3, %%mm2 \n\t" \ + " paddw %%mm2, %%mm7 \n\t" /* accumulate difference... */ \ + " add %3, %1 \n\t" /* Inc pointer into the new data */ \ + " add %4, %2 \n\t" /* Inc pointer into ref data */ + + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP + LOOP +#undef LOOP + + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " movq (%2), %%mm1 \n\t" + " psadbw %%mm1, %%mm0 \n\t" + " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ + " movq 8(%1), %%mm2 \n\t" /* take 8 bytes */ + " movq 8(%2), %%mm3 \n\t" + " psadbw %%mm3, %%mm2 \n\t" + " paddw %%mm2, %%mm7 \n\t" /* accumulate difference... */ + " movd %%mm7, %0 \n\t" + " emms \n\t" + + : "=r" (diff), + "+r" (src1), + "+r" (src2) + : "r" ((ptrdiff_t)(sstr1)), + "r" ((ptrdiff_t)(sstr2)) + : "memory" + ); + *dest = diff; +} +OIL_DEFINE_IMPL_FULL (sad16x16_u8_mmxext, sad16x16_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT); + |