author     David Schleef <ds@schleef.org>    2005-10-02 01:59:27 +0000
committer  David Schleef <ds@schleef.org>    2005-10-02 01:59:27 +0000
commit     1cc52aa34b431e333d22e53b81779a4efaee02f7 (patch)
tree       1a30e0aee2b7d69f7bdc6333a134f5088d46c93c /liboil/dct
parent     414196740a906a7fb35463418cf1f3996f8e3cc0 (diff)
download   liboil-1cc52aa34b431e333d22e53b81779a4efaee02f7.tar.gz
What have I done?!? Move files around.
* liboil/colorspace/Makefile.am:
* liboil/colorspace/argb_paint.c:
* liboil/colorspace/argb_paint_i386.c:
* liboil/colorspace/ayuv2argb.c:
* liboil/colorspace/ayuv2argb_i386.c:
* liboil/colorspace/composite.c:
* liboil/colorspace/composite_i386.c:
* liboil/colorspace/resample.c:
* liboil/colorspace/resample_powerpc.c:
* liboil/colorspace/rgb2bgr.c:
* liboil/colorspace/rgb2bgr_powerpc.c:
* liboil/colorspace/rgb2rgba.c:
* liboil/colorspace/rgb2rgba_powerpc.c:
* liboil/colorspace/yuv.c:
* liboil/conv/Makefile.am:
* liboil/conv/conv_powerpc.c:
* liboil/conv/conv_ref.c:
* liboil/copy/Makefile.am:
* liboil/copy/copy.c:
* liboil/copy/copy8x8.c:
* liboil/copy/copy8x8_i386.c:
* liboil/copy/copy_i386.c:
* liboil/copy/copy_powerpc.c:
* liboil/copy/permute.c:
* liboil/copy/splat_i386.c:
* liboil/copy/splat_powerpc.c:
* liboil/copy/splat_ref.c:
* liboil/copy/tablelookup_ref.c:
* liboil/copy/trans8x8.c:
* liboil/copy/trans8x8_i386.c:
* liboil/dct/Makefile.am:
* liboil/dct/fdct8x8theora_i386.c:
* liboil/dct/idct8x8_i386.c:
* liboil/i386/Makefile.am:
* liboil/jpeg/Makefile.am:
* liboil/jpeg/zigzag8x8_powerpc.c:
* liboil/md5/Makefile.am:
* liboil/md5/md5_i386.c:
* liboil/md5/md5_powerpc.c:
* liboil/powerpc/Makefile.am:
* liboil/powerpc/abs.c: (abs_u16_s16_a16_altivec):
* liboil/powerpc/clip.c: (clip_s16_ppcasm), (clip_s16_ppcasm2), (clip_s16_ppcasm3):
* liboil/powerpc/conv.c: (_sl_clipconv_S8_F32__powerpc_altivec), (_sl_clipconv_S16_F32__powerpc_altivec), (_sl_clipconvert_S32_F32__powerpc_altivec), (convert_s16_f64__powerpc), (_sl_convert_S16_F32__powerpc), (conv_f64_s16_altivec), (clipconv_s16_f64_ppcasm):
* liboil/powerpc/copy.c: (copy_u8_altivec), (copy_u8_altivec2):
* liboil/powerpc/md5.c: (md5_asm1), (md5_asm2), (md5_asm3):
* liboil/powerpc/mix.c: (mix_u8_a16_altivec):
* liboil/powerpc/multsum.c: (multsum_f32_ppcasm):
* liboil/powerpc/resample.c: (__attribute__), (merge_linear_argb_powerpc):
* liboil/powerpc/rgb2bgr.c: (rgb2bgr_ppc), (rgb2bgr_ppc2), (rgb2bgr_ppc3), (rgb2bgr_ppc4):
* liboil/powerpc/rgb2rgba.c: (rgb2rgba_powerpcasm):
* liboil/powerpc/sad8x8.c: (sad8x8_s16_a16_altivec), (sad8x8_s16_l15_a16_altivec):
* liboil/powerpc/splat.c: (splat_u8_ns_altivec), (splat_u8_ns_altivec2), (splat_u32_ns_altivec):
* liboil/powerpc/zigzag8x8.c: (__attribute__), (zigzag8x8_s16_a16_altivec):
* liboil/ref/Makefile.am:
* liboil/ref/argb_paint.c: (argb_paint_u8_ref):
* liboil/ref/ayuv2argb.c: (ayuv2argb_u8_ref):
* liboil/ref/composite.c: (composite_test), (composite_in_argb_ref), (composite_in_argb_const_src_ref), (composite_in_argb_const_mask_ref), (composite_over_argb_ref), (composite_over_argb_const_src_ref), (composite_add_argb_ref), (composite_add_argb_const_src_ref), (composite_in_over_argb_ref), (composite_in_over_argb_const_src_ref), (composite_in_over_argb_const_mask_ref), (composite_add_u8_ref), (composite_over_u8_ref):
* liboil/ref/conv.c:
* liboil/ref/copy.c: (copy_u8_ref):
* liboil/ref/copy8x8.c: (copy8x8_u8_ref):
* liboil/ref/permute.c: (permute_test):
* liboil/ref/resample.c: (resample_linear_u8_test), (resample_linear_argb_test), (resample_linear_u8_ref), (resample_linear_argb_ref), (merge_linear_argb_test), (merge_linear_argb_ref):
* liboil/ref/rgb.c: (rgb2bgr_ref), (rgb2rgba_ref):
* liboil/ref/splat.c: (splat_u8_ref), (splat_u32_ref), (splat_u8_ns_ref), (splat_u32_ns_ref):
* liboil/ref/tablelookup.c: (tablelookup_u8_ref):
* liboil/ref/trans8x8.c:
* liboil/ref/yuv.c: (yuyv2ayuv_ref), (yvyu2ayuv_ref), (uyvy2ayuv_ref), (ayuv2yuyv_ref), (ayuv2yvyu_ref), (ayuv2uyvy_ref):
* liboil/simdpack/Makefile.am:
* liboil/simdpack/abs_i386.c:
* liboil/simdpack/abs_powerpc.c:
* liboil/simdpack/clip_powerpc.c:
* liboil/simdpack/mix_powerpc.c:
* liboil/simdpack/mult8x8_i386.c:
* liboil/simdpack/multsum_powerpc.c:
* liboil/simdpack/sad8x8_powerpc.c:
* liboil/simdpack/scalarmult_i386.c:
* liboil/simdpack/vectoradd_s_i386.c:
Diffstat (limited to 'liboil/dct')
-rw-r--r--  liboil/dct/Makefile.am              14
-rw-r--r--  liboil/dct/fdct8x8theora_i386.c    358
-rw-r--r--  liboil/dct/idct8x8_i386.c          744
3 files changed, 0 insertions, 1116 deletions
diff --git a/liboil/dct/Makefile.am b/liboil/dct/Makefile.am
index 54d4374..eef1626 100644
--- a/liboil/dct/Makefile.am
+++ b/liboil/dct/Makefile.am
@@ -10,20 +10,6 @@ noinst_LTLIBRARIES = libdct.la $(opt_libs)
noinst_HEADERS = \
dct.h
-if HAVE_CPU_I386
-i386_sources = \
- idct8x8_i386.c \
- fdct8x8theora_i386.c
-else
-i386_sources =
-endif
-
-if HAVE_CPU_AMD64
-amd64_sources =
-else
-amd64_sources =
-endif
-
c_sources = \
dct12_f32.c \
dct36_f32.c \
diff --git a/liboil/dct/fdct8x8theora_i386.c b/liboil/dct/fdct8x8theora_i386.c
deleted file mode 100644
index 7d8bce3..0000000
--- a/liboil/dct/fdct8x8theora_i386.c
+++ /dev/null
@@ -1,358 +0,0 @@
-/*
- * LIBOIL - Library of Optimized Inner Loops
- * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
- * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
- * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*==========================================================================
- *
- * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
- * KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR
- * PURPOSE.
- *
- * Copyright (c) 1999 - 2001 On2 Technologies Inc. All Rights Reserved.
- *
- *--------------------------------------------------------------------------*/
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include <liboil/liboilfunction.h>
-#include <liboil/liboilfuncs.h>
-#include <liboil/dct/dct.h>
-#include <math.h>
-
-/* FIXME this causes problems on old gcc */
-static const __attribute__ ((aligned(8),used)) int64_t xC1S7 = 0x0fb15fb15fb15fb15LL;
-static const __attribute__ ((aligned(8),used)) int64_t xC2S6 = 0x0ec83ec83ec83ec83LL;
-static const __attribute__ ((aligned(8),used)) int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL;
-static const __attribute__ ((aligned(8),used)) int64_t xC4S4 = 0x0b505b505b505b505LL;
-static const __attribute__ ((aligned(8),used)) int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL;
-static const __attribute__ ((aligned(8),used)) int64_t xC6S2 = 0x061f861f861f861f8LL;
-static const __attribute__ ((aligned(8),used)) int64_t xC7S1 = 0x031f131f131f131f1LL;
-
-#if defined(__MINGW32__) || defined(__CYGWIN__) || \
- defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
-# define M(a) "_" #a
-#else
-# define M(a) #a
-#endif
-
-OIL_DECLARE_CLASS(fdct8x8theora);
-
-/* execute stage 1 of forward DCT */
-#define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp) \
- " movq " #ip0 ", %%mm0 \n\t" \
- " movq " #ip1 ", %%mm1 \n\t" \
- " movq " #ip3 ", %%mm2 \n\t" \
- " movq " #ip5 ", %%mm3 \n\t" \
- " movq %%mm0, %%mm4 \n\t" \
- " movq %%mm1, %%mm5 \n\t" \
- " movq %%mm2, %%mm6 \n\t" \
- " movq %%mm3, %%mm7 \n\t" \
- \
- " paddsw " #ip7 ", %%mm0 \n\t" /* mm0 = ip0 + ip7 = is07 */ \
- " paddsw " #ip2 ", %%mm1 \n\t" /* mm1 = ip1 + ip2 = is12 */ \
- " paddsw " #ip4 ", %%mm2 \n\t" /* mm2 = ip3 + ip4 = is34 */ \
- " paddsw " #ip6 ", %%mm3 \n\t" /* mm3 = ip5 + ip6 = is56 */ \
- " psubsw " #ip7 ", %%mm4 \n\t" /* mm4 = ip0 - ip7 = id07 */ \
- " psubsw " #ip2 ", %%mm5 \n\t" /* mm5 = ip1 - ip2 = id12 */ \
- \
- " psubsw %%mm2, %%mm0 \n\t" /* mm0 = is07 - is34 */ \
- \
- " paddsw %%mm2, %%mm2 \n\t" \
- \
- " psubsw " #ip4 ", %%mm6 \n\t" /* mm6 = ip3 - ip4 = id34 */ \
- \
- " paddsw %%mm0, %%mm2 \n\t" /* mm2 = is07 + is34 = is0734 */ \
- " psubsw %%mm3, %%mm1 \n\t" /* mm1 = is12 - is56 */ \
- " movq %%mm0," #temp " \n\t" /* Save is07 - is34 to free mm0; */ \
- " paddsw %%mm3, %%mm3 \n\t" \
- " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + 1s56 = is1256 */ \
- \
- " psubsw " #ip6 ", %%mm7 \n\t" /* mm7 = ip5 - ip6 = id56 */ \
- /* ------------------------------------------------------------------- */ \
- " psubsw %%mm7, %%mm5 \n\t" /* mm5 = id12 - id56 */ \
- " paddsw %%mm7, %%mm7 \n\t" \
- " paddsw %%mm5, %%mm7 \n\t" /* mm7 = id12 + id56 */ \
- /* ------------------------------------------------------------------- */ \
- " psubsw %%mm3, %%mm2 \n\t" /* mm2 = is0734 - is1256 */ \
- " paddsw %%mm3, %%mm3 \n\t" \
- \
- " movq %%mm2, %%mm0 \n\t" /* make a copy */ \
- " paddsw %%mm2, %%mm3 \n\t" /* mm3 = is0734 + is1256 */ \
- \
- " pmulhw "M(xC4S4)", %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */ \
- " paddw %%mm2, %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) */ \
- " psrlw $15, %%mm2 \n\t" \
- " paddw %%mm2, %%mm0 \n\t" /* Truncate mm0, now it is op[4] */ \
- \
- " movq %%mm3, %%mm2 \n\t" \
- " movq %%mm0," #ip4 " \n\t" /* save ip4, now mm0,mm2 are free */ \
- \
- " movq %%mm3, %%mm0 \n\t" \
- " pmulhw "M(xC4S4)", %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
- \
- " psrlw $15, %%mm2 \n\t" \
- " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \
- " paddw %%mm2, %%mm3 \n\t" /* Truncate mm3, now it is op[0] */ \
- \
- " movq %%mm3," #ip0 " \n\t" \
- /* ------------------------------------------------------------------- */ \
- " movq " #temp ", %%mm3 \n\t" /* mm3 = irot_input_y */ \
- " pmulhw "M(xC2S6)", %%mm3 \n\t" /* mm3 = xC2S6 * irot_input_y - irot_input_y */ \
- \
- " movq " #temp ", %%mm2 \n\t" \
- " movq %%mm2, %%mm0 \n\t" \
- \
- " psrlw $15, %%mm2 \n\t" /* mm3 = xC2S6 * irot_input_y */ \
- " paddw %%mm0, %%mm3 \n\t" \
- \
- " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
- " movq %%mm5, %%mm0 \n\t" \
- \
- " movq %%mm5, %%mm2 \n\t" \
- " pmulhw "M(xC6S2)", %%mm0 \n\t" /* mm0 = xC6S2 * irot_input_x */ \
- \
- " psrlw $15, %%mm2 \n\t" \
- " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \
- \
- " paddsw %%mm0, %%mm3 \n\t" /* ip[2] */ \
- " movq %%mm3," #ip2 " \n\t" /* Save ip2 */ \
- \
- " movq %%mm5, %%mm0 \n\t" \
- " movq %%mm5, %%mm2 \n\t" \
- \
- " pmulhw "M(xC2S6)", %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x - irot_input_x */ \
- " psrlw $15, %%mm2 \n\t" \
- \
- " movq " #temp ", %%mm3 \n\t" \
- " paddw %%mm0, %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x */ \
- \
- " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \
- " movq %%mm3, %%mm2 \n\t" \
- \
- " pmulhw "M(xC6S2)", %%mm3 \n\t" /* mm3 = xC6S2 * irot_input_y */ \
- " psrlw $15, %%mm2 \n\t" \
- \
- " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
- " psubsw %%mm5, %%mm3 \n\t" \
- \
- " movq %%mm3," #ip6 " \n\t" \
- /* ------------------------------------------------------------------- */ \
- " movq "M(xC4S4)", %%mm0 \n\t" \
- " movq %%mm1, %%mm2 \n\t" \
- " movq %%mm1, %%mm3 \n\t" \
- \
- " pmulhw %%mm0, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \
- " psrlw $15, %%mm2 \n\t" \
- \
- " paddw %%mm3, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) */ \
- " paddw %%mm2, %%mm1 \n\t" /* Truncate mm1, now it is icommon_product1 */ \
- \
- " movq %%mm7, %%mm2 \n\t" \
- " movq %%mm7, %%mm3 \n\t" \
- \
- " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \
- " psrlw $15, %%mm2 \n\t" \
- \
- " paddw %%mm3, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \
- " paddw %%mm2, %%mm7 \n\t" /* Truncate mm7, now it is icommon_product2 */ \
- /* ------------------------------------------------------------------- */ \
- " pxor %%mm0, %%mm0 \n\t" /* Clear mm0 */ \
- " psubsw %%mm6, %%mm0 \n\t" /* mm0 = - id34 */ \
- \
- " psubsw %%mm7, %%mm0 \n\t" /* mm0 = - ( id34 + idcommon_product2 ) */ \
- " paddsw %%mm6, %%mm6 \n\t" \
- " paddsw %%mm0, %%mm6 \n\t" /* mm6 = id34 - icommon_product2 */ \
- \
- " psubsw %%mm1, %%mm4 \n\t" /* mm4 = id07 - icommon_product1 */ \
- " paddsw %%mm1, %%mm1 \n\t" \
- " paddsw %%mm4, %%mm1 \n\t" /* mm1 = id07 + icommon_product1 */ \
- /* ------------------------------------------------------------------- */ \
- " movq "M(xC1S7)", %%mm7 \n\t" \
- " movq %%mm1, %%mm2 \n\t" \
- \
- " movq %%mm1, %%mm3 \n\t" \
- " pmulhw %%mm7, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - irot_input_x */ \
- \
- " movq "M(xC7S1)", %%mm7 \n\t" \
- " psrlw $15, %%mm2 \n\t" \
- \
- " paddw %%mm3, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x */ \
- " paddw %%mm2, %%mm1 \n\t" /* Trucated */ \
- \
- " pmulhw %%mm7, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x */ \
- " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
- \
- " movq %%mm0, %%mm5 \n\t" \
- " movq %%mm0, %%mm2 \n\t" \
- \
- " movq "M(xC1S7)", %%mm7 \n\t" \
- " pmulhw %%mm7, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y - irot_input_y */ \
- \
- " movq "M(xC7S1)", %%mm7 \n\t" \
- " psrlw $15, %%mm2 \n\t" \
- \
- " paddw %%mm5, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y */ \
- " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \
- \
- " pmulhw %%mm7, %%mm5 \n\t" /* mm5 = xC7S1 * irot_input_y */ \
- " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \
- \
- " psubsw %%mm5, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */ \
- " paddsw %%mm0, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = ip7 */ \
- \
- " movq %%mm1," #ip1 " \n\t" \
- " movq %%mm3," #ip7 " \n\t" \
- /* ------------------------------------------------------------------- */ \
- " movq "M(xC3S5)", %%mm0 \n\t" \
- " movq "M(xC5S3)", %%mm1 \n\t" \
- \
- " movq %%mm6, %%mm5 \n\t" \
- " movq %%mm6, %%mm7 \n\t" \
- \
- " movq %%mm4, %%mm2 \n\t" \
- " movq %%mm4, %%mm3 \n\t" \
- \
- " pmulhw %%mm0, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x - irot_input_x */ \
- " pmulhw %%mm1, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y - irot_input_y */ \
- \
- " psrlw $15, %%mm2 \n\t" \
- " psrlw $15, %%mm5 \n\t" \
- \
- " paddw %%mm3, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x */ \
- " paddw %%mm7, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y */ \
- \
- " paddw %%mm2, %%mm4 \n\t" /* Truncated */ \
- " paddw %%mm5, %%mm6 \n\t" /* Truncated */ \
- \
- " psubsw %%mm6, %%mm4 \n\t" /* ip3 */ \
- " movq %%mm4," #ip3 " \n\t" \
- \
- " movq %%mm3, %%mm4 \n\t" \
- " movq %%mm7, %%mm6 \n\t" \
- \
- " pmulhw %%mm1, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x - irot_input_x */ \
- " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y - irot_input_y */ \
- \
- " paddw %%mm2, %%mm4 \n\t" \
- " paddw %%mm5, %%mm6 \n\t" \
- \
- " paddw %%mm4, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x */ \
- " paddw %%mm6, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y */ \
- \
- " paddw %%mm7, %%mm3 \n\t" /* ip5 */ \
- " movq %%mm3," #ip5 " \n\t"
-
-#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7, \
- op0,op1,op2,op3,op4,op5,op6,op7) \
- " movq " #ip0 ", %%mm0 \n\t" /* mm0 = a0 a1 a2 a3 */ \
- " movq " #ip4 ", %%mm4 \n\t" /* mm4 = e4 e5 e6 e7 */ \
- " movq " #ip1 ", %%mm1 \n\t" /* mm1 = b0 b1 b2 b3 */ \
- " movq " #ip5 ", %%mm5 \n\t" /* mm5 = f4 f5 f6 f7 */ \
- " movq " #ip2 ", %%mm2 \n\t" /* mm2 = c0 c1 c2 c3 */ \
- " movq " #ip6 ", %%mm6 \n\t" /* mm6 = g4 g5 g6 g7 */ \
- " movq " #ip3 ", %%mm3 \n\t" /* mm3 = d0 d1 d2 d3 */ \
- " movq %%mm1," #op1 " \n\t" /* save b0 b1 b2 b3 */ \
- " movq " #ip7 ", %%mm7 \n\t" /* mm7 = h0 h1 h2 h3 */ \
- /* Transpose 2x8 block */ \
- " movq %%mm4, %%mm1 \n\t" /* mm1 = e3 e2 e1 e0 */ \
- " punpcklwd %%mm5, %%mm4 \n\t" /* mm4 = f1 e1 f0 e0 */ \
- " movq %%mm0," #op0 " \n\t" /* save a3 a2 a1 a0 */ \
- " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */ \
- " movq %%mm6, %%mm0 \n\t" /* mm0 = g3 g2 g1 g0 */ \
- " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */ \
- " movq %%mm4, %%mm5 \n\t" /* mm5 = f1 e1 f0 e0 */ \
- " punpckldq %%mm6, %%mm4 \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \
- " punpckhdq %%mm6, %%mm5 \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \
- " movq %%mm1, %%mm6 \n\t" /* mm6 = f3 e3 f2 e2 */ \
- " movq %%mm4," #op4 " \n\t" \
- " punpckhwd %%mm7, %%mm0 \n\t" /* mm0 = h3 g3 h2 g2 */ \
- " movq %%mm5," #op5 " \n\t" \
- " punpckhdq %%mm0, %%mm6 \n\t" /* mm6 = h3 g3 f3 e3 = MM7 */ \
- " movq " #op0 ", %%mm4 \n\t" /* mm4 = a3 a2 a1 a0 */ \
- " punpckldq %%mm0, %%mm1 \n\t" /* mm1 = h2 g2 f2 e2 = MM6 */ \
- " movq " #op1 ", %%mm5 \n\t" /* mm5 = b3 b2 b1 b0 */ \
- " movq %%mm4, %%mm0 \n\t" /* mm0 = a3 a2 a1 a0 */ \
- " movq %%mm6," #op7 " \n\t" \
- " punpcklwd %%mm5, %%mm0 \n\t" /* mm0 = b1 a1 b0 a0 */ \
- " movq %%mm1," #op6 " \n\t" \
- " punpckhwd %%mm5, %%mm4 \n\t" /* mm4 = b3 a3 b2 a2 */ \
- " movq %%mm2, %%mm5 \n\t" /* mm5 = c3 c2 c1 c0 */ \
- " punpcklwd %%mm3, %%mm2 \n\t" /* mm2 = d1 c1 d0 c0 */ \
- " movq %%mm0, %%mm1 \n\t" /* mm1 = b1 a1 b0 a0 */ \
- " punpckldq %%mm2, %%mm0 \n\t" /* mm0 = d0 c0 b0 a0 = MM0 */ \
- " punpckhdq %%mm2, %%mm1 \n\t" /* mm1 = d1 c1 b1 a1 = MM1 */ \
- " movq %%mm4, %%mm2 \n\t" /* mm2 = b3 a3 b2 a2 */ \
- " movq %%mm0," #op0 " \n\t" \
- " punpckhwd %%mm3, %%mm5 \n\t" /* mm5 = d3 c3 d2 c2 */ \
- " movq %%mm1," #op1 " \n\t" \
- " punpckhdq %%mm5, %%mm4 \n\t" /* mm4 = d3 c3 b3 a3 = MM3 */ \
- " punpckldq %%mm5, %%mm2 \n\t" /* mm2 = d2 c2 b2 a2 = MM2 */ \
- " movq %%mm4," #op3 " \n\t" \
- " movq %%mm2," #op2 " \n\t"
-
-
-static void
-fdct8x8theora_mmx(int16_t *src, int16_t *dest)
-{
- int64_t __attribute__((aligned(8))) align_tmp[16];
- int16_t *const temp= (int16_t*)align_tmp;
-
- __asm__ __volatile__ (
- " .balign 16 \n\t"
- /*
- * Input data is an 8x8 block. To make processing of the data more efficient
- * we will transpose the block of data to two 4x8 blocks???
- */
- Transpose_mmx ( (%0), 16(%0), 32(%0), 48(%0), 8(%0), 24(%0), 40(%0), 56(%0),
- (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1))
- Fdct_mmx ( (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1), (%2))
-
- Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
- 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
- Fdct_mmx (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
-
- Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
- 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
- Fdct_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))
-
- Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
- 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
- Fdct_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
-
- " emms \n\t"
-
- : "+r" (src),
- "+r" (dest)
- : "r" (temp)
- : "memory"
- );
-}
-
-OIL_DEFINE_IMPL_FULL (fdct8x8theora_mmx, fdct8x8theora, OIL_IMPL_FLAG_MMX);
-
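
The deleted Fdct_mmx macro above relies on one recurring idiom: each constant (xC1S7 through xC7S1) holds round(cos(k*pi/16) * 65536) replicated across the four 16-bit lanes. Because those values do not fit in a signed 16-bit word, pmulhw effectively multiplies by (C - 65536); the code then adds the input back to recover (x*C) >> 16, and adds the sign bit obtained with "psrlw $15" so the result truncates toward zero rather than toward minus infinity, which is what the "Truncate" comments refer to. A minimal scalar sketch of that idiom, assuming only what the assembly itself shows (the helper name is illustrative, not a liboil symbol):

#include <stdint.h>

/* Scalar model of the "pmulhw + add-back + sign bit" sequence in Fdct_mmx.
 * C is the constant scaled by 2^16, e.g. xC4S4 ~ 0.70710678 * 65536.     */
static int16_t fixed_mul_q16 (int16_t x, int32_t C)
{
  /* pmulhw sees the stored word as signed, i.e. as (C - 65536); taking
   * the high word of the product is an arithmetic shift right by 16.     */
  int32_t high = ((int32_t) x * (int16_t)(C - 65536)) >> 16;
  int32_t r = high + x;           /* add-back: r is now (x*C) >> 16       */
  r += ((uint16_t) x) >> 15;      /* add sign bit: truncate toward zero   */
  return (int16_t) r;
}

With C set to the xC4S4 value this yields approximately x * 0.7071, which is how the macro turns the even sums is0734 +/- is1256 into op[0] and op[4].
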
diff --git a/liboil/dct/idct8x8_i386.c b/liboil/dct/idct8x8_i386.c
deleted file mode 100644
index e8a88c2..0000000
--- a/liboil/dct/idct8x8_i386.c
+++ /dev/null
@@ -1,744 +0,0 @@
-/*
- * LIBOIL - Library of Optimized Inner Loops
- * Copyright (c) 2004 David A. Schleef <ds@schleef.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
- * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
- * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include <liboil/liboil.h>
-#include <liboil/dct/dct.h>
-#include <math.h>
-
-
-OIL_DECLARE_CLASS (idct8x8_s16);
-OIL_DECLARE_CLASS (dct8x8_s16);
-
-#define CONST(x) (32768.0*(x) + 0.5)
-
-#define C1_0000 (32767)
-#define C0_9808 CONST(0.980785280)
-#define C0_9239 CONST(0.923879532)
-#define C0_8315 CONST(0.831469612)
-#define C0_7071 CONST(0.707106781)
-#define C0_5556 CONST(0.555570233)
-#define C0_3827 CONST(0.382683432)
-#define C0_1951 CONST(0.195090322)
-
-#define FOUR(x) { x, x, x, x }
-#define MMX_CONST(x) {32768.0*(x) + 0.5,32768.0*(x) + 0.5,32768.0*(x) + 0.5,32768.0*(x) + 0.5}
-
-static const int16_t
-dct_mmx_constants [][4] = {
- FOUR(0),
- FOUR(C0_9808),
- FOUR(C0_9239),
- FOUR(C0_8315),
- FOUR(C0_7071),
- FOUR(C0_5556),
- FOUR(C0_3827),
- FOUR(C0_1951),
- { 1, 1, -1, -1 }, // 64
- { 1, -1, 1, -1 },
- { C1_0000, C0_9239, C0_7071, C0_3827 }, // 80
- { C1_0000, C0_3827, C0_7071, C0_9239 }, // 88
- { C0_9808, C0_8315, C0_5556, C0_1951 }, // 96
- { C0_8315, C0_1951, C0_9808, C0_5556 }, // 104
- { 1, -1, -1, -1 },
- { C0_5556, C0_9808, C0_1951, C0_8315 }, // 120
- { 1, -1, 1, 1 },
- { C0_1951, C0_5556, C0_8315, C0_9808 }, // 136
- { 1, -1, 1, -1 },
- FOUR(CONST(0.5)), //152
- { C0_7071, C0_9239, C0_7071, C0_3827 }, // 160
- { C0_7071, C0_3827, C0_7071, C0_9239 }, // 168
-};
-
-static void
-idct8x8_s16_mmx (int16_t *dest, int dstr, int16_t *src, int sstr)
-{
- int32_t tmp[32];
- int32_t save_ebx;
-
- asm volatile (
- " movl %%ebx, %6 \n"
- /* left half */
- " movl %1, %%eax \n" // src
- " movl %3, %%ebx \n" // sstr
- " leal (%%eax,%%ebx,4),%%ecx \n" // src + sstr * 4
-
- " movq (%%eax), %%mm0 \n"
- " movq (%%eax), %%mm1 \n"
- " paddsw (%%ecx), %%mm0 \n" // ss07s34
- " psubsw (%%ecx), %%mm1 \n" // ss16s25
- " pmulhw 32(%5), %%mm0 \n" // .7071
- " pmulhw 32(%5), %%mm1 \n" // .7071
-
- " movq (%%eax,%%ebx,2), %%mm2 \n"
- " movq (%%eax,%%ebx,2), %%mm3 \n"
- " movq (%%ecx,%%ebx,2), %%mm4 \n"
- " movq (%%ecx,%%ebx,2), %%mm5 \n"
- " pmulhw 16(%5), %%mm2 \n" // .9239
- " pmulhw 48(%5), %%mm3 \n" // .3827
- " pmulhw 48(%5), %%mm4 \n" // .3827
- " pmulhw 16(%5), %%mm5 \n" // .9239
- " paddsw %%mm4, %%mm2 \n" // ds07s34
- " psubsw %%mm5, %%mm3 \n" // ds16s25
-
- " movq %%mm0, %%mm4 \n"
- " movq %%mm1, %%mm5 \n"
- " paddsw %%mm2, %%mm0 \n" // s07
- " psubsw %%mm2, %%mm4 \n" // s34
- " paddsw %%mm3, %%mm1 \n" // s16
- " psubsw %%mm3, %%mm5 \n" // s25
-
- " movq %%mm0, 0(%4) \n"
- " movq %%mm1, 8(%4) \n"
- " movq %%mm5, 16(%4) \n"
- " movq %%mm4, 24(%4) \n"
-
- " addl %3, %%eax \n"
- " addl %3, %%ecx \n"
-
- " movq (%%eax), %%mm0 \n"
- " pmulhw 8(%5), %%mm0 \n"
- " movq (%%eax,%%ebx,2), %%mm1 \n"
- " pmulhw 24(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm0 \n"
- " movq (%%ecx), %%mm1 \n"
- " pmulhw 40(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm0 \n"
- " movq (%%ecx,%%ebx,2), %%mm1 \n"
- " pmulhw 56(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm0 \n" // d07
-
- " movq (%%eax), %%mm2 \n"
- " pmulhw 24(%5), %%mm2 \n"
- " movq (%%eax,%%ebx,2), %%mm1 \n"
- " pmulhw 56(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm2 \n"
- " movq (%%ecx), %%mm1 \n"
- " pmulhw 8(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm2 \n"
- " movq (%%ecx,%%ebx,2), %%mm1 \n"
- " pmulhw 40(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm2 \n" // d16
-
- " movq (%%eax), %%mm3 \n"
- " pmulhw 40(%5), %%mm3 \n"
- " movq (%%eax,%%ebx,2), %%mm1 \n"
- " pmulhw 8(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm3 \n"
- " movq (%%ecx), %%mm1 \n"
- " pmulhw 56(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm3 \n"
- " movq (%%ecx,%%ebx,2), %%mm1 \n"
- " pmulhw 24(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm3 \n" // d25
-
- " movq (%%eax), %%mm4 \n"
- " pmulhw 56(%5), %%mm4 \n"
- " movq (%%eax,%%ebx,2), %%mm1 \n"
- " pmulhw 40(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm4 \n"
- " movq (%%ecx), %%mm1 \n"
- " pmulhw 24(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm4 \n"
- " movq (%%ecx,%%ebx,2), %%mm1 \n"
- " pmulhw 8(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm4 \n" // d34
-
- " movl %0, %%eax \n" // dest
- " movl %2, %%ebx \n" // dstr
- " leal (%%ebx, %%ebx, 2), %%edx \n" // dstr*3
-
- " movq %%mm0, %%mm1 \n"
- " paddsw 0(%4), %%mm1 \n"
- " movq %%mm1, (%%eax) \n"
-
- " movq %%mm2, %%mm1 \n"
- " paddsw 8(%4), %%mm1 \n"
- " movq %%mm1, (%%eax, %%ebx, 1) \n"
-
- " movq %%mm3, %%mm1 \n"
- " paddsw 16(%4), %%mm1 \n"
- " movq %%mm1, (%%eax, %%ebx, 2) \n" // s25 + d25
-
- " movq %%mm4, %%mm1 \n"
- " paddsw 24(%4), %%mm1 \n"
- " movq %%mm1, (%%eax, %%edx, 1) \n"
-
- " leal (%%eax, %%ebx, 4), %%eax \n"
- " movq 24(%4), %%mm1 \n"
- " psubsw %%mm4, %%mm1 \n"
- " movq %%mm1, (%%eax) \n"
-
- " movq 16(%4), %%mm1 \n"
- " psubsw %%mm3, %%mm1 \n"
- " movq %%mm1, (%%eax, %%ebx, 1) \n"
-
- " movq 8(%4), %%mm1 \n"
- " psubsw %%mm2, %%mm1 \n"
- " movq %%mm1, (%%eax, %%ebx, 2) \n"
-
- " movq 0(%4), %%mm1 \n"
- " psubsw %%mm0, %%mm1 \n"
- " movq %%mm1, (%%eax, %%edx, 1) \n"
-
- /* right half */
- " movl %1, %%eax \n" // src
- " movl %3, %%ebx \n" // sstr
- " leal (%%eax,%%ebx,4),%%ecx \n" // src + sstr * 4
-
- " movq 8(%%eax), %%mm0 \n"
- " movq 8(%%eax), %%mm1 \n"
- " paddsw 8(%%ecx), %%mm0 \n" // ss07s34
- " psubsw 8(%%ecx), %%mm1 \n" // ss16s25
- " pmulhw 32(%5), %%mm0 \n" // .7071
- " pmulhw 32(%5), %%mm1 \n" // .7071
-
- " movq 8(%%eax,%%ebx,2), %%mm2 \n"
- " movq 8(%%eax,%%ebx,2), %%mm3 \n"
- " movq 8(%%ecx,%%ebx,2), %%mm4 \n"
- " movq 8(%%ecx,%%ebx,2), %%mm5 \n"
- " pmulhw 16(%5), %%mm2 \n" // .9239
- " pmulhw 48(%5), %%mm3 \n" // .3827
- " pmulhw 48(%5), %%mm4 \n" // .3827
- " pmulhw 16(%5), %%mm5 \n" // .9239
- " paddsw %%mm4, %%mm2 \n" // ds07s34
- " psubsw %%mm5, %%mm3 \n" // ds16s25
-
- " movq %%mm0, %%mm4 \n"
- " movq %%mm1, %%mm5 \n"
- " paddsw %%mm2, %%mm0 \n" // s07
- " psubsw %%mm2, %%mm4 \n" // s34
- " paddsw %%mm3, %%mm1 \n" // s16
- " psubsw %%mm3, %%mm5 \n" // s25
-
- " movq %%mm0, 0(%4) \n"
- " movq %%mm1, 8(%4) \n"
- " movq %%mm5, 16(%4) \n"
- " movq %%mm4, 24(%4) \n"
-
- " addl %3, %%eax \n"
- " addl %3, %%ecx \n"
-
- " movq 8(%%eax), %%mm0 \n"
- " pmulhw 8(%5), %%mm0 \n"
- " movq 8(%%eax,%%ebx,2), %%mm1 \n"
- " pmulhw 24(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm0 \n"
- " movq 8(%%ecx), %%mm1 \n"
- " pmulhw 40(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm0 \n"
- " movq 8(%%ecx,%%ebx,2), %%mm1 \n"
- " pmulhw 56(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm0 \n" // d07
-
- " movq 8(%%eax), %%mm2 \n"
- " pmulhw 24(%5), %%mm2 \n"
- " movq 8(%%eax,%%ebx,2), %%mm1 \n"
- " pmulhw 56(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm2 \n"
- " movq 8(%%ecx), %%mm1 \n"
- " pmulhw 8(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm2 \n"
- " movq 8(%%ecx,%%ebx,2), %%mm1 \n"
- " pmulhw 40(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm2 \n" // d16
-
- " movq 8(%%eax), %%mm3 \n"
- " pmulhw 40(%5), %%mm3 \n"
- " movq 8(%%eax,%%ebx,2), %%mm1 \n"
- " pmulhw 8(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm3 \n"
- " movq 8(%%ecx), %%mm1 \n"
- " pmulhw 56(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm3 \n"
- " movq 8(%%ecx,%%ebx,2), %%mm1 \n"
- " pmulhw 24(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm3 \n" // d25
-
- " movq 8(%%eax), %%mm4 \n"
- " pmulhw 56(%5), %%mm4 \n"
- " movq 8(%%eax,%%ebx,2), %%mm1 \n"
- " pmulhw 40(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm4 \n"
- " movq 8(%%ecx), %%mm1 \n"
- " pmulhw 24(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm4 \n"
- " movq 8(%%ecx,%%ebx,2), %%mm1 \n"
- " pmulhw 8(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm4 \n" // d34
-
- " movl %0, %%eax \n" // dest
- " movl %2, %%ebx \n" // dstr
- " leal (%%ebx, %%ebx, 2), %%edx \n" // dstr*3
-
- " movq %%mm0, %%mm1 \n"
- " paddsw 0(%4), %%mm1 \n"
- " movq %%mm1, 8(%%eax) \n"
-
- " movq %%mm2, %%mm1 \n"
- " paddsw 8(%4), %%mm1 \n"
- " movq %%mm1, 8(%%eax, %%ebx, 1) \n"
-
- " movq %%mm3, %%mm1 \n"
- " paddsw 16(%4), %%mm1 \n"
- " movq %%mm1, 8(%%eax, %%ebx, 2) \n" // s25 + d25
-
- " movq %%mm4, %%mm1 \n"
- " paddsw 24(%4), %%mm1 \n"
- " movq %%mm1, 8(%%eax, %%edx, 1) \n"
-
- " leal (%%eax, %%ebx, 4), %%eax \n"
- " movq 24(%4), %%mm1 \n"
- " psubsw %%mm4, %%mm1 \n"
- " movq %%mm1, 8(%%eax) \n"
-
- " movq 16(%4), %%mm1 \n"
- " psubsw %%mm3, %%mm1 \n"
- " movq %%mm1, 8(%%eax, %%ebx, 1) \n"
-
- " movq 8(%4), %%mm1 \n"
- " psubsw %%mm2, %%mm1 \n"
- " movq %%mm1, 8(%%eax, %%ebx, 2) \n"
-
- " movq 0(%4), %%mm1 \n"
- " psubsw %%mm0, %%mm1 \n"
- " movq %%mm1, 8(%%eax, %%edx, 1) \n"
-
-
- /* rows */
- " movl %0, %%eax \n" /* dest */
-#define LOOP \
- " pshufw $0x88, 0(%%eax), %%mm0 \n" /* x0 x2 x0 x2 */ \
- " pshufw $0x88, 8(%%eax), %%mm1 \n" /* x4 x6 x4 x6 */ \
- " pmulhw 160(%5), %%mm0 \n" /* 0.707 0.9239 0.707 0.3827 */ \
- " pmulhw 168(%5), %%mm1 \n" /* 0.707 0.3827 0.707 0.9239 */ \
- " pmullw 64(%5), %%mm1 \n" /* 1 1 -1 -1 */ \
- " paddsw %%mm1, %%mm0 \n" /* ss07s34 ds07s34 ss16s25 ds16s25 */ \
- \
- " pshufw $0xa0, %%mm0, %%mm1 \n" /* ss07s34 ss07s34 ss16s25 ss16s25 */ \
- " pshufw $0xf5, %%mm0, %%mm2 \n" /* ds07s34 ds07s34 ds16s25 ds16s25 */ \
- " pmullw 72(%5), %%mm2 \n" /* 1 -1 1 -1 */ \
- " paddsw %%mm2, %%mm1 \n" /* s07 s34 s16 s25 */ \
- " pshufw $0x78, %%mm1, %%mm2 \n" /* s07 s16 s25 s34 */ \
- \
- " pshufw $0x55, 0(%%eax), %%mm0 \n" \
- " pmulhw 96(%5), %%mm0 \n" \
- " pshufw $0xff, 0(%%eax), %%mm1 \n" \
- " pmulhw 104(%5), %%mm1 \n" \
- " pmullw 112(%5), %%mm1 \n" \
- " paddsw %%mm1, %%mm0 \n" \
- " pshufw $0x55, 8(%%eax), %%mm1 \n" \
- " pmulhw 120(%5), %%mm1 \n" \
- " pmullw 128(%5), %%mm1 \n" \
- " paddsw %%mm1, %%mm0 \n" \
- " pshufw $0xff, 8(%%eax), %%mm1 \n" \
- " pmulhw 136(%5), %%mm1 \n" \
- " pmullw 144(%5), %%mm1 \n" \
- " paddsw %%mm1, %%mm0 \n" \
- \
- " movq %%mm2, %%mm1 \n" \
- " paddsw %%mm0, %%mm1 \n" \
- " psubsw %%mm0, %%mm2 \n" \
- " pshufw $0x1b, %%mm2, %%mm2 \n" \
- \
- " movq %%mm1, 0(%%eax) \n" \
- " movq %%mm2, 8(%%eax) \n" \
- " addl %3, %%eax \n"
-
- LOOP
- LOOP
- LOOP
- LOOP
- LOOP
- LOOP
- LOOP
- LOOP
-#undef LOOP
-
- " movl %6, %%ebx \n"
- " emms \n"
- :
- : "m" (dest), "m" (src), "m" (dstr), "m" (sstr), "r" (tmp), "r" (dct_mmx_constants), "m" (save_ebx)
- : "eax", "ecx", "edx");
-}
-OIL_DEFINE_IMPL_FULL (idct8x8_s16_mmx, idct8x8_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
-
-#if 0
-#define CONST(x) (32768.0*(x) + 0.5)
-
-#define C1_0000 (32767)
-#define C0_9808 CONST(0.980785280)
-#define C0_9239 CONST(0.923879532)
-#define C0_8315 CONST(0.831469612)
-#define C0_7071 CONST(0.707106781)
-#define C0_5556 CONST(0.555570233)
-#define C0_3827 CONST(0.382683432)
-#define C0_1951 CONST(0.195090322)
-
-#define FOUR(x) { x, x, x, x }
-#define MMX_CONST(x) {32768.0*(x) + 0.5,32768.0*(x) + 0.5,32768.0*(x) + 0.5,32768.0*(x) + 0.5}
-
-static const int16_t
-dct_mmx_constants [][4] = {
- FOUR(0),
- FOUR(C0_9808),
- FOUR(C0_9239),
- FOUR(C0_8315),
- FOUR(C0_7071),
- FOUR(C0_5556),
- FOUR(C0_3827),
- FOUR(C0_1951),
- { 1, 1, -1, -1 }, // 64
- { 1, -1, 1, -1 },
- { C1_0000, C0_9239, C0_7071, C0_3827 }, // 80
- { C1_0000, C0_3827, C0_7071, C0_9239 }, // 88
- { C0_9808, C0_8315, C0_5556, C0_1951 }, // 96
- { C0_8315, C0_1951, C0_9808, C0_5556 }, // 104
- { 1, -1, -1, -1 },
- { C0_5556, C0_9808, C0_1951, C0_8315 }, // 120
- { 1, -1, 1, 1 },
- { C0_1951, C0_5556, C0_8315, C0_9808 }, // 136
- { 1, -1, 1, -1 },
-};
-#endif
-
-/* a 3dnow version can use pmulhrw instead of pmulhw for increased
- * accuracy */
-static void
-fdct8x8s_s16_mmx (uint16_t *dest, int dstr, uint16_t *src, int sstr)
-{
- int32_t tmp[32];
- int32_t save_ebx;
-
- asm volatile (
- " movl %%ebx, %6 \n"
- /* first half */
- " movl %1, %%eax \n" // src
- " movl %3, %%ebx \n" // sstr
- " leal (%%ebx,%%ebx,2),%%edx \n" // sstr * 3
- " leal (%%eax,%%ebx,4),%%ecx \n" // src + sstr * 4
-
- " movq (%%eax), %%mm0 \n"
- " movq (%%eax), %%mm1 \n"
- " paddsw (%%ecx,%%edx,1), %%mm0 \n" // s07
- " psubsw (%%ecx,%%edx,1), %%mm1 \n" // d07
- " movq %%mm1, (%4) \n"
-
- " movq (%%eax,%%ebx), %%mm2 \n"
- " movq (%%eax,%%ebx), %%mm3 \n"
- " paddsw (%%ecx,%%ebx,2), %%mm2 \n" // s16
- " psubsw (%%ecx,%%ebx,2), %%mm3 \n" // d16
- " movq %%mm3, 8(%4) \n"
-
- " movq (%%eax,%%ebx,2), %%mm1 \n"
- " movq (%%eax,%%ebx,2), %%mm4 \n"
- " paddsw (%%ecx,%%ebx), %%mm1 \n" // s25
- " psubsw (%%ecx,%%ebx), %%mm4 \n" // d25
- " movq %%mm4, 16(%4) \n"
-
- " movq (%%eax,%%edx), %%mm3 \n"
- " movq (%%eax,%%edx), %%mm5 \n"
- " paddsw (%%ecx), %%mm3 \n" // s34
- " psubsw (%%ecx), %%mm5 \n" // d34
- " movq %%mm5, 24(%4) \n"
-
- " movq %%mm0, %%mm4 \n"
- " paddsw %%mm3, %%mm0 \n" // ss07s34
- " psubsw %%mm3, %%mm4 \n" // ds07s34
-
- " movq %%mm2, %%mm5 \n"
- " paddsw %%mm1, %%mm2 \n" // ss16s25
- " psubsw %%mm1, %%mm5 \n" // ds16s25
-
- " movq %%mm0, %%mm1 \n"
- " paddsw %%mm2, %%mm1 \n"
- " pmulhw 32(%5), %%mm1 \n" // .7071
- " psubsw %%mm2, %%mm0 \n"
- " pmulhw 32(%5), %%mm0 \n" // .7071
-
- " movq %%mm4, %%mm2 \n"
- " pmulhw 16(%5), %%mm2 \n"
- " movq %%mm5, %%mm6 \n"
- " pmulhw 48(%5), %%mm6 \n"
- " paddsw %%mm6, %%mm2 \n" //
-
- " pmulhw 48(%5), %%mm4 \n"
- " pmulhw 16(%5), %%mm5 \n"
- " psubsw %%mm5, %%mm4 \n" //
-
- " movl %0, %%eax \n" // dest
- " movl %2, %%ebx \n" // dstr
- " add %%ebx, %%ebx \n"
- " leal (%%ebx,%%ebx,2),%%edx \n" // dstr * 3
- " movq %%mm1, 0(%%eax) \n"
- " movq %%mm2, 0(%%eax,%%ebx) \n"
- " movq %%mm0, 0(%%eax,%%ebx,2) \n"
- " movq %%mm4, 0(%%eax,%%edx) \n"
-
- " add %2, %%eax \n"
- " movq 0(%4), %%mm0 \n"
- " pmulhw 8(%5), %%mm0 \n"
- " movq 8(%4), %%mm1 \n"
- " pmulhw 24(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm0 \n"
- " movq 16(%4), %%mm1 \n"
- " pmulhw 40(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm0 \n"
- " movq 24(%4), %%mm1 \n"
- " pmulhw 56(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm0 \n"
- " movq %%mm0, (%%eax) \n"
-
- " movq 0(%4), %%mm0 \n"
- " pmulhw 24(%5), %%mm0 \n"
- " movq 8(%4), %%mm1 \n"
- " pmulhw 56(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm0 \n"
- " movq 16(%4), %%mm1 \n"
- " pmulhw 8(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm0 \n"
- " movq 24(%4), %%mm1 \n"
- " pmulhw 40(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm0 \n"
- " movq %%mm0, (%%eax,%%ebx) \n"
-
- " movq 0(%4), %%mm0 \n"
- " pmulhw 40(%5), %%mm0 \n"
- " movq 8(%4), %%mm1 \n"
- " pmulhw 8(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm0 \n"
- " movq 16(%4), %%mm1 \n"
- " pmulhw 56(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm0 \n"
- " movq 24(%4), %%mm1 \n"
- " pmulhw 24(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm0 \n"
- " movq %%mm0, (%%eax,%%ebx,2) \n"
-
- " movq 0(%4), %%mm0 \n"
- " pmulhw 56(%5), %%mm0 \n"
- " movq 8(%4), %%mm1 \n"
- " pmulhw 40(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm0 \n"
- " movq 16(%4), %%mm1 \n"
- " pmulhw 24(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm0 \n"
- " movq 24(%4), %%mm1 \n"
- " pmulhw 8(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm0 \n"
- " movq %%mm0, (%%eax,%%edx) \n"
-
- /* second half */
-
- " movl %1, %%eax \n" // src
- " add $8, %%eax \n"
- " movl %3, %%ebx \n" // sstr
- " leal (%%ebx,%%ebx,2),%%edx \n" // sstr * 3
- " leal (%%eax,%%ebx,4),%%ecx \n" // src + sstr * 4
-
- " movq (%%eax), %%mm0 \n"
- " movq (%%eax), %%mm1 \n"
- " paddsw (%%ecx,%%edx,1), %%mm0 \n" // s07
- " psubsw (%%ecx,%%edx,1), %%mm1 \n" // d07
- " movq %%mm1, (%4) \n"
-
- " movq (%%eax,%%ebx), %%mm2 \n"
- " movq (%%eax,%%ebx), %%mm3 \n"
- " paddsw (%%ecx,%%ebx,2), %%mm2 \n" // s16
- " psubsw (%%ecx,%%ebx,2), %%mm3 \n" // d16
- " movq %%mm3, 8(%4) \n"
-
- " movq (%%eax,%%ebx,2), %%mm1 \n"
- " movq (%%eax,%%ebx,2), %%mm4 \n"
- " paddsw (%%ecx,%%ebx), %%mm1 \n" // s25
- " psubsw (%%ecx,%%ebx), %%mm4 \n" // d25
- " movq %%mm4, 16(%4) \n"
-
- " movq (%%eax,%%edx), %%mm3 \n"
- " movq (%%eax,%%edx), %%mm5 \n"
- " paddsw (%%ecx), %%mm3 \n" // s34
- " psubsw (%%ecx), %%mm5 \n" // d34
- " movq %%mm5, 24(%4) \n"
-
- " movq %%mm0, %%mm4 \n"
- " paddsw %%mm3, %%mm0 \n" // ss07s34
- " psubsw %%mm3, %%mm4 \n" // ds07s34
-
- " movq %%mm2, %%mm5 \n"
- " paddsw %%mm1, %%mm2 \n" // ss16s25
- " psubsw %%mm1, %%mm5 \n" // ds16s25
-
- " movq %%mm0, %%mm1 \n"
- " paddsw %%mm2, %%mm1 \n"
- " pmulhw 32(%5), %%mm1 \n" // .7071
- " psubsw %%mm2, %%mm0 \n"
- " pmulhw 32(%5), %%mm0 \n" // .7071
-
- " movq %%mm4, %%mm2 \n"
- " pmulhw 16(%5), %%mm2 \n"
- " movq %%mm5, %%mm6 \n"
- " pmulhw 48(%5), %%mm6 \n"
- " paddsw %%mm6, %%mm2 \n" //
-
- " pmulhw 48(%5), %%mm4 \n"
- " pmulhw 16(%5), %%mm5 \n"
- " psubsw %%mm5, %%mm4 \n" //
-
- " movl %0, %%eax \n" // dest
- " add $8, %%eax \n"
- " movl %2, %%ebx \n" // dstr
- " add %%ebx, %%ebx \n"
- " leal (%%ebx,%%ebx,2),%%edx \n" // dstr * 3
- " movq %%mm1, 0(%%eax) \n"
- " movq %%mm2, 0(%%eax,%%ebx) \n"
- " movq %%mm0, 0(%%eax,%%ebx,2) \n"
- " movq %%mm4, 0(%%eax,%%edx) \n"
-
- " add %2, %%eax \n"
- " movq 0(%4), %%mm0 \n"
- " pmulhw 8(%5), %%mm0 \n"
- " movq 8(%4), %%mm1 \n"
- " pmulhw 24(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm0 \n"
- " movq 16(%4), %%mm1 \n"
- " pmulhw 40(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm0 \n"
- " movq 24(%4), %%mm1 \n"
- " pmulhw 56(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm0 \n"
- " movq %%mm0, (%%eax) \n"
-
- " movq 0(%4), %%mm0 \n"
- " pmulhw 24(%5), %%mm0 \n"
- " movq 8(%4), %%mm1 \n"
- " pmulhw 56(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm0 \n"
- " movq 16(%4), %%mm1 \n"
- " pmulhw 8(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm0 \n"
- " movq 24(%4), %%mm1 \n"
- " pmulhw 40(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm0 \n"
- " movq %%mm0, (%%eax,%%ebx) \n"
-
- " movq 0(%4), %%mm0 \n"
- " pmulhw 40(%5), %%mm0 \n"
- " movq 8(%4), %%mm1 \n"
- " pmulhw 8(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm0 \n"
- " movq 16(%4), %%mm1 \n"
- " pmulhw 56(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm0 \n"
- " movq 24(%4), %%mm1 \n"
- " pmulhw 24(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm0 \n"
- " movq %%mm0, (%%eax,%%ebx,2) \n"
-
- " movq 0(%4), %%mm0 \n"
- " pmulhw 56(%5), %%mm0 \n"
- " movq 8(%4), %%mm1 \n"
- " pmulhw 40(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm0 \n"
- " movq 16(%4), %%mm1 \n"
- " pmulhw 24(%5), %%mm1 \n"
- " paddsw %%mm1, %%mm0 \n"
- " movq 24(%4), %%mm1 \n"
- " pmulhw 8(%5), %%mm1 \n"
- " psubsw %%mm1, %%mm0 \n"
- " movq %%mm0, (%%eax,%%edx) \n"
-
- " movl %0, %%ecx \n" // dest
-
-#define LOOP \
- " movq (%%ecx), %%mm0 \n" \
- " pshufw $0x1b, 8(%%ecx), %%mm1 \n" \
- " movq %%mm0, %%mm2 \n" \
- " paddsw %%mm1, %%mm0 \n" /* s07 s16 s25 s34 */ \
- " psubsw %%mm1, %%mm2 \n" /* d07 d16 d25 d34 */ \
- \
- " pshufw $0xbb, %%mm0, %%mm1 \n" /* s25 s34 s25 s34 */ \
- " pshufw $0x44, %%mm0, %%mm0 \n" /* s07 s16 s07 s16 */ \
- \
- " pmullw 64(%5), %%mm1 \n" \
- " paddsw %%mm1, %%mm0 \n" /* ss07s34 ss16s25 ds07s34 ds16s25 */ \
- \
- " pshufw $0x88, %%mm0, %%mm1 \n" /* ss07s34 ds07s34 ss07s34 ds07s34 */ \
- " pshufw $0xdd, %%mm0, %%mm0 \n" /* ss16s25 ds16s25 ss16s25 ds16s25 */ \
- \
- " pmulhw 80(%5), %%mm1 \n" \
- \
- " pmullw 64(%5), %%mm0 \n" \
- " pmulhw 88(%5), %%mm0 \n" \
- \
- " paddsw %%mm1, %%mm0 \n" \
- \
- " pshufw $0x00, %%mm2, %%mm3 \n" \
- " pmulhw 96(%5), %%mm3 \n" \
- " pshufw $0x55, %%mm2, %%mm1 \n" \
- " pmulhw 104(%5), %%mm1 \n" \
- " pmullw 112(%5), %%mm1 \n" \
- " paddsw %%mm1, %%mm3 \n" \
- " pshufw $0xaa, %%mm2, %%mm1 \n" \
- " pmulhw 120(%5), %%mm1 \n" \
- " pmullw 128(%5), %%mm1 \n" \
- " paddsw %%mm1, %%mm3 \n" \
- " pshufw $0xff, %%mm2, %%mm1 \n" \
- " pmulhw 136(%5), %%mm1 \n" \
- " pmullw 144(%5), %%mm1 \n" \
- " paddsw %%mm1, %%mm3 \n" \
- \
- " movq %%mm0, %%mm1 \n" \
- " punpckhwd %%mm3, %%mm1 \n" \
- " punpcklwd %%mm3, %%mm0 \n" \
- \
- " movq %%mm0, (%%ecx) \n" \
- " movq %%mm1, 8(%%ecx) \n" \
- \
- " add %3, %%eax \n" \
- " add %2, %%ecx \n"
-
- LOOP
- LOOP
- LOOP
- LOOP
- LOOP
- LOOP
- LOOP
- LOOP
-
- " emms \n"
- " movl %6, %%ebx \n"
- :
- : "m" (dest), "m" (src), "m" (dstr), "m" (sstr), "r" (tmp), "r" (dct_mmx_constants), "m" (save_ebx)
- : "eax", "ecx", "edx");
-
-}
-OIL_DEFINE_IMPL_FULL (fdct8x8s_s16_mmx, fdct8x8s_s16, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
-
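
The note in the deleted fdct8x8s_s16_mmx ("a 3dnow version can use pmulhrw instead of pmulhw for increased accuracy") comes down to rounding: pmulhw keeps only the high 16 bits of the signed 32-bit product, so every multiply truncates, while 3DNow!'s pmulhrw adds 0x8000 before taking the high word, rounding to nearest. A scalar sketch of the two behaviours, based only on the instruction semantics (the function names are illustrative):

#include <stdint.h>

/* pmulhw: high 16 bits of the signed product (truncating).               */
static int16_t mulhw_model (int16_t a, int16_t b)
{
  return (int16_t)(((int32_t) a * b) >> 16);
}

/* pmulhrw (3DNow!): add 0x8000 before taking the high word (rounding).   */
static int16_t mulhrw_model (int16_t a, int16_t b)
{
  return (int16_t)((((int32_t) a * b) + 0x8000) >> 16);
}

Each truncating multiply can bias the result by up to one least-significant bit, and the row loop above sums four such products per output coefficient, which is presumably the accuracy gain the comment refers to.
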